movier/server/pkg/dto/vector.go

59 lines
1.1 KiB
Go

package dto
import (
"fmt"
"math"
)
type CountVectorizer struct {
WordIndex map[string]int
}
func NewCountVectorizer() *CountVectorizer {
return &CountVectorizer{}
}
func (cv *CountVectorizer) SetWordIndexes(docs [][]string) {
cv.WordIndex = make(map[string]int)
index := 0
for _, doc := range docs {
for _, word := range doc {
if word == "" {
continue
}
if _, exists := cv.WordIndex[word]; !exists {
cv.WordIndex[word] = index
index++
}
}
}
}
func (cv *CountVectorizer) Vectorize(doc []string) []uint8 {
vector := make([]uint8, len(cv.WordIndex))
for _, word := range doc {
vector[cv.WordIndex[word]]++
}
return vector
}
func CosineSimilarity(a, b []uint8) (float32, error) {
if len(a) != len(b) {
return 0, fmt.Errorf("slices must have the same length")
}
var dotProduct, normA, normB float64
for i := 0; i < len(a); i++ {
x := float64(a[i])
y := float64(b[i])
dotProduct += x * y
normA += x * x
normB += y * y
}
if normA == 0 || normB == 0 {
return 0, nil
}
return float32(dotProduct / (math.Sqrt(normA) * math.Sqrt(normB))), nil
}