mirror of
https://github.com/aykhans/movier.git
synced 2025-04-17 04:13:12 +00:00
59 lines
1.1 KiB
Go
59 lines
1.1 KiB
Go
package dto
|
|
|
|
import (
|
|
"fmt"
|
|
"math"
|
|
)
|
|
|
|
type CountVectorizer struct {
|
|
WordIndex map[string]int
|
|
}
|
|
|
|
func NewCountVectorizer() *CountVectorizer {
|
|
return &CountVectorizer{}
|
|
}
|
|
|
|
func (cv *CountVectorizer) SetWordIndexes(docs [][]string) {
|
|
cv.WordIndex = make(map[string]int)
|
|
index := 0
|
|
for _, doc := range docs {
|
|
for _, word := range doc {
|
|
if word == "" {
|
|
continue
|
|
}
|
|
if _, exists := cv.WordIndex[word]; !exists {
|
|
cv.WordIndex[word] = index
|
|
index++
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (cv *CountVectorizer) Vectorize(doc []string) []uint8 {
|
|
vector := make([]uint8, len(cv.WordIndex))
|
|
for _, word := range doc {
|
|
vector[cv.WordIndex[word]]++
|
|
}
|
|
return vector
|
|
}
|
|
|
|
func CosineSimilarity(a, b []uint8) (float32, error) {
|
|
if len(a) != len(b) {
|
|
return 0, fmt.Errorf("slices must have the same length")
|
|
}
|
|
var dotProduct, normA, normB float64
|
|
for i := 0; i < len(a); i++ {
|
|
x := float64(a[i])
|
|
y := float64(b[i])
|
|
dotProduct += x * y
|
|
normA += x * x
|
|
normB += y * y
|
|
}
|
|
|
|
if normA == 0 || normB == 0 {
|
|
return 0, nil
|
|
}
|
|
|
|
return float32(dotProduct / (math.Sqrt(normA) * math.Sqrt(normB))), nil
|
|
}
|