mirror of
				https://github.com/aykhans/movier.git
				synced 2025-11-04 07:59:59 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			59 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			59 lines
		
	
	
		
			1.1 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
package dto
 | 
						|
 | 
						|
import (
 | 
						|
	"fmt"
 | 
						|
	"math"
 | 
						|
)
 | 
						|
 | 
						|
type CountVectorizer struct {
 | 
						|
	WordIndex map[string]int
 | 
						|
}
 | 
						|
 | 
						|
func NewCountVectorizer() *CountVectorizer {
 | 
						|
	return &CountVectorizer{}
 | 
						|
}
 | 
						|
 | 
						|
func (cv *CountVectorizer) SetWordIndexes(docs [][]string) {
 | 
						|
	cv.WordIndex = make(map[string]int)
 | 
						|
	index := 0
 | 
						|
	for _, doc := range docs {
 | 
						|
		for _, word := range doc {
 | 
						|
			if word == "" {
 | 
						|
				continue
 | 
						|
			}
 | 
						|
			if _, exists := cv.WordIndex[word]; !exists {
 | 
						|
				cv.WordIndex[word] = index
 | 
						|
				index++
 | 
						|
			}
 | 
						|
		}
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
func (cv *CountVectorizer) Vectorize(doc []string) []uint8 {
 | 
						|
	vector := make([]uint8, len(cv.WordIndex))
 | 
						|
	for _, word := range doc {
 | 
						|
		vector[cv.WordIndex[word]]++
 | 
						|
	}
 | 
						|
	return vector
 | 
						|
}
 | 
						|
 | 
						|
func CosineSimilarity(a, b []uint8) (float32, error) {
 | 
						|
	if len(a) != len(b) {
 | 
						|
		return 0, fmt.Errorf("slices must have the same length")
 | 
						|
	}
 | 
						|
	var dotProduct, normA, normB float64
 | 
						|
	for i := 0; i < len(a); i++ {
 | 
						|
		x := float64(a[i])
 | 
						|
		y := float64(b[i])
 | 
						|
		dotProduct += x * y
 | 
						|
		normA += x * x
 | 
						|
		normB += y * y
 | 
						|
	}
 | 
						|
 | 
						|
	if normA == 0 || normB == 0 {
 | 
						|
		return 0, nil
 | 
						|
	}
 | 
						|
 | 
						|
	return float32(dotProduct / (math.Sqrt(normA) * math.Sqrt(normB))), nil
 | 
						|
}
 |