mirror of
https://github.com/aykhans/movier.git
synced 2025-07-19 16:54:00 +00:00
Rewritten in Go and Python
This commit is contained in:
31
server/pkg/dto/download.go
Normal file
31
server/pkg/dto/download.go
Normal file
@@ -0,0 +1,31 @@
|
||||
package dto
|
||||
|
||||
import (
	"fmt"
	"io"
	"net/http"
	"os"
)
|
||||
|
||||
func DownloadAndExtractGz(url, downloadFilepath, extractFilepath string) error {
|
||||
if err := Download(url, downloadFilepath); err != nil {
|
||||
return err
|
||||
}
|
||||
return ExtractGzFile(downloadFilepath, extractFilepath)
|
||||
}
|
||||
|
||||
// Download issues an HTTP GET for url and streams the response body into the
// file named by filepath, creating it or truncating an existing one.
//
// A response whose status is outside the 2xx range is reported as an error
// before the destination file is created, so an error page is never stored
// as if it were the requested payload and an existing file is left untouched.
// An error from closing the destination file is returned when the copy itself
// succeeded, because a failed close can mean the data never reached disk.
func Download(url, filepath string) (err error) {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// http.Get only fails on transport problems; HTTP-level failures such as
	// 404 or 500 arrive as a normal response and must be rejected explicitly.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return fmt.Errorf("downloading %q: unexpected status %s", url, resp.Status)
	}

	out, err := os.Create(filepath)
	if err != nil {
		return err
	}
	defer func() {
		// Keep the copy error if there is one; otherwise surface the close error.
		if cerr := out.Close(); err == nil {
			err = cerr
		}
	}()

	_, err = io.Copy(out, resp.Body)
	return err
}
|
30
server/pkg/dto/extract.go
Normal file
30
server/pkg/dto/extract.go
Normal file
@@ -0,0 +1,30 @@
|
||||
package dto
|
||||
|
||||
import (
|
||||
"compress/gzip"
|
||||
"io"
|
||||
"os"
|
||||
)
|
||||
|
||||
// ExtractGzFile decompresses the gzip file gzFile into extractedFilepath,
// creating the destination or truncating an existing one.
//
// It returns an error when the source cannot be opened, is not valid gzip
// data, the destination cannot be created, or the copy fails. An error from
// closing the destination is also returned when the copy itself succeeded,
// since a failed close can mean the extracted data was not fully written.
func ExtractGzFile(gzFile, extractedFilepath string) (err error) {
	file, err := os.Open(gzFile)
	if err != nil {
		return err
	}
	defer file.Close()

	gzReader, err := gzip.NewReader(file)
	if err != nil {
		return err
	}
	defer gzReader.Close()

	outFile, err := os.Create(extractedFilepath)
	if err != nil {
		return err
	}
	defer func() {
		// Keep the copy error if there is one; otherwise surface the close error.
		if cerr := outFile.Close(); err == nil {
			err = cerr
		}
	}()

	_, err = io.Copy(outFile, gzReader)
	return err
}
|
259
server/pkg/dto/filter.go
Normal file
259
server/pkg/dto/filter.go
Normal file
@@ -0,0 +1,259 @@
|
||||
package dto
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"math"
|
||||
"os"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/aykhans/movier/server/pkg/config"
|
||||
)
|
||||
|
||||
func FilterBasics(filePath string) ([]Basic, error) {
|
||||
file, err := os.Open(filePath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("could not open file: %v", err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
|
||||
columnCount := 9
|
||||
var headers []string
|
||||
if scanner.Scan() {
|
||||
headers = strings.Split(scanner.Text(), "\t")
|
||||
if len(headers) != columnCount {
|
||||
return nil, fmt.Errorf("expected %d column headers, found %d", columnCount, len(headers))
|
||||
}
|
||||
} else {
|
||||
return nil, fmt.Errorf("could not read column headers: %v", scanner.Err())
|
||||
}
|
||||
|
||||
var (
|
||||
tconstIndex int = -1
|
||||
titleTypeIndex int = -1
|
||||
startYearIndex int = -1
|
||||
genresIndex int = -1
|
||||
)
|
||||
for i, header := range headers {
|
||||
switch header {
|
||||
case "tconst":
|
||||
tconstIndex = i
|
||||
case "titleType":
|
||||
titleTypeIndex = i
|
||||
case "startYear":
|
||||
startYearIndex = i
|
||||
case "genres":
|
||||
genresIndex = i
|
||||
}
|
||||
}
|
||||
switch {
|
||||
case tconstIndex == -1:
|
||||
return nil, fmt.Errorf("column %s not found", "`tconst`")
|
||||
case titleTypeIndex == -1:
|
||||
return nil, fmt.Errorf("column %s not found", "`titleType`")
|
||||
case startYearIndex == -1:
|
||||
return nil, fmt.Errorf("column %s not found", "`startYear`")
|
||||
case genresIndex == -1:
|
||||
return nil, fmt.Errorf("column %s not found", "`genres`")
|
||||
}
|
||||
|
||||
var basics []Basic
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
columns := strings.Split(line, "\t")
|
||||
if len(columns) != columnCount {
|
||||
fmt.Println("Columns are:", columns)
|
||||
return nil, fmt.Errorf("expected %d columns, found %d", columnCount, len(columns))
|
||||
}
|
||||
|
||||
if slices.Contains(config.TitleTypes, columns[titleTypeIndex]) {
|
||||
var startYearUint16 uint16
|
||||
startYear, err := strconv.Atoi(columns[startYearIndex])
|
||||
if err != nil {
|
||||
startYearUint16 = 0
|
||||
} else {
|
||||
startYearUint16 = uint16(startYear)
|
||||
}
|
||||
|
||||
var genres string
|
||||
if columns[genresIndex] == "\\N" {
|
||||
genres = ""
|
||||
} else {
|
||||
genres = strings.ReplaceAll(strings.ToLower(columns[genresIndex]), " ", "")
|
||||
}
|
||||
|
||||
basics = append(basics, Basic{
|
||||
Tconst: columns[tconstIndex],
|
||||
StartYear: startYearUint16,
|
||||
Genres: genres,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
if err := scanner.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return basics, nil
|
||||
}
|
||||
|
||||
func FilterPrincipals(filePath string, tconsts []string) ([]Principal, error) {
|
||||
file, err := os.Open(filePath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("could not open file: %v", err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
|
||||
columnCount := 6
|
||||
var headers []string
|
||||
if scanner.Scan() {
|
||||
headers = strings.Split(scanner.Text(), "\t")
|
||||
if len(headers) != columnCount {
|
||||
return nil, fmt.Errorf("expected %d column headers, found %d", columnCount, len(headers))
|
||||
}
|
||||
} else {
|
||||
return nil, fmt.Errorf("could not read column headers: %v", scanner.Err())
|
||||
}
|
||||
|
||||
var (
|
||||
tconstIndex int = -1
|
||||
nconstIndex int = -1
|
||||
categoryIndex int = -1
|
||||
)
|
||||
for i, header := range headers {
|
||||
switch header {
|
||||
case "tconst":
|
||||
tconstIndex = i
|
||||
case "nconst":
|
||||
nconstIndex = i
|
||||
case "category":
|
||||
categoryIndex = i
|
||||
}
|
||||
}
|
||||
switch {
|
||||
case tconstIndex == -1:
|
||||
return nil, fmt.Errorf("column %s not found", "`tconst`")
|
||||
case nconstIndex == -1:
|
||||
return nil, fmt.Errorf("column %s not found", "`nconst`")
|
||||
case categoryIndex == -1:
|
||||
return nil, fmt.Errorf("column %s not found", "`category`")
|
||||
}
|
||||
|
||||
tconstMap := make(map[string][]string)
|
||||
for _, tconst := range tconsts {
|
||||
tconstMap[tconst] = []string{}
|
||||
}
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
columns := strings.Split(line, "\t")
|
||||
if len(columns) != columnCount {
|
||||
fmt.Println("Columns are:", columns)
|
||||
return nil, fmt.Errorf("expected %d columns, found %d", columnCount, len(columns))
|
||||
}
|
||||
|
||||
if slices.Contains(config.NconstCategories, columns[categoryIndex]) {
|
||||
if _, ok := tconstMap[columns[tconstIndex]]; ok {
|
||||
tconstMap[columns[tconstIndex]] = append(tconstMap[columns[tconstIndex]], columns[nconstIndex])
|
||||
}
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var principals []Principal
|
||||
for tconst, nconsts := range tconstMap {
|
||||
principals = append(principals, Principal{
|
||||
Tconst: tconst,
|
||||
Nconsts: strings.Join(nconsts, ","),
|
||||
})
|
||||
}
|
||||
return principals, nil
|
||||
}
|
||||
|
||||
func FilterRatings(filePath string, tconsts []string) ([]Ratings, error) {
|
||||
file, err := os.Open(filePath)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("could not open file: %v", err)
|
||||
}
|
||||
defer file.Close()
|
||||
|
||||
scanner := bufio.NewScanner(file)
|
||||
|
||||
columnCount := 3
|
||||
var headers []string
|
||||
if scanner.Scan() {
|
||||
headers = strings.Split(scanner.Text(), "\t")
|
||||
if len(headers) != columnCount {
|
||||
return nil, fmt.Errorf("expected %d column headers, found %d", columnCount, len(headers))
|
||||
}
|
||||
} else {
|
||||
return nil, fmt.Errorf("could not read column headers: %v", scanner.Err())
|
||||
}
|
||||
|
||||
var (
|
||||
tconstIndex int = -1
|
||||
averageRatingIndex int = -1
|
||||
numVotesIndex int = -1
|
||||
)
|
||||
for i, header := range headers {
|
||||
switch header {
|
||||
case "tconst":
|
||||
tconstIndex = i
|
||||
case "averageRating":
|
||||
averageRatingIndex = i
|
||||
case "numVotes":
|
||||
numVotesIndex = i
|
||||
}
|
||||
}
|
||||
switch {
|
||||
case tconstIndex == -1:
|
||||
return nil, fmt.Errorf("column %s not found", "`tconst`")
|
||||
case averageRatingIndex == -1:
|
||||
return nil, fmt.Errorf("column %s not found", "`averageRating`")
|
||||
case numVotesIndex == -1:
|
||||
return nil, fmt.Errorf("column %s not found", "`numVotes`")
|
||||
}
|
||||
|
||||
tconstMap := make(map[string][]string)
|
||||
for _, tconst := range tconsts {
|
||||
tconstMap[tconst] = []string{}
|
||||
}
|
||||
var ratings []Ratings
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
columns := strings.Split(line, "\t")
|
||||
if len(columns) != columnCount {
|
||||
fmt.Println("Columns are:", columns)
|
||||
return nil, fmt.Errorf("expected %d columns, found %d", columnCount, len(columns))
|
||||
}
|
||||
|
||||
if _, ok := tconstMap[columns[tconstIndex]]; ok {
|
||||
rating, err := strconv.ParseFloat(columns[averageRatingIndex], 32)
|
||||
if err != nil {
|
||||
rating = 0
|
||||
}
|
||||
|
||||
votes, err := strconv.Atoi(columns[numVotesIndex])
|
||||
if err != nil {
|
||||
votes = 0
|
||||
}
|
||||
|
||||
ratings = append(ratings, Ratings{
|
||||
Tconst: columns[tconstIndex],
|
||||
Rating: math.Round(rating*10) / 10,
|
||||
Votes: votes,
|
||||
})
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return ratings, nil
|
||||
}
|
27
server/pkg/dto/models.go
Normal file
27
server/pkg/dto/models.go
Normal file
@@ -0,0 +1,27 @@
|
||||
package dto
|
||||
|
||||
// Basic is the reduced per-title record produced by FilterBasics: the title
// identifier together with its start year and genres.
type Basic struct {
	// Tconst is the value of the source row's `tconst` column.
	Tconst string `json:"tconst"`
	// StartYear is the parsed `startYear` column; FilterBasics stores 0 when
	// the column is not a usable number.
	StartYear uint16 `json:"startYear"`
	// Genres is the `genres` column as stored by FilterBasics: lower-cased
	// with spaces removed, or empty when the source value was `\N`.
	Genres string `json:"genres"`
}
|
||||
|
||||
// Principal is the per-title record produced by FilterPrincipals: a title
// identifier and the people attached to it.
type Principal struct {
	// Tconst is the title identifier the record belongs to.
	Tconst string `json:"tconst"`
	// Nconsts is the comma-joined list of `nconst` values collected for the
	// title; it is empty when none were collected.
	Nconsts string `json:"nconsts"`
}
|
||||
|
||||
// Ratings is the per-title record produced by FilterRatings: a title
// identifier with its average rating and vote count.
type Ratings struct {
	// Tconst is the value of the source row's `tconst` column.
	Tconst string `json:"tconst"`
	// Rating is the `averageRating` column rounded to one decimal place by
	// FilterRatings (0 when the column could not be parsed).
	Rating float64 `json:"rating"`
	// Votes is the parsed `numVotes` column (0 when it could not be parsed).
	Votes int `json:"votes"`
}
|
||||
|
||||
// MinMax groups lower and upper bounds for votes, years and ratings.
//
// NOTE(review): nothing in this package populates it; presumably it carries
// the observed ranges of a dataset for normalisation — confirm against the
// callers.
type MinMax struct {
	MinVotes  uint    `json:"minVotes"`
	MaxVotes  uint    `json:"maxVotes"`
	MinYear   uint    `json:"minYear"`
	MaxYear   uint    `json:"maxYear"`
	MinRating float64 `json:"minRating"`
	MaxRating float64 `json:"maxRating"`
}
|
58
server/pkg/dto/vector.go
Normal file
58
server/pkg/dto/vector.go
Normal file
@@ -0,0 +1,58 @@
|
||||
package dto
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"math"
|
||||
)
|
||||
|
||||
// CountVectorizer turns documents (slices of words) into fixed-length count
// vectors over a vocabulary learned with SetWordIndexes.
type CountVectorizer struct {
	// WordIndex maps each vocabulary word to its position in the vectors
	// returned by Vectorize. It is nil until SetWordIndexes is called.
	WordIndex map[string]int
}

// NewCountVectorizer returns a CountVectorizer with no vocabulary; call
// SetWordIndexes before vectorizing.
func NewCountVectorizer() *CountVectorizer {
	return &CountVectorizer{}
}

// SetWordIndexes replaces the vocabulary with the distinct non-empty words
// found in docs, numbering them from 0 in order of first appearance.
func (cv *CountVectorizer) SetWordIndexes(docs [][]string) {
	cv.WordIndex = make(map[string]int)
	index := 0
	for _, doc := range docs {
		for _, word := range doc {
			if word == "" {
				continue
			}
			if _, exists := cv.WordIndex[word]; !exists {
				cv.WordIndex[word] = index
				index++
			}
		}
	}
}

// Vectorize returns a vector with one element per vocabulary word holding the
// number of times that word occurs in doc.
//
// Words that are not in the vocabulary (including the empty string, which
// SetWordIndexes never indexes) are ignored instead of being counted against
// position 0, and an empty vocabulary yields an empty vector rather than a
// panic. Each count saturates at 255 instead of wrapping around to 0.
func (cv *CountVectorizer) Vectorize(doc []string) []uint8 {
	vector := make([]uint8, len(cv.WordIndex))
	for _, word := range doc {
		idx, known := cv.WordIndex[word]
		// The bounds guard covers a WordIndex that was populated by hand with
		// positions outside the vector.
		if !known || idx < 0 || idx >= len(vector) {
			continue
		}
		if vector[idx] < math.MaxUint8 {
			vector[idx]++
		}
	}
	return vector
}
|
||||
|
||||
// CosineSimilarity returns the cosine of the angle between the count vectors
// a and b: their dot product divided by the product of their Euclidean norms.
//
// It returns an error when the slices differ in length. When either vector
// has zero norm (including two empty slices) the similarity is reported as 0
// with a nil error.
func CosineSimilarity(a, b []uint8) (float32, error) {
	if len(a) != len(b) {
		return 0, fmt.Errorf("slices must have the same length")
	}

	var dot, sumSqA, sumSqB float64
	for i, av := range a {
		x, y := float64(av), float64(b[i])
		dot += x * y
		sumSqA += x * x
		sumSqB += y * y
	}

	// A zero vector has no direction; avoid dividing by zero.
	if sumSqA == 0 || sumSqB == 0 {
		return 0, nil
	}

	denominator := math.Sqrt(sumSqA) * math.Sqrt(sumSqB)
	return float32(dot / denominator), nil
}
|
Reference in New Issue
Block a user