Rewritten in Go and Python

This commit is contained in:
2024-11-06 01:25:27 +04:00
parent 9f22d9678d
commit d8449237bb
50 changed files with 3824 additions and 879 deletions

View File

@@ -0,0 +1,31 @@
package dto
import (
	"fmt"
	"io"
	"net/http"
	"os"
)
// DownloadAndExtractGz downloads url to downloadFilepath and then decompresses
// that gzip file into extractFilepath.
//
// Extraction is attempted only when the download succeeded; the first error
// encountered is returned unchanged.
func DownloadAndExtractGz(url, downloadFilepath, extractFilepath string) error {
	err := Download(url, downloadFilepath)
	if err == nil {
		err = ExtractGzFile(downloadFilepath, extractFilepath)
	}
	return err
}
// Download fetches url with an HTTP GET and writes the response body to
// filepath, creating or truncating the file.
//
// It returns an error when the request fails, when the server answers with a
// non-2xx status (in which case filepath is not created), or when writing or
// closing the destination file fails.
func Download(url, filepath string) (err error) {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	// http.Get reports no error for 4xx/5xx responses; without this check an
	// error page would be saved as if it were the requested file.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		return fmt.Errorf("downloading %q: unexpected status %s", url, resp.Status)
	}

	out, err := os.Create(filepath)
	if err != nil {
		return err
	}
	// A failed Close on a written file can mean lost data, so surface it
	// unless an earlier error is already being returned.
	defer func() {
		if closeErr := out.Close(); err == nil {
			err = closeErr
		}
	}()

	_, err = io.Copy(out, resp.Body)
	return err
}

30
server/pkg/dto/extract.go Normal file
View File

@@ -0,0 +1,30 @@
package dto
import (
"compress/gzip"
"io"
"os"
)
// ExtractGzFile decompresses the gzip file gzFile into extractedFilepath,
// creating or truncating the destination.
//
// The destination is only created once gzFile has been opened and its gzip
// header read successfully. Errors from opening, decoding, writing, or closing
// the destination file are returned.
func ExtractGzFile(gzFile, extractedFilepath string) (err error) {
	file, err := os.Open(gzFile)
	if err != nil {
		return err
	}
	defer file.Close()

	gzReader, err := gzip.NewReader(file)
	if err != nil {
		return err
	}
	defer gzReader.Close()

	outFile, err := os.Create(extractedFilepath)
	if err != nil {
		return err
	}
	// Closing a written file can fail (e.g. on a deferred flush); report that
	// instead of silently claiming success.
	defer func() {
		if closeErr := outFile.Close(); err == nil {
			err = closeErr
		}
	}()

	_, err = io.Copy(outFile, gzReader)
	return err
}

259
server/pkg/dto/filter.go Normal file
View File

@@ -0,0 +1,259 @@
package dto
import (
"bufio"
"fmt"
"math"
"os"
"slices"
"strconv"
"strings"
"github.com/aykhans/movier/server/pkg/config"
)
// FilterBasics reads the tab-separated file at filePath and returns one Basic
// for every row whose `titleType` is listed in config.TitleTypes.
//
// The first line must be a header with exactly 9 columns that includes
// `tconst`, `titleType`, `startYear` and `genres`; every following line must
// also have 9 columns. For kept rows:
//   - StartYear is the parsed `startYear`, or 0 when the value is not an
//     integer or does not fit in a uint16;
//   - Genres is the `genres` value lower-cased with spaces removed, or ""
//     when the column holds `\N`.
//
// An error is returned when the file cannot be opened or read, when the header
// is missing or incomplete, or when a row has the wrong number of columns (the
// offending line is included in the error).
func FilterBasics(filePath string) ([]Basic, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return nil, fmt.Errorf("could not open file: %w", err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)

	const columnCount = 9
	if !scanner.Scan() {
		// Scan returns false with a nil Err on an empty file; report that
		// explicitly instead of formatting a nil error.
		if err := scanner.Err(); err != nil {
			return nil, fmt.Errorf("could not read column headers: %w", err)
		}
		return nil, fmt.Errorf("could not read column headers: file is empty")
	}
	headers := strings.Split(scanner.Text(), "\t")
	if len(headers) != columnCount {
		return nil, fmt.Errorf("expected %d column headers, found %d", columnCount, len(headers))
	}

	// Locate the needed columns by name so their position in the file does
	// not matter.
	var (
		tconstIndex    = -1
		titleTypeIndex = -1
		startYearIndex = -1
		genresIndex    = -1
	)
	for i, header := range headers {
		switch header {
		case "tconst":
			tconstIndex = i
		case "titleType":
			titleTypeIndex = i
		case "startYear":
			startYearIndex = i
		case "genres":
			genresIndex = i
		}
	}
	switch {
	case tconstIndex == -1:
		return nil, fmt.Errorf("column %s not found", "`tconst`")
	case titleTypeIndex == -1:
		return nil, fmt.Errorf("column %s not found", "`titleType`")
	case startYearIndex == -1:
		return nil, fmt.Errorf("column %s not found", "`startYear`")
	case genresIndex == -1:
		return nil, fmt.Errorf("column %s not found", "`genres`")
	}

	var basics []Basic
	// lineNo starts at 2 because the header occupied line 1.
	for lineNo := 2; scanner.Scan(); lineNo++ {
		line := scanner.Text()
		columns := strings.Split(line, "\t")
		if len(columns) != columnCount {
			return nil, fmt.Errorf("expected %d columns, found %d (line %d: %q)", columnCount, len(columns), lineNo, line)
		}
		if !slices.Contains(config.TitleTypes, columns[titleTypeIndex]) {
			continue
		}

		// Unparsable or out-of-range years become 0 rather than wrapping
		// around when narrowed to uint16.
		var startYear uint16
		if year, err := strconv.Atoi(columns[startYearIndex]); err == nil && year >= 0 && year <= math.MaxUint16 {
			startYear = uint16(year)
		}

		var genres string
		if raw := columns[genresIndex]; raw != "\\N" {
			genres = strings.ReplaceAll(strings.ToLower(raw), " ", "")
		}

		basics = append(basics, Basic{
			Tconst:    columns[tconstIndex],
			StartYear: startYear,
			Genres:    genres,
		})
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return basics, nil
}
// FilterPrincipals reads the tab-separated file at filePath and, for each
// title in tconsts, collects the `nconst` values of rows whose `category` is
// listed in config.NconstCategories.
//
// The first line must be a header with exactly 6 columns that includes
// `tconst`, `nconst` and `category`; every following line must also have 6
// columns. One Principal is returned per distinct entry of tconsts, in the
// order of first appearance in tconsts, with Nconsts holding the matching
// values joined by "," in file order (empty when nothing matched).
//
// An error is returned when the file cannot be opened or read, when the header
// is missing or incomplete, or when a row has the wrong number of columns (the
// offending line is included in the error).
func FilterPrincipals(filePath string, tconsts []string) ([]Principal, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return nil, fmt.Errorf("could not open file: %w", err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)

	const columnCount = 6
	if !scanner.Scan() {
		// Scan returns false with a nil Err on an empty file; report that
		// explicitly instead of formatting a nil error.
		if err := scanner.Err(); err != nil {
			return nil, fmt.Errorf("could not read column headers: %w", err)
		}
		return nil, fmt.Errorf("could not read column headers: file is empty")
	}
	headers := strings.Split(scanner.Text(), "\t")
	if len(headers) != columnCount {
		return nil, fmt.Errorf("expected %d column headers, found %d", columnCount, len(headers))
	}

	// Locate the needed columns by name so their position in the file does
	// not matter.
	var (
		tconstIndex   = -1
		nconstIndex   = -1
		categoryIndex = -1
	)
	for i, header := range headers {
		switch header {
		case "tconst":
			tconstIndex = i
		case "nconst":
			nconstIndex = i
		case "category":
			categoryIndex = i
		}
	}
	switch {
	case tconstIndex == -1:
		return nil, fmt.Errorf("column %s not found", "`tconst`")
	case nconstIndex == -1:
		return nil, fmt.Errorf("column %s not found", "`nconst`")
	case categoryIndex == -1:
		return nil, fmt.Errorf("column %s not found", "`category`")
	}

	// Seed the map with every requested title so membership can be tested
	// per row and titles without matches still yield an (empty) entry.
	tconstMap := make(map[string][]string, len(tconsts))
	for _, tconst := range tconsts {
		tconstMap[tconst] = []string{}
	}

	// lineNo starts at 2 because the header occupied line 1.
	for lineNo := 2; scanner.Scan(); lineNo++ {
		line := scanner.Text()
		columns := strings.Split(line, "\t")
		if len(columns) != columnCount {
			return nil, fmt.Errorf("expected %d columns, found %d (line %d: %q)", columnCount, len(columns), lineNo, line)
		}
		if !slices.Contains(config.NconstCategories, columns[categoryIndex]) {
			continue
		}
		tconst := columns[tconstIndex]
		if nconsts, ok := tconstMap[tconst]; ok {
			tconstMap[tconst] = append(nconsts, columns[nconstIndex])
		}
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}

	var principals []Principal
	if len(tconstMap) > 0 {
		principals = make([]Principal, 0, len(tconstMap))
	}
	// Walk the caller's slice rather than the map so the result order is
	// deterministic; deleting each emitted key skips duplicate inputs.
	for _, tconst := range tconsts {
		nconsts, ok := tconstMap[tconst]
		if !ok {
			continue
		}
		delete(tconstMap, tconst)
		principals = append(principals, Principal{
			Tconst:  tconst,
			Nconsts: strings.Join(nconsts, ","),
		})
	}
	return principals, nil
}
// FilterRatings reads the tab-separated file at filePath and returns a Ratings
// entry for every row whose `tconst` appears in tconsts, in file order.
//
// The first line must be a header with exactly 3 columns that includes
// `tconst`, `averageRating` and `numVotes`; every following line must also
// have 3 columns. Rating is the parsed `averageRating` rounded to one decimal
// place and Votes the parsed `numVotes`; either falls back to 0 when the value
// cannot be parsed.
//
// An error is returned when the file cannot be opened or read, when the header
// is missing or incomplete, or when a row has the wrong number of columns (the
// offending line is included in the error).
func FilterRatings(filePath string, tconsts []string) ([]Ratings, error) {
	file, err := os.Open(filePath)
	if err != nil {
		return nil, fmt.Errorf("could not open file: %w", err)
	}
	defer file.Close()

	scanner := bufio.NewScanner(file)

	const columnCount = 3
	if !scanner.Scan() {
		// Scan returns false with a nil Err on an empty file; report that
		// explicitly instead of formatting a nil error.
		if err := scanner.Err(); err != nil {
			return nil, fmt.Errorf("could not read column headers: %w", err)
		}
		return nil, fmt.Errorf("could not read column headers: file is empty")
	}
	headers := strings.Split(scanner.Text(), "\t")
	if len(headers) != columnCount {
		return nil, fmt.Errorf("expected %d column headers, found %d", columnCount, len(headers))
	}

	// Locate the needed columns by name so their position in the file does
	// not matter.
	var (
		tconstIndex        = -1
		averageRatingIndex = -1
		numVotesIndex      = -1
	)
	for i, header := range headers {
		switch header {
		case "tconst":
			tconstIndex = i
		case "averageRating":
			averageRatingIndex = i
		case "numVotes":
			numVotesIndex = i
		}
	}
	switch {
	case tconstIndex == -1:
		return nil, fmt.Errorf("column %s not found", "`tconst`")
	case averageRatingIndex == -1:
		return nil, fmt.Errorf("column %s not found", "`averageRating`")
	case numVotesIndex == -1:
		return nil, fmt.Errorf("column %s not found", "`numVotes`")
	}

	// Only membership is needed here, so a set avoids allocating a slice per
	// requested title.
	wanted := make(map[string]struct{}, len(tconsts))
	for _, tconst := range tconsts {
		wanted[tconst] = struct{}{}
	}

	var ratings []Ratings
	// lineNo starts at 2 because the header occupied line 1.
	for lineNo := 2; scanner.Scan(); lineNo++ {
		line := scanner.Text()
		columns := strings.Split(line, "\t")
		if len(columns) != columnCount {
			return nil, fmt.Errorf("expected %d columns, found %d (line %d: %q)", columnCount, len(columns), lineNo, line)
		}
		if _, ok := wanted[columns[tconstIndex]]; !ok {
			continue
		}

		// Unparsable values deliberately fall back to 0 instead of
		// aborting the whole import.
		rating, err := strconv.ParseFloat(columns[averageRatingIndex], 32)
		if err != nil {
			rating = 0
		}
		votes, err := strconv.Atoi(columns[numVotesIndex])
		if err != nil {
			votes = 0
		}

		ratings = append(ratings, Ratings{
			Tconst: columns[tconstIndex],
			// Parsing at 32-bit precision leaves float noise (7.8 becomes
			// 7.80000019...); rounding restores one clean decimal.
			Rating: math.Round(rating*10) / 10,
			Votes:  votes,
		})
	}
	if err := scanner.Err(); err != nil {
		return nil, err
	}
	return ratings, nil
}

27
server/pkg/dto/models.go Normal file
View File

@@ -0,0 +1,27 @@
package dto
// Basic is one row kept by FilterBasics: a title with its start year and
// normalized genre list.
type Basic struct {
// Tconst is the value of the row's `tconst` column.
Tconst string `json:"tconst"`
// StartYear is the parsed `startYear` column; FilterBasics stores 0 when
// the column is not an integer.
StartYear uint16 `json:"startYear"`
// Genres is the `genres` column lower-cased with spaces removed, or ""
// when the column holds `\N`.
Genres string `json:"genres"`
}
// Principal pairs a title with the `nconst` values FilterPrincipals collected
// for it.
type Principal struct {
// Tconst is the title's `tconst` value.
Tconst string `json:"tconst"`
// Nconsts is the collected `nconst` values joined with ","; it is empty
// when no row matched.
Nconsts string `json:"nconsts"`
}
// Ratings is one row kept by FilterRatings: a title's rating and vote count.
type Ratings struct {
// Tconst is the value of the row's `tconst` column.
Tconst string `json:"tconst"`
// Rating is the `averageRating` column rounded to one decimal place, or 0
// when the column cannot be parsed.
Rating float64 `json:"rating"`
// Votes is the `numVotes` column, or 0 when it cannot be parsed.
Votes int `json:"votes"`
}
// MinMax groups lower and upper bounds for votes, year and rating.
//
// NOTE(review): nothing in the visible code populates this type; presumably
// the bounds are the observed ranges across the filtered dataset — confirm
// against the callers.
type MinMax struct {
MinVotes uint `json:"minVotes"`
MaxVotes uint `json:"maxVotes"`
MinYear uint `json:"minYear"`
MaxYear uint `json:"maxYear"`
MinRating float64 `json:"minRating"`
MaxRating float64 `json:"maxRating"`
}

58
server/pkg/dto/vector.go Normal file
View File

@@ -0,0 +1,58 @@
package dto
import (
"fmt"
"math"
)
// CountVectorizer converts lists of words into count vectors using a
// word-to-position vocabulary.
type CountVectorizer struct {
// WordIndex maps each vocabulary word to its position in the vectors
// returned by Vectorize. It is nil until SetWordIndexes is called.
WordIndex map[string]int
}
// NewCountVectorizer returns a CountVectorizer with no vocabulary yet; its
// WordIndex stays nil until SetWordIndexes is called.
func NewCountVectorizer() *CountVectorizer {
	return new(CountVectorizer)
}
// SetWordIndexes rebuilds the vocabulary from docs, replacing any previous
// WordIndex.
//
// Words receive consecutive indexes starting at 0 in order of first
// appearance across docs; empty strings are ignored and repeated words keep
// the index they were first given.
func (cv *CountVectorizer) SetWordIndexes(docs [][]string) {
	vocabulary := make(map[string]int)
	for _, tokens := range docs {
		for _, token := range tokens {
			if token == "" {
				continue
			}
			if _, seen := vocabulary[token]; seen {
				continue
			}
			// The current size is exactly the next free index.
			vocabulary[token] = len(vocabulary)
		}
	}
	cv.WordIndex = vocabulary
}
// Vectorize returns a vector with one slot per vocabulary word, where each
// slot holds how many times that word occurs in doc.
//
// Words that are not in WordIndex (including the empty string, which
// SetWordIndexes never adds) are ignored rather than being counted against
// index 0, and an empty or unset vocabulary yields an empty vector instead of
// panicking. Counts saturate at 255 instead of wrapping around.
func (cv *CountVectorizer) Vectorize(doc []string) []uint8 {
	vector := make([]uint8, len(cv.WordIndex))
	for _, word := range doc {
		idx, known := cv.WordIndex[word]
		// WordIndex is exported, so also guard against indexes that were
		// set by hand and fall outside the vector.
		if !known || idx < 0 || idx >= len(vector) {
			continue
		}
		if vector[idx] < math.MaxUint8 {
			vector[idx]++
		}
	}
	return vector
}
// CosineSimilarity returns the cosine of the angle between the count vectors
// a and b: their dot product divided by the product of their Euclidean norms.
//
// It returns an error when the slices differ in length, and 0 with a nil
// error when either vector has zero norm (including two empty slices).
func CosineSimilarity(a, b []uint8) (float32, error) {
	if len(a) != len(b) {
		return 0, fmt.Errorf("slices must have the same length")
	}

	var dot, sumSqA, sumSqB float64
	for i, av := range a {
		x, y := float64(av), float64(b[i])
		dot += x * y
		sumSqA += x * x
		sumSqB += y * y
	}

	// A zero vector has no direction; report no similarity instead of
	// dividing by zero.
	if sumSqA == 0 || sumSqB == 0 {
		return 0, nil
	}

	magnitude := math.Sqrt(sumSqA) * math.Sqrt(sumSqB)
	return float32(dot / magnitude), nil
}