mirror of
				https://github.com/aykhans/movier.git
				synced 2025-10-26 04:29:20 +00:00 
			
		
		
		
	Rewritten in go and python
This commit is contained in:
		
							
								
								
									
										31
									
								
								server/pkg/dto/download.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										31
									
								
								server/pkg/dto/download.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,31 @@ | ||||
| package dto | ||||
|  | ||||
import (
	"fmt"
	"io"
	"net/http"
	"os"
)
|  | ||||
| func DownloadAndExtractGz(url, downloadFilepath, extractFilepath string) error { | ||||
| 	if err := Download(url, downloadFilepath); err != nil { | ||||
| 		return err | ||||
| 	} | ||||
| 	return ExtractGzFile(downloadFilepath, extractFilepath) | ||||
| } | ||||
|  | ||||
// Download fetches url with an HTTP GET request and writes the response body
// to the file at filepath, creating the file or truncating an existing one.
//
// A response whose status is outside the 2xx range is reported as an error
// before the destination file is created, so an HTML error page is never
// stored in place of the requested data. An error from closing the destination
// file is returned as well, because a failed close can mean the last bytes
// never reached the disk.
func Download(url, filepath string) (err error) {
	resp, err := http.Get(url)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return fmt.Errorf("downloading %s: unexpected status %s", url, resp.Status)
	}

	out, err := os.Create(filepath)
	if err != nil {
		return err
	}
	// Surface the close error through the named result, but never let it
	// mask an earlier copy error.
	defer func() {
		if closeErr := out.Close(); closeErr != nil && err == nil {
			err = closeErr
		}
	}()

	_, err = io.Copy(out, resp.Body)
	return err
}
							
								
								
									
										30
									
								
								server/pkg/dto/extract.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										30
									
								
								server/pkg/dto/extract.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,30 @@ | ||||
| package dto | ||||
|  | ||||
| import ( | ||||
| 	"compress/gzip" | ||||
| 	"io" | ||||
| 	"os" | ||||
| ) | ||||
|  | ||||
// ExtractGzFile decompresses the gzip file gzFile and writes the decompressed
// bytes to extractedFilepath, creating the file or truncating an existing one.
//
// The destination is only created after the gzip header has been read
// successfully. An error from closing the destination file is returned too,
// because a failed close can mean the decompressed data was not fully written.
func ExtractGzFile(gzFile, extractedFilepath string) (err error) {
	file, err := os.Open(gzFile)
	if err != nil {
		return err
	}
	defer file.Close()

	gzReader, err := gzip.NewReader(file)
	if err != nil {
		return err
	}
	defer gzReader.Close()

	outFile, err := os.Create(extractedFilepath)
	if err != nil {
		return err
	}
	// Report the close error through the named result unless the copy has
	// already failed with a more specific error.
	defer func() {
		if closeErr := outFile.Close(); closeErr != nil && err == nil {
			err = closeErr
		}
	}()

	_, err = io.Copy(outFile, gzReader)
	return err
}
							
								
								
									
										259
									
								
								server/pkg/dto/filter.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										259
									
								
								server/pkg/dto/filter.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,259 @@ | ||||
| package dto | ||||
|  | ||||
| import ( | ||||
| 	"bufio" | ||||
| 	"fmt" | ||||
| 	"math" | ||||
| 	"os" | ||||
| 	"slices" | ||||
| 	"strconv" | ||||
| 	"strings" | ||||
|  | ||||
| 	"github.com/aykhans/movier/server/pkg/config" | ||||
| ) | ||||
|  | ||||
| func FilterBasics(filePath string) ([]Basic, error) { | ||||
| 	file, err := os.Open(filePath) | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("could not open file: %v", err) | ||||
| 	} | ||||
| 	defer file.Close() | ||||
|  | ||||
| 	scanner := bufio.NewScanner(file) | ||||
|  | ||||
| 	columnCount := 9 | ||||
| 	var headers []string | ||||
| 	if scanner.Scan() { | ||||
| 		headers = strings.Split(scanner.Text(), "\t") | ||||
| 		if len(headers) != columnCount { | ||||
| 			return nil, fmt.Errorf("expected %d column headers, found %d", columnCount, len(headers)) | ||||
| 		} | ||||
| 	} else { | ||||
| 		return nil, fmt.Errorf("could not read column headers: %v", scanner.Err()) | ||||
| 	} | ||||
|  | ||||
| 	var ( | ||||
| 		tconstIndex    int = -1 | ||||
| 		titleTypeIndex int = -1 | ||||
| 		startYearIndex int = -1 | ||||
| 		genresIndex    int = -1 | ||||
| 	) | ||||
| 	for i, header := range headers { | ||||
| 		switch header { | ||||
| 		case "tconst": | ||||
| 			tconstIndex = i | ||||
| 		case "titleType": | ||||
| 			titleTypeIndex = i | ||||
| 		case "startYear": | ||||
| 			startYearIndex = i | ||||
| 		case "genres": | ||||
| 			genresIndex = i | ||||
| 		} | ||||
| 	} | ||||
| 	switch { | ||||
| 	case tconstIndex == -1: | ||||
| 		return nil, fmt.Errorf("column %s not found", "`tconst`") | ||||
| 	case titleTypeIndex == -1: | ||||
| 		return nil, fmt.Errorf("column %s not found", "`titleType`") | ||||
| 	case startYearIndex == -1: | ||||
| 		return nil, fmt.Errorf("column %s not found", "`startYear`") | ||||
| 	case genresIndex == -1: | ||||
| 		return nil, fmt.Errorf("column %s not found", "`genres`") | ||||
| 	} | ||||
|  | ||||
| 	var basics []Basic | ||||
| 	for scanner.Scan() { | ||||
| 		line := scanner.Text() | ||||
| 		columns := strings.Split(line, "\t") | ||||
| 		if len(columns) != columnCount { | ||||
| 			fmt.Println("Columns are:", columns) | ||||
| 			return nil, fmt.Errorf("expected %d columns, found %d", columnCount, len(columns)) | ||||
| 		} | ||||
|  | ||||
| 		if slices.Contains(config.TitleTypes, columns[titleTypeIndex]) { | ||||
| 			var startYearUint16 uint16 | ||||
| 			startYear, err := strconv.Atoi(columns[startYearIndex]) | ||||
| 			if err != nil { | ||||
| 				startYearUint16 = 0 | ||||
| 			} else { | ||||
| 				startYearUint16 = uint16(startYear) | ||||
| 			} | ||||
|  | ||||
| 			var genres string | ||||
| 			if columns[genresIndex] == "\\N" { | ||||
| 				genres = "" | ||||
| 			} else { | ||||
| 				genres = strings.ReplaceAll(strings.ToLower(columns[genresIndex]), " ", "") | ||||
| 			} | ||||
|  | ||||
| 			basics = append(basics, Basic{ | ||||
| 				Tconst:    columns[tconstIndex], | ||||
| 				StartYear: startYearUint16, | ||||
| 				Genres:    genres, | ||||
| 			}) | ||||
| 		} | ||||
| 	} | ||||
|  | ||||
| 	if err := scanner.Err(); err != nil { | ||||
| 		return nil, err | ||||
| 	} | ||||
| 	return basics, nil | ||||
| } | ||||
|  | ||||
| func FilterPrincipals(filePath string, tconsts []string) ([]Principal, error) { | ||||
| 	file, err := os.Open(filePath) | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("could not open file: %v", err) | ||||
| 	} | ||||
| 	defer file.Close() | ||||
|  | ||||
| 	scanner := bufio.NewScanner(file) | ||||
|  | ||||
| 	columnCount := 6 | ||||
| 	var headers []string | ||||
| 	if scanner.Scan() { | ||||
| 		headers = strings.Split(scanner.Text(), "\t") | ||||
| 		if len(headers) != columnCount { | ||||
| 			return nil, fmt.Errorf("expected %d column headers, found %d", columnCount, len(headers)) | ||||
| 		} | ||||
| 	} else { | ||||
| 		return nil, fmt.Errorf("could not read column headers: %v", scanner.Err()) | ||||
| 	} | ||||
|  | ||||
| 	var ( | ||||
| 		tconstIndex   int = -1 | ||||
| 		nconstIndex   int = -1 | ||||
| 		categoryIndex int = -1 | ||||
| 	) | ||||
| 	for i, header := range headers { | ||||
| 		switch header { | ||||
| 		case "tconst": | ||||
| 			tconstIndex = i | ||||
| 		case "nconst": | ||||
| 			nconstIndex = i | ||||
| 		case "category": | ||||
| 			categoryIndex = i | ||||
| 		} | ||||
| 	} | ||||
| 	switch { | ||||
| 	case tconstIndex == -1: | ||||
| 		return nil, fmt.Errorf("column %s not found", "`tconst`") | ||||
| 	case nconstIndex == -1: | ||||
| 		return nil, fmt.Errorf("column %s not found", "`nconst`") | ||||
| 	case categoryIndex == -1: | ||||
| 		return nil, fmt.Errorf("column %s not found", "`category`") | ||||
| 	} | ||||
|  | ||||
| 	tconstMap := make(map[string][]string) | ||||
| 	for _, tconst := range tconsts { | ||||
| 		tconstMap[tconst] = []string{} | ||||
| 	} | ||||
| 	for scanner.Scan() { | ||||
| 		line := scanner.Text() | ||||
| 		columns := strings.Split(line, "\t") | ||||
| 		if len(columns) != columnCount { | ||||
| 			fmt.Println("Columns are:", columns) | ||||
| 			return nil, fmt.Errorf("expected %d columns, found %d", columnCount, len(columns)) | ||||
| 		} | ||||
|  | ||||
| 		if slices.Contains(config.NconstCategories, columns[categoryIndex]) { | ||||
| 			if _, ok := tconstMap[columns[tconstIndex]]; ok { | ||||
| 				tconstMap[columns[tconstIndex]] = append(tconstMap[columns[tconstIndex]], columns[nconstIndex]) | ||||
| 			} | ||||
| 		} | ||||
| 	} | ||||
| 	if err := scanner.Err(); err != nil { | ||||
| 		return nil, err | ||||
| 	} | ||||
|  | ||||
| 	var principals []Principal | ||||
| 	for tconst, nconsts := range tconstMap { | ||||
| 		principals = append(principals, Principal{ | ||||
| 			Tconst:  tconst, | ||||
| 			Nconsts: strings.Join(nconsts, ","), | ||||
| 		}) | ||||
| 	} | ||||
| 	return principals, nil | ||||
| } | ||||
|  | ||||
| func FilterRatings(filePath string, tconsts []string) ([]Ratings, error) { | ||||
| 	file, err := os.Open(filePath) | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("could not open file: %v", err) | ||||
| 	} | ||||
| 	defer file.Close() | ||||
|  | ||||
| 	scanner := bufio.NewScanner(file) | ||||
|  | ||||
| 	columnCount := 3 | ||||
| 	var headers []string | ||||
| 	if scanner.Scan() { | ||||
| 		headers = strings.Split(scanner.Text(), "\t") | ||||
| 		if len(headers) != columnCount { | ||||
| 			return nil, fmt.Errorf("expected %d column headers, found %d", columnCount, len(headers)) | ||||
| 		} | ||||
| 	} else { | ||||
| 		return nil, fmt.Errorf("could not read column headers: %v", scanner.Err()) | ||||
| 	} | ||||
|  | ||||
| 	var ( | ||||
| 		tconstIndex        int = -1 | ||||
| 		averageRatingIndex int = -1 | ||||
| 		numVotesIndex      int = -1 | ||||
| 	) | ||||
| 	for i, header := range headers { | ||||
| 		switch header { | ||||
| 		case "tconst": | ||||
| 			tconstIndex = i | ||||
| 		case "averageRating": | ||||
| 			averageRatingIndex = i | ||||
| 		case "numVotes": | ||||
| 			numVotesIndex = i | ||||
| 		} | ||||
| 	} | ||||
| 	switch { | ||||
| 	case tconstIndex == -1: | ||||
| 		return nil, fmt.Errorf("column %s not found", "`tconst`") | ||||
| 	case averageRatingIndex == -1: | ||||
| 		return nil, fmt.Errorf("column %s not found", "`averageRating`") | ||||
| 	case numVotesIndex == -1: | ||||
| 		return nil, fmt.Errorf("column %s not found", "`numVotes`") | ||||
| 	} | ||||
|  | ||||
| 	tconstMap := make(map[string][]string) | ||||
| 	for _, tconst := range tconsts { | ||||
| 		tconstMap[tconst] = []string{} | ||||
| 	} | ||||
| 	var ratings []Ratings | ||||
| 	for scanner.Scan() { | ||||
| 		line := scanner.Text() | ||||
| 		columns := strings.Split(line, "\t") | ||||
| 		if len(columns) != columnCount { | ||||
| 			fmt.Println("Columns are:", columns) | ||||
| 			return nil, fmt.Errorf("expected %d columns, found %d", columnCount, len(columns)) | ||||
| 		} | ||||
|  | ||||
| 		if _, ok := tconstMap[columns[tconstIndex]]; ok { | ||||
| 			rating, err := strconv.ParseFloat(columns[averageRatingIndex], 32) | ||||
| 			if err != nil { | ||||
| 				rating = 0 | ||||
| 			} | ||||
|  | ||||
| 			votes, err := strconv.Atoi(columns[numVotesIndex]) | ||||
| 			if err != nil { | ||||
| 				votes = 0 | ||||
| 			} | ||||
|  | ||||
| 			ratings = append(ratings, Ratings{ | ||||
| 				Tconst: columns[tconstIndex], | ||||
| 				Rating: math.Round(rating*10) / 10, | ||||
| 				Votes:  votes, | ||||
| 			}) | ||||
| 		} | ||||
| 	} | ||||
| 	if err := scanner.Err(); err != nil { | ||||
| 		return nil, err | ||||
| 	} | ||||
|  | ||||
| 	return ratings, nil | ||||
| } | ||||
							
								
								
									
										27
									
								
								server/pkg/dto/models.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										27
									
								
								server/pkg/dto/models.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,27 @@ | ||||
| package dto | ||||
|  | ||||
// Basic is the per-title record produced by FilterBasics.
type Basic struct {
	// Tconst is the value of the row's tconst column.
	Tconst string `json:"tconst"`
	// StartYear is the parsed startYear column, or 0 when it is not usable.
	StartYear uint16 `json:"startYear"`
	// Genres is the lower-cased genres column with spaces removed; it is
	// empty when the source value is the `\N` placeholder.
	Genres string `json:"genres"`
}
|  | ||||
// Principal is the per-title record produced by FilterPrincipals.
type Principal struct {
	// Tconst is the title identifier the record belongs to.
	Tconst string `json:"tconst"`
	// Nconsts is a comma-separated list of nconst values collected for the
	// title; it is empty when none were found.
	Nconsts string `json:"nconsts"`
}
|  | ||||
// Ratings is the per-title record produced by FilterRatings.
type Ratings struct {
	// Tconst is the value of the row's tconst column.
	Tconst string `json:"tconst"`
	// Rating is the averageRating column rounded to one decimal place.
	Rating float64 `json:"rating"`
	// Votes is the parsed numVotes column.
	Votes int `json:"votes"`
}
|  | ||||
// MinMax groups lower and upper bounds for votes, year and rating.
// NOTE(review): nothing in this package populates it; presumably the bounds
// are computed over the imported titles — confirm against the caller.
type MinMax struct {
	MinVotes  uint    `json:"minVotes"`
	MaxVotes  uint    `json:"maxVotes"`
	MinYear   uint    `json:"minYear"`
	MaxYear   uint    `json:"maxYear"`
	MinRating float64 `json:"minRating"`
	MaxRating float64 `json:"maxRating"`
}
							
								
								
									
										58
									
								
								server/pkg/dto/vector.go
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										58
									
								
								server/pkg/dto/vector.go
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,58 @@ | ||||
| package dto | ||||
|  | ||||
| import ( | ||||
| 	"fmt" | ||||
| 	"math" | ||||
| ) | ||||
|  | ||||
// CountVectorizer turns a document (a slice of words) into a vector of word
// counts over a fixed vocabulary.
type CountVectorizer struct {
	// WordIndex maps each vocabulary word to its position in the vectors
	// returned by Vectorize. It is nil until SetWordIndexes is called.
	WordIndex map[string]int
}

// NewCountVectorizer returns a CountVectorizer with no vocabulary; call
// SetWordIndexes before Vectorize to get non-empty vectors.
func NewCountVectorizer() *CountVectorizer {
	return &CountVectorizer{}
}

// SetWordIndexes replaces the vocabulary with the distinct non-empty words of
// docs. Indexes are assigned in order of first appearance, starting at 0.
func (cv *CountVectorizer) SetWordIndexes(docs [][]string) {
	cv.WordIndex = make(map[string]int)
	index := 0
	for _, doc := range docs {
		for _, word := range doc {
			if word == "" {
				continue
			}
			if _, exists := cv.WordIndex[word]; !exists {
				cv.WordIndex[word] = index
				index++
			}
		}
	}
}

// Vectorize counts how often each vocabulary word occurs in doc and returns
// the counts as a vector of length len(cv.WordIndex).
//
// Words that are not part of the vocabulary (including the empty string) are
// ignored instead of being counted in slot 0, which also makes the call safe
// before any vocabulary has been set. Counts saturate at 255 rather than
// wrapping around.
func (cv *CountVectorizer) Vectorize(doc []string) []uint8 {
	vector := make([]uint8, len(cv.WordIndex))
	for _, word := range doc {
		index, known := cv.WordIndex[word]
		// WordIndex is exported, so also guard against indexes that do not
		// fit the vector.
		if !known || index < 0 || index >= len(vector) {
			continue
		}
		if vector[index] < math.MaxUint8 {
			vector[index]++
		}
	}
	return vector
}
|  | ||||
// CosineSimilarity returns the cosine of the angle between the count vectors
// a and b.
//
// It returns an error when the vectors differ in length, and 0 with a nil
// error when either vector consists only of zeros (or both are empty).
func CosineSimilarity(a, b []uint8) (float32, error) {
	if len(a) != len(b) {
		return 0, fmt.Errorf("slices must have the same length")
	}

	var dot, sumSquaresA, sumSquaresB float64
	for i, value := range a {
		x, y := float64(value), float64(b[i])
		dot += x * y
		sumSquaresA += x * x
		sumSquaresB += y * y
	}

	if sumSquaresA != 0 && sumSquaresB != 0 {
		denominator := math.Sqrt(sumSquaresA) * math.Sqrt(sumSquaresB)
		return float32(dot / denominator), nil
	}
	// A zero vector has no direction; define the similarity as 0.
	return 0, nil
}
		Reference in New Issue
	
	Block a user