From b60673ad25023727eba0f348175cab8bb1487e5a Mon Sep 17 00:00:00 2001 From: Aykhan Date: Sun, 6 Aug 2023 22:48:47 +0400 Subject: [PATCH] First commit --- .gitignore | 4 + IMDB_DTO.py | 332 ++++++++++++++++++++++++ exceptions.py | 3 + imdb_recommendation.ipynb | 517 ++++++++++++++++++++++++++++++++++++++ main.py | 23 ++ 5 files changed, 879 insertions(+) create mode 100644 .gitignore create mode 100755 IMDB_DTO.py create mode 100644 exceptions.py create mode 100755 imdb_recommendation.ipynb create mode 100644 main.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ee25eca --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +IMDB_data_sets +.venv +.ipynb_checkpoints +__pycache__ \ No newline at end of file diff --git a/IMDB_DTO.py b/IMDB_DTO.py new file mode 100755 index 0000000..9a4abac --- /dev/null +++ b/IMDB_DTO.py @@ -0,0 +1,332 @@ +from pathlib import Path +import pandas as pd +import numpy as np +from time import time +from exceptions import ( + FileExistException, + FileNotExistException +) + + +BASE_DIR = Path(__file__).resolve().parent + +class DTO: + def __init__( + self, + save_dir=(BASE_DIR / 'IMDB_data_sets/filtered/'), + read_dir=(BASE_DIR / 'IMDB_data_sets/'), + default_chunksize: int=3_000_000 + ) -> None: + """ + Parameters + ---------- + save_dir : str, optional + Folder location to save files (default is BASE_DIR / 'IMDB_data_sets/filtered/') + get_dir : str, optional + Folder location to get files (default is BASE_DIR / 'IMDB_data_sets/') + default_chunksize : int, optional + Default value to be used when chunksize is not given in methods that take + chunksize parameters (default is 3_000_000) + """ + + self.save_dir = save_dir + self.save_dir.mkdir(parents=True, exist_ok=True) + self.read_dir = read_dir + self.default_chunksize = default_chunksize + + def timing_decorator(func): + def wrapper(*args, **kwargs): + start_time = time() + result = func(*args, **kwargs) + print(f"Function {func.__name__} took {time() - start_time} seconds to run.") + return result + return wrapper + + def is_exist(self, file_dir: Path) -> None: + """ + Parameters + ---------- + file_dir : pathlib.Path + File path + + Raises + ------ + FileExistException + If the file exists + """ + + if file_dir.is_file(): + raise FileExistException(f"file is exist: {file_dir}") + + def is_not_exist(self, file_dir: Path) -> None: + """ + Parameters + ---------- + file_dir : pathlib.Path + File path + + Raises + ------ + FileNotExistException + If the file does not exist + """ + + if not file_dir.is_file(): + raise FileNotExistException(f"file is not exist: {file_dir}") + + def df2csv( + self, + df: pd.DataFrame, + name: str, + overwrite: bool=False, + index: bool=False + ) -> None: + """ + Parameters + ---------- + df : DataFrame + DataFrame object you want to save + name : str + The name you want to save the DataFrame object + overwrite : bool, optional + When True, overwrite if file exists (default is False) + index : bool, optional + Save index column or no (deafault is False) + + Raises + ------ + FileExistException + If the overwrite parameter is false and the file exists + """ + + if not overwrite: + self.is_exist(self.save_dir / name) + df.to_csv(self.save_dir / name, index=index) + + @timing_decorator + def filter_tconst( + self, + name: str, + title_types: list[str]=['movie', 'tvMovie'], + chunksize: int=None + ) -> list[str]: + """ + Parameters + ---------- + name : str + Name of the basics file to be read + title_type : list, optional + 'titleType' type of lines to be read from file (default is ['movie', 'tvMovie']) + chunksize : int + Chunk size for reading data (default is self.default_chunksize (default is 3_000_000)). + + Returns + ------- + list + A list of tconst + + Raises + ------ + FileNotExistException + If the file does not exist + """ + + self.is_not_exist(self.read_dir / name) + if chunksize is None: + chunksize = self.default_chunksize + + tconst_list = [] + + with pd.read_csv( + self.read_dir / name, + sep=r'\t', + chunksize=chunksize, + engine='python', + usecols=['tconst', 'titleType'], + dtype={'tconst': str, 'titleType': str}, + na_values='\\N') as reader: + + for i, r in enumerate(reader): + tconst_list += list(r[r.titleType.isin(title_types)]['tconst']) + return tconst_list + + def get_tconst(self, name: str) -> list[str]: + """ + Parameters + ---------- + name : str + Name of the tconst file to be read + + Returns + ------- + list + A list of tconst + + Raises + ------ + FileNotExistException + If the file does not exist + """ + + self.is_not_exist(self.save_dir / name) + return list(pd.read_csv(self.save_dir / name, usecols=['tconst'], dtype={'tconst': str})['tconst']) + + @timing_decorator + def filter_principal( + self, + name: str, + tconst_list: list[str], + category_list: list[str]=['actress', 'actor', 'director', 'writer'], + chunksize: int=None + ) -> pd.DataFrame: + """ + Parameters + ---------- + name : str + Name of the principals file to be read + tconst_list : list + List of tconst (It can be obtained by the get_tconst or read_tconst method). + category : list + List of categories of rows to be selected (default is ['actress', 'actor', 'director', 'writer']). + chunksize : int + Chunk size for reading data (default is self.default_chunksize (default is 3_000_000)). + + Returns + ------- + DataFrame + A DataFrame object with columns tconst, nconst, and category. + + Raises + ------ + FileNotExistException + If the file does not exist + """ + + self.is_not_exist(self.read_dir / name) + if chunksize is None: + chunksize = self.default_chunksize + + df = pd.DataFrame({ + 'tconst': tconst_list, + 'nconst': np.empty((len(tconst_list), 0)).tolist(), + 'category': np.empty((len(tconst_list), 0)).tolist() + }) + + # index = pd.Index(tconst_list, name='tconst') + # df = pd.DataFrame({ + # 'nconst': pd.Series(dtype='object', index=index), + # 'category': pd.Series(dtype='object', index=index) + # }) + + cnt = 0 + + with pd.read_csv(self.read_dir / name, + sep=r'\t', + chunksize=chunksize, + engine='python', + usecols=['tconst', 'nconst', 'category']) as reader: + + for i, r in enumerate(reader): + r = r.query(f"(tconst in @tconst_list) and (category in @category_list)") + r_group = r.groupby('tconst', as_index=0).agg({'nconst': lambda x: list(x), 'category': lambda x: list(x)}) + df = pd.concat([df, r_group]).groupby('tconst', as_index=0).agg(sum) + + # r_group.index.name = 'tconst' + # df.update(r_group) + del r_group + + print(cnt) + return df + + @timing_decorator + def filter_rating( + self, + name: str, + tconst_list: list[str], + chunksize: int=None + ) -> pd.DataFrame: + """ + Parameters + ---------- + name : str + Name of the ratings file to be read + tconst_list : list + List of tconst (It can be obtained by the get_tconst or read_tconst method). + chunksize : int + Chunk size for reading data (default is self.default_chunksize (default is 3_000_000)). + + Returns + ------- + DataFrame + A DataFrame object with columns tconst, and averageRating. + + Raises + ------ + FileNotExistException + If the file does not exist + """ + + self.is_not_exist(self.read_dir / name) + if chunksize is None: + chunksize = self.default_chunksize + + df = pd.DataFrame({'tconst': tconst_list}) + + with pd.read_csv( + self.read_dir / name, + sep=r'\t', + chunksize=chunksize, + engine='python', + usecols=['tconst', 'averageRating', 'numVotes'], + dtype={'tconst': str, 'averageRating': np.float16, 'numVotes': int}, + na_values='\\N') as reader: + + for i, r in enumerate(reader): + df = pd.concat([df, r.query("tconst in @tconst_list")]).groupby('tconst', as_index=0).first() + return df + + @timing_decorator + def filter_basic( + self, + name: str, + tconst_list: list[str], + chunksize: int=None + ) -> pd.DataFrame: + """ + Parameters + ---------- + name : str + Name of the basics file to be read + tconst_list : list + List of tconst (It can be obtained by the get_tconst or read_tconst method). + chunksize : int + Chunk size for reading data (default is self.default_chunksize (default is 3_000_000)). + + Returns + ------- + DataFrame + A DataFrame object with columns tconst, startYear and genres. + + Raises + ------ + FileNotExistException + If the file does not exist + """ + + self.is_not_exist(self.read_dir / name) + if chunksize is None: + chunksize = self.default_chunksize + + df = pd.DataFrame({'tconst': tconst_list}) + + with pd.read_csv(self.read_dir / name, + sep=r'\t', + chunksize=chunksize, + engine='python', + usecols=['tconst', 'startYear', 'genres'], + dtype={'tconst': str, 'startYear': 'Int16', 'genres': str}, + na_values='\\N') as reader: + + for i, r in enumerate(reader): + df = pd.concat([df, r.query("tconst in @tconst_list")]).groupby('tconst', as_index=0).first() + return df \ No newline at end of file diff --git a/exceptions.py b/exceptions.py new file mode 100644 index 0000000..654dd5c --- /dev/null +++ b/exceptions.py @@ -0,0 +1,3 @@ +class FileExistException(Exception): ... + +class FileNotExistException(Exception): ... \ No newline at end of file diff --git a/imdb_recommendation.ipynb b/imdb_recommendation.ipynb new file mode 100755 index 0000000..b1c7872 --- /dev/null +++ b/imdb_recommendation.ipynb @@ -0,0 +1,517 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "from ast import literal_eval\n", + "from functools import reduce" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "input_film = 'tt0816692'\n", + "\n", + "trained = {'basics': {}, 'principals': {}, 'ratings': {}}\n", + "\n", + "def drop_by_tconst(df, tconst: str, inplace=True) -> pd.DataFrame:\n", + " return df.drop(df[df['tconst'] == tconst].index, inplace=inplace)\n", + "\n", + "# Basics\n", + "\n", + "df = pd.read_csv('./IMDB_data_sets/filtered/basics.csv', dtype={'tconst': str, 'startYear': 'Int16', 'genres': str})\n", + "df['genres'].fillna('', inplace=True)\n", + "\n", + "cv = CountVectorizer(dtype=np.int8, token_pattern=\"(?u)[\\w'-]+\")\n", + "count_matrix = cv.fit_transform(df['genres'])\n", + "\n", + "trained['basics']['genres'] = pd.DataFrame(\n", + " {\n", + " 'genres': cosine_similarity(count_matrix[df[df['tconst'] == input_film].index[0]], count_matrix)[0],\n", + " 'tconst': df['tconst']\n", + " }\n", + " )\n", + "\n", + "drop_by_tconst(trained['basics']['genres'], input_film)\n", + "\n", + "trained['basics']['genres'].sort_values(ascending=False, by='genres', inplace=True, ignore_index=True)\n", + "trained['basics']['genres'].drop('genres', axis=1, inplace=True)\n", + "\n", + "\n", + "year = int(df[df['tconst'] == input_film].startYear.iloc[0])\n", + "\n", + "trained['basics']['years'] = pd.DataFrame(\n", + " {\n", + " 'years': df['startYear'],\n", + " 'tconst': df['tconst']\n", + " }\n", + ")\n", + "\n", + "drop_by_tconst(trained['basics']['years'], input_film)\n", + "trained['basics']['years'].sort_values(by='years', key=lambda x: abs(year-x), inplace=True, ignore_index=True)\n", + "trained['basics']['years'].drop('years', axis=1, inplace=True)\n", + "trained['basics']['years'].reset_index(names='years_index', inplace=True)\n", + "\n", + "# Principals\n", + "\n", + "df = pd.read_csv('./IMDB_data_sets/filtered/principals.csv', dtype={'tconst': str, 'nconst': str}, usecols=['tconst', 'nconst'])\n", + "df.nconst = df.nconst.apply(lambda n: ','.join(literal_eval(n)))\n", + "\n", + "cv = CountVectorizer(dtype=np.int8, token_pattern=\"(?u)[\\w'-]+\")\n", + "count_matrix = cv.fit_transform(df['nconst'])\n", + "\n", + "trained['principals']['nconst'] = pd.DataFrame(\n", + " {\n", + " 'nconst': cosine_similarity(count_matrix[df[df['tconst'] == input_film].index[0]], count_matrix)[0],\n", + " 'tconst': df['tconst']\n", + " }\n", + " )\n", + "\n", + "drop_by_tconst(trained['principals']['nconst'], input_film)\n", + "trained['principals']['nconst'].sort_values(ascending=False, by='nconst', inplace=True, ignore_index=True)\n", + "trained['principals']['nconst'].drop('nconst', axis=1, inplace=True)\n", + "trained['principals']['nconst'].reset_index(names='nconst_index', inplace=True)\n", + "\n", + "# Ratings\n", + "\n", + "df = pd.read_csv('./IMDB_data_sets/filtered/ratings.csv', dtype={'tconst': str, 'averageRating': float, 'numVotes': 'Int64'})\n", + "\n", + "rating = float(df[df['tconst'] == input_film].averageRating.iloc[0])\n", + "votes = int(df[df['tconst'] == input_film].numVotes.iloc[0])\n", + "\n", + "drop_by_tconst(df, input_film)\n", + "\n", + "trained['ratings']['ratings'] = df.sort_values(by='averageRating', key=lambda x: abs(rating-x), ignore_index=True)\n", + "trained['ratings']['ratings'].drop(['averageRating', 'numVotes'], axis=1, inplace=True)\n", + "trained['ratings']['ratings'].reset_index(names='ratings_index', inplace=True)\n", + "\n", + "df.drop('averageRating', axis=1, inplace=True)\n", + "\n", + "trained['ratings']['votes'] = df.sort_values(by='numVotes', key=lambda x: abs(votes-x), ignore_index=True)\n", + "trained['ratings']['votes'].drop('numVotes', axis=1, inplace=True)\n", + "trained['ratings']['votes'].reset_index(names='votes_index', inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "merged = reduce(lambda left, right: pd.merge(\n", + " left,\n", + " right,\n", + " on=['tconst'],\n", + " how='outer'\n", + " ), [\n", + " trained['basics']['genres'],\n", + " trained['basics']['years'],\n", + " trained['principals']['nconst'],\n", + " trained['ratings']['ratings'],\n", + " trained['ratings']['votes']\n", + " ])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tconstyears_indexnconst_indexratings_indexvotes_indexaverage
0tt4255564744690297616669670669670476329.2
1tt2203897272939705602978602978248591.0
2tt0355627344502708640318038205177315271.8
3tt1538737871021498486540358540358377883.8
4tt515534072975386406103102152733143044.0
.....................
777383tt123021118987159982627843425539374210.6
777384tt12302076149946599825483066483066498657.4
777385tt123020618988559982330196925847378981.8
777386tt1230179255769599809483065483065519818.8
777387tt991675439373777387777387777387629784.2
\n", + "

777388 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " tconst years_index nconst_index ratings_index votes_index \n", + "0 tt4255564 744690 297616 669670 669670 \\\n", + "1 tt2203897 27293 9705 602978 602978 \n", + "2 tt0355627 344502 708640 318038 205177 \n", + "3 tt15387378 710214 98486 540358 540358 \n", + "4 tt5155340 72975 386406 103102 152733 \n", + "... ... ... ... ... ... \n", + "777383 tt1230211 189871 599826 278434 25539 \n", + "777384 tt12302076 149946 599825 483066 483066 \n", + "777385 tt1230206 189885 599823 301969 25847 \n", + "777386 tt1230179 255769 599809 483065 483065 \n", + "777387 tt9916754 39373 777387 777387 777387 \n", + "\n", + " average \n", + "0 476329.2 \n", + "1 248591.0 \n", + "2 315271.8 \n", + "3 377883.8 \n", + "4 143044.0 \n", + "... ... \n", + "777383 374210.6 \n", + "777384 498657.4 \n", + "777385 378981.8 \n", + "777386 519818.8 \n", + "777387 629784.2 \n", + "\n", + "[777388 rows x 6 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged['average'] = (merged.index*20 + merged.years_index*20 + merged.nconst_index*20 + merged.ratings_index*20 + merged.votes_index*20) / (5*20)\n", + "merged" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
tconstyears_indexnconst_indexratings_indexvotes_indexaverage
8695tt233815177751258623860115110813.4
14tt36593884965498257587915120.6
8501tt17546563099311646193324717810.0
7374tt210328111453279104961834719340.4
7549tt2358592549851763312668918220403.4
.....................
758545tt13334656700841672174500543500543626529.2
758577tt13336544700845672184500576500576626551.6
758587tt13335546700843672231500564500564626557.8
758590tt13335152700842672247500557500557626558.6
777387tt991675439373777387777387777387629784.2
\n", + "

777388 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " tconst years_index nconst_index ratings_index votes_index \n", + "8695 tt2338151 7775 12586 23860 1151 \\\n", + "14 tt3659388 49654 98 25758 79 \n", + "8501 tt1754656 30993 116 46193 3247 \n", + "7374 tt2103281 11453 27910 49618 347 \n", + "7549 tt2358592 54985 17633 12668 9182 \n", + "... ... ... ... ... ... \n", + "758545 tt13334656 700841 672174 500543 500543 \n", + "758577 tt13336544 700845 672184 500576 500576 \n", + "758587 tt13335546 700843 672231 500564 500564 \n", + "758590 tt13335152 700842 672247 500557 500557 \n", + "777387 tt9916754 39373 777387 777387 777387 \n", + "\n", + " average \n", + "8695 10813.4 \n", + "14 15120.6 \n", + "8501 17810.0 \n", + "7374 19340.4 \n", + "7549 20403.4 \n", + "... ... \n", + "758545 626529.2 \n", + "758577 626551.6 \n", + "758587 626557.8 \n", + "758590 626558.6 \n", + "777387 629784.2 \n", + "\n", + "[777388 rows x 6 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged.sort_values(by='average')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "08dff0a1cb2e37beec5bc340112a669cde11fa0a1a1e2fde92884d26090bd6fc" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/main.py b/main.py new file mode 100644 index 0000000..e042b40 --- /dev/null +++ b/main.py @@ -0,0 +1,23 @@ +from IMDB_DTO import DTO +from time import time +import pandas as pd + + +if __name__ == '__main__': + start_time = time() + dto = DTO() + + tconst_list = dto.filter_tconst(name='title.basics.tsv') + dto.df2csv(df=pd.DataFrame({'tconst': tconst_list}), name='tconst.csv') + + tconst_list = dto.get_tconst('tconst.csv') + + df = dto.filter_basic(name='title.basics.tsv', tconst_list=tconst_list) + dto.df2csv(df=df, name='basics.csv') + del df + df = dto.filter_principal(name='title.principals.tsv', tconst_list=tconst_list) + dto.df2csv(df=df, name='principals_comma.csv', overwrite=True) + del df + df = dto.filter_rating(name='title.ratings.tsv', tconst_list=tconst_list) + dto.df2csv(df=df, name='ratings.csv', overwrite=1) + del df \ No newline at end of file