mirror of
https://github.com/aykhans/movier.git
synced 2025-04-20 21:12:46 +00:00
First commit
This commit is contained in:
commit
b60673ad25
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
IMDB_data_sets
|
||||||
|
.venv
|
||||||
|
.ipynb_checkpoints
|
||||||
|
__pycache__
|
332
IMDB_DTO.py
Executable file
332
IMDB_DTO.py
Executable file
@ -0,0 +1,332 @@
|
|||||||
|
from functools import wraps
from pathlib import Path
from time import time

import numpy as np
import pandas as pd

from exceptions import (
    FileExistException,
    FileNotExistException
)
|
||||||
|
|
||||||
|
|
||||||
|
BASE_DIR = Path(__file__).resolve().parent
|
||||||
|
|
||||||
|
class DTO:
    """Filter raw IMDB TSV data dumps down to a selected set of titles.

    Reads the (very large) IMDB data sets chunk by chunk, keeps only the
    rows that belong to the requested title ids (``tconst``), and
    saves/loads the filtered results as CSV files under ``save_dir``.
    """

    def __init__(
        self,
        save_dir=(BASE_DIR / 'IMDB_data_sets/filtered/'),
        read_dir=(BASE_DIR / 'IMDB_data_sets/'),
        default_chunksize: int = 3_000_000
    ) -> None:
        """
        Parameters
        ----------
        save_dir : pathlib.Path, optional
            Folder location to save files (default is BASE_DIR / 'IMDB_data_sets/filtered/')
        read_dir : pathlib.Path, optional
            Folder location to read files from (default is BASE_DIR / 'IMDB_data_sets/')
        default_chunksize : int, optional
            Default value to be used when chunksize is not given in methods
            that take chunksize parameters (default is 3_000_000)
        """
        self.save_dir = save_dir
        # Create the output folder up front so df2csv can always write to it.
        self.save_dir.mkdir(parents=True, exist_ok=True)
        self.read_dir = read_dir
        self.default_chunksize = default_chunksize

    def timing_decorator(func):
        """Print how many seconds the decorated method took to run.

        NOTE: applied at class-definition time, where ``func`` is still a
        plain function, so no ``self`` handling is needed here.
        """
        @wraps(func)  # preserve the wrapped function's name and docstring
        def wrapper(*args, **kwargs):
            start_time = time()
            result = func(*args, **kwargs)
            print(f"Function {func.__name__} took {time() - start_time} seconds to run.")
            return result
        return wrapper

    def is_exist(self, file_dir: Path) -> None:
        """Raise if the given file already exists.

        Parameters
        ----------
        file_dir : pathlib.Path
            File path

        Raises
        ------
        FileExistException
            If the file exists
        """
        if file_dir.is_file():
            raise FileExistException(f"file is exist: {file_dir}")

    def is_not_exist(self, file_dir: Path) -> None:
        """Raise if the given file does not exist.

        Parameters
        ----------
        file_dir : pathlib.Path
            File path

        Raises
        ------
        FileNotExistException
            If the file does not exist
        """
        if not file_dir.is_file():
            raise FileNotExistException(f"file is not exist: {file_dir}")

    def df2csv(
        self,
        df: pd.DataFrame,
        name: str,
        overwrite: bool = False,
        index: bool = False
    ) -> None:
        """Save a DataFrame as a CSV file under ``save_dir``.

        Parameters
        ----------
        df : DataFrame
            DataFrame object you want to save
        name : str
            The name you want to save the DataFrame object under
        overwrite : bool, optional
            When True, overwrite if file exists (default is False)
        index : bool, optional
            Whether to save the index column (default is False)

        Raises
        ------
        FileExistException
            If the overwrite parameter is False and the file exists
        """
        if not overwrite:
            self.is_exist(self.save_dir / name)
        df.to_csv(self.save_dir / name, index=index)

    @timing_decorator
    def filter_tconst(
        self,
        name: str,
        title_types: list[str] = ['movie', 'tvMovie'],
        chunksize: int | None = None
    ) -> list[str]:
        """Collect the tconst ids of all titles with a wanted titleType.

        Parameters
        ----------
        name : str
            Name of the basics file to be read
        title_types : list, optional
            'titleType' values of rows to be kept (default is ['movie', 'tvMovie'])
        chunksize : int, optional
            Chunk size for reading data (default is self.default_chunksize
            (default is 3_000_000)).

        Returns
        -------
        list
            A list of tconst

        Raises
        ------
        FileNotExistException
            If the file does not exist
        """
        self.is_not_exist(self.read_dir / name)
        if chunksize is None:
            chunksize = self.default_chunksize

        tconst_list = []

        with pd.read_csv(
                self.read_dir / name,
                sep=r'\t',
                chunksize=chunksize,
                engine='python',
                usecols=['tconst', 'titleType'],
                dtype={'tconst': str, 'titleType': str},
                na_values='\\N') as reader:

            for chunk in reader:
                # Keep only the ids whose titleType is one of the wanted types.
                tconst_list += list(chunk[chunk.titleType.isin(title_types)]['tconst'])
        return tconst_list

    def get_tconst(self, name: str) -> list[str]:
        """Load a previously saved tconst list from ``save_dir``.

        Parameters
        ----------
        name : str
            Name of the tconst file to be read

        Returns
        -------
        list
            A list of tconst

        Raises
        ------
        FileNotExistException
            If the file does not exist
        """
        self.is_not_exist(self.save_dir / name)
        return list(pd.read_csv(self.save_dir / name, usecols=['tconst'], dtype={'tconst': str})['tconst'])

    @timing_decorator
    def filter_principal(
        self,
        name: str,
        tconst_list: list[str],
        category_list: list[str] = ['actress', 'actor', 'director', 'writer'],
        chunksize: int | None = None
    ) -> pd.DataFrame:
        """Collect the people (nconst) attached to each selected title.

        Parameters
        ----------
        name : str
            Name of the principals file to be read
        tconst_list : list
            List of tconst (It can be obtained by the get_tconst method).
        category_list : list, optional
            Categories of rows to be selected (default is
            ['actress', 'actor', 'director', 'writer']).
        chunksize : int, optional
            Chunk size for reading data (default is self.default_chunksize
            (default is 3_000_000)).

        Returns
        -------
        DataFrame
            A DataFrame object with columns tconst, nconst, and category.

        Raises
        ------
        FileNotExistException
            If the file does not exist
        """
        self.is_not_exist(self.read_dir / name)
        if chunksize is None:
            chunksize = self.default_chunksize

        # Seed with one empty-list row per title so every tconst survives the
        # per-chunk merge even when no principals are found for it.
        df = pd.DataFrame({
            'tconst': tconst_list,
            'nconst': np.empty((len(tconst_list), 0)).tolist(),
            'category': np.empty((len(tconst_list), 0)).tolist()
        })

        with pd.read_csv(self.read_dir / name,
                         sep=r'\t',
                         chunksize=chunksize,
                         engine='python',
                         usecols=['tconst', 'nconst', 'category']) as reader:

            for chunk in reader:
                chunk = chunk.query("(tconst in @tconst_list) and (category in @category_list)")
                chunk_group = chunk.groupby('tconst', as_index=False).agg(
                    {'nconst': list, 'category': list})
                # sum concatenates this chunk's lists onto the accumulated ones.
                df = pd.concat([df, chunk_group]).groupby('tconst', as_index=False).agg(sum)
                del chunk_group
        return df

    @timing_decorator
    def filter_rating(
        self,
        name: str,
        tconst_list: list[str],
        chunksize: int | None = None
    ) -> pd.DataFrame:
        """Collect the rating row of each selected title.

        Parameters
        ----------
        name : str
            Name of the ratings file to be read
        tconst_list : list
            List of tconst (It can be obtained by the get_tconst method).
        chunksize : int, optional
            Chunk size for reading data (default is self.default_chunksize
            (default is 3_000_000)).

        Returns
        -------
        DataFrame
            A DataFrame object with columns tconst, averageRating and numVotes.

        Raises
        ------
        FileNotExistException
            If the file does not exist
        """
        self.is_not_exist(self.read_dir / name)
        if chunksize is None:
            chunksize = self.default_chunksize

        df = pd.DataFrame({'tconst': tconst_list})

        with pd.read_csv(
                self.read_dir / name,
                sep=r'\t',
                chunksize=chunksize,
                engine='python',
                usecols=['tconst', 'averageRating', 'numVotes'],
                dtype={'tconst': str, 'averageRating': np.float16, 'numVotes': int},
                na_values='\\N') as reader:

            for chunk in reader:
                # first() keeps the seeded row only when no data row matched.
                df = pd.concat([df, chunk.query("tconst in @tconst_list")]).groupby('tconst', as_index=False).first()
        return df

    @timing_decorator
    def filter_basic(
        self,
        name: str,
        tconst_list: list[str],
        chunksize: int | None = None
    ) -> pd.DataFrame:
        """Collect the year and genres of each selected title.

        Parameters
        ----------
        name : str
            Name of the basics file to be read
        tconst_list : list
            List of tconst (It can be obtained by the get_tconst method).
        chunksize : int, optional
            Chunk size for reading data (default is self.default_chunksize
            (default is 3_000_000)).

        Returns
        -------
        DataFrame
            A DataFrame object with columns tconst, startYear and genres.

        Raises
        ------
        FileNotExistException
            If the file does not exist
        """
        self.is_not_exist(self.read_dir / name)
        if chunksize is None:
            chunksize = self.default_chunksize

        df = pd.DataFrame({'tconst': tconst_list})

        with pd.read_csv(self.read_dir / name,
                         sep=r'\t',
                         chunksize=chunksize,
                         engine='python',
                         usecols=['tconst', 'startYear', 'genres'],
                         dtype={'tconst': str, 'startYear': 'Int16', 'genres': str},
                         na_values='\\N') as reader:

            for chunk in reader:
                # first() keeps the seeded row only when no data row matched.
                df = pd.concat([df, chunk.query("tconst in @tconst_list")]).groupby('tconst', as_index=False).first()
        return df
|
3
exceptions.py
Normal file
3
exceptions.py
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
class FileExistException(Exception):
    """Raised when a file that must not be overwritten already exists."""
|
||||||
|
|
||||||
|
class FileNotExistException(Exception):
    """Raised when a required input file is missing."""
|
517
imdb_recommendation.ipynb
Executable file
517
imdb_recommendation.ipynb
Executable file
@ -0,0 +1,517 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import numpy as np\n",
|
||||||
|
"from sklearn.feature_extraction.text import CountVectorizer\n",
|
||||||
|
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
||||||
|
"from ast import literal_eval\n",
|
||||||
|
"from functools import reduce"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"input_film = 'tt0816692'\n",
|
||||||
|
"\n",
|
||||||
|
"trained = {'basics': {}, 'principals': {}, 'ratings': {}}\n",
|
||||||
|
"\n",
|
||||||
|
"def drop_by_tconst(df, tconst: str, inplace=True) -> pd.DataFrame:\n",
|
||||||
|
" return df.drop(df[df['tconst'] == tconst].index, inplace=inplace)\n",
|
||||||
|
"\n",
|
||||||
|
"# Basics\n",
|
||||||
|
"\n",
|
||||||
|
"df = pd.read_csv('./IMDB_data_sets/filtered/basics.csv', dtype={'tconst': str, 'startYear': 'Int16', 'genres': str})\n",
|
||||||
|
"df['genres'].fillna('', inplace=True)\n",
|
||||||
|
"\n",
|
||||||
|
"cv = CountVectorizer(dtype=np.int8, token_pattern=\"(?u)[\\w'-]+\")\n",
|
||||||
|
"count_matrix = cv.fit_transform(df['genres'])\n",
|
||||||
|
"\n",
|
||||||
|
"trained['basics']['genres'] = pd.DataFrame(\n",
|
||||||
|
" {\n",
|
||||||
|
" 'genres': cosine_similarity(count_matrix[df[df['tconst'] == input_film].index[0]], count_matrix)[0],\n",
|
||||||
|
" 'tconst': df['tconst']\n",
|
||||||
|
" }\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
"drop_by_tconst(trained['basics']['genres'], input_film)\n",
|
||||||
|
"\n",
|
||||||
|
"trained['basics']['genres'].sort_values(ascending=False, by='genres', inplace=True, ignore_index=True)\n",
|
||||||
|
"trained['basics']['genres'].drop('genres', axis=1, inplace=True)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"year = int(df[df['tconst'] == input_film].startYear.iloc[0])\n",
|
||||||
|
"\n",
|
||||||
|
"trained['basics']['years'] = pd.DataFrame(\n",
|
||||||
|
" {\n",
|
||||||
|
" 'years': df['startYear'],\n",
|
||||||
|
" 'tconst': df['tconst']\n",
|
||||||
|
" }\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"drop_by_tconst(trained['basics']['years'], input_film)\n",
|
||||||
|
"trained['basics']['years'].sort_values(by='years', key=lambda x: abs(year-x), inplace=True, ignore_index=True)\n",
|
||||||
|
"trained['basics']['years'].drop('years', axis=1, inplace=True)\n",
|
||||||
|
"trained['basics']['years'].reset_index(names='years_index', inplace=True)\n",
|
||||||
|
"\n",
|
||||||
|
"# Principals\n",
|
||||||
|
"\n",
|
||||||
|
"df = pd.read_csv('./IMDB_data_sets/filtered/principals.csv', dtype={'tconst': str, 'nconst': str}, usecols=['tconst', 'nconst'])\n",
|
||||||
|
"df.nconst = df.nconst.apply(lambda n: ','.join(literal_eval(n)))\n",
|
||||||
|
"\n",
|
||||||
|
"cv = CountVectorizer(dtype=np.int8, token_pattern=\"(?u)[\\w'-]+\")\n",
|
||||||
|
"count_matrix = cv.fit_transform(df['nconst'])\n",
|
||||||
|
"\n",
|
||||||
|
"trained['principals']['nconst'] = pd.DataFrame(\n",
|
||||||
|
" {\n",
|
||||||
|
" 'nconst': cosine_similarity(count_matrix[df[df['tconst'] == input_film].index[0]], count_matrix)[0],\n",
|
||||||
|
" 'tconst': df['tconst']\n",
|
||||||
|
" }\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
"drop_by_tconst(trained['principals']['nconst'], input_film)\n",
|
||||||
|
"trained['principals']['nconst'].sort_values(ascending=False, by='nconst', inplace=True, ignore_index=True)\n",
|
||||||
|
"trained['principals']['nconst'].drop('nconst', axis=1, inplace=True)\n",
|
||||||
|
"trained['principals']['nconst'].reset_index(names='nconst_index', inplace=True)\n",
|
||||||
|
"\n",
|
||||||
|
"# Ratings\n",
|
||||||
|
"\n",
|
||||||
|
"df = pd.read_csv('./IMDB_data_sets/filtered/ratings.csv', dtype={'tconst': str, 'averageRating': float, 'numVotes': 'Int64'})\n",
|
||||||
|
"\n",
|
||||||
|
"rating = float(df[df['tconst'] == input_film].averageRating.iloc[0])\n",
|
||||||
|
"votes = int(df[df['tconst'] == input_film].numVotes.iloc[0])\n",
|
||||||
|
"\n",
|
||||||
|
"drop_by_tconst(df, input_film)\n",
|
||||||
|
"\n",
|
||||||
|
"trained['ratings']['ratings'] = df.sort_values(by='averageRating', key=lambda x: abs(rating-x), ignore_index=True)\n",
|
||||||
|
"trained['ratings']['ratings'].drop(['averageRating', 'numVotes'], axis=1, inplace=True)\n",
|
||||||
|
"trained['ratings']['ratings'].reset_index(names='ratings_index', inplace=True)\n",
|
||||||
|
"\n",
|
||||||
|
"df.drop('averageRating', axis=1, inplace=True)\n",
|
||||||
|
"\n",
|
||||||
|
"trained['ratings']['votes'] = df.sort_values(by='numVotes', key=lambda x: abs(votes-x), ignore_index=True)\n",
|
||||||
|
"trained['ratings']['votes'].drop('numVotes', axis=1, inplace=True)\n",
|
||||||
|
"trained['ratings']['votes'].reset_index(names='votes_index', inplace=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"merged = reduce(lambda left, right: pd.merge(\n",
|
||||||
|
" left,\n",
|
||||||
|
" right,\n",
|
||||||
|
" on=['tconst'],\n",
|
||||||
|
" how='outer'\n",
|
||||||
|
" ), [\n",
|
||||||
|
" trained['basics']['genres'],\n",
|
||||||
|
" trained['basics']['years'],\n",
|
||||||
|
" trained['principals']['nconst'],\n",
|
||||||
|
" trained['ratings']['ratings'],\n",
|
||||||
|
" trained['ratings']['votes']\n",
|
||||||
|
" ])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>tconst</th>\n",
|
||||||
|
" <th>years_index</th>\n",
|
||||||
|
" <th>nconst_index</th>\n",
|
||||||
|
" <th>ratings_index</th>\n",
|
||||||
|
" <th>votes_index</th>\n",
|
||||||
|
" <th>average</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>0</th>\n",
|
||||||
|
" <td>tt4255564</td>\n",
|
||||||
|
" <td>744690</td>\n",
|
||||||
|
" <td>297616</td>\n",
|
||||||
|
" <td>669670</td>\n",
|
||||||
|
" <td>669670</td>\n",
|
||||||
|
" <td>476329.2</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>1</th>\n",
|
||||||
|
" <td>tt2203897</td>\n",
|
||||||
|
" <td>27293</td>\n",
|
||||||
|
" <td>9705</td>\n",
|
||||||
|
" <td>602978</td>\n",
|
||||||
|
" <td>602978</td>\n",
|
||||||
|
" <td>248591.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>2</th>\n",
|
||||||
|
" <td>tt0355627</td>\n",
|
||||||
|
" <td>344502</td>\n",
|
||||||
|
" <td>708640</td>\n",
|
||||||
|
" <td>318038</td>\n",
|
||||||
|
" <td>205177</td>\n",
|
||||||
|
" <td>315271.8</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>3</th>\n",
|
||||||
|
" <td>tt15387378</td>\n",
|
||||||
|
" <td>710214</td>\n",
|
||||||
|
" <td>98486</td>\n",
|
||||||
|
" <td>540358</td>\n",
|
||||||
|
" <td>540358</td>\n",
|
||||||
|
" <td>377883.8</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>4</th>\n",
|
||||||
|
" <td>tt5155340</td>\n",
|
||||||
|
" <td>72975</td>\n",
|
||||||
|
" <td>386406</td>\n",
|
||||||
|
" <td>103102</td>\n",
|
||||||
|
" <td>152733</td>\n",
|
||||||
|
" <td>143044.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>...</th>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>777383</th>\n",
|
||||||
|
" <td>tt1230211</td>\n",
|
||||||
|
" <td>189871</td>\n",
|
||||||
|
" <td>599826</td>\n",
|
||||||
|
" <td>278434</td>\n",
|
||||||
|
" <td>25539</td>\n",
|
||||||
|
" <td>374210.6</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>777384</th>\n",
|
||||||
|
" <td>tt12302076</td>\n",
|
||||||
|
" <td>149946</td>\n",
|
||||||
|
" <td>599825</td>\n",
|
||||||
|
" <td>483066</td>\n",
|
||||||
|
" <td>483066</td>\n",
|
||||||
|
" <td>498657.4</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>777385</th>\n",
|
||||||
|
" <td>tt1230206</td>\n",
|
||||||
|
" <td>189885</td>\n",
|
||||||
|
" <td>599823</td>\n",
|
||||||
|
" <td>301969</td>\n",
|
||||||
|
" <td>25847</td>\n",
|
||||||
|
" <td>378981.8</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>777386</th>\n",
|
||||||
|
" <td>tt1230179</td>\n",
|
||||||
|
" <td>255769</td>\n",
|
||||||
|
" <td>599809</td>\n",
|
||||||
|
" <td>483065</td>\n",
|
||||||
|
" <td>483065</td>\n",
|
||||||
|
" <td>519818.8</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>777387</th>\n",
|
||||||
|
" <td>tt9916754</td>\n",
|
||||||
|
" <td>39373</td>\n",
|
||||||
|
" <td>777387</td>\n",
|
||||||
|
" <td>777387</td>\n",
|
||||||
|
" <td>777387</td>\n",
|
||||||
|
" <td>629784.2</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"<p>777388 rows × 6 columns</p>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" tconst years_index nconst_index ratings_index votes_index \n",
|
||||||
|
"0 tt4255564 744690 297616 669670 669670 \\\n",
|
||||||
|
"1 tt2203897 27293 9705 602978 602978 \n",
|
||||||
|
"2 tt0355627 344502 708640 318038 205177 \n",
|
||||||
|
"3 tt15387378 710214 98486 540358 540358 \n",
|
||||||
|
"4 tt5155340 72975 386406 103102 152733 \n",
|
||||||
|
"... ... ... ... ... ... \n",
|
||||||
|
"777383 tt1230211 189871 599826 278434 25539 \n",
|
||||||
|
"777384 tt12302076 149946 599825 483066 483066 \n",
|
||||||
|
"777385 tt1230206 189885 599823 301969 25847 \n",
|
||||||
|
"777386 tt1230179 255769 599809 483065 483065 \n",
|
||||||
|
"777387 tt9916754 39373 777387 777387 777387 \n",
|
||||||
|
"\n",
|
||||||
|
" average \n",
|
||||||
|
"0 476329.2 \n",
|
||||||
|
"1 248591.0 \n",
|
||||||
|
"2 315271.8 \n",
|
||||||
|
"3 377883.8 \n",
|
||||||
|
"4 143044.0 \n",
|
||||||
|
"... ... \n",
|
||||||
|
"777383 374210.6 \n",
|
||||||
|
"777384 498657.4 \n",
|
||||||
|
"777385 378981.8 \n",
|
||||||
|
"777386 519818.8 \n",
|
||||||
|
"777387 629784.2 \n",
|
||||||
|
"\n",
|
||||||
|
"[777388 rows x 6 columns]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"merged['average'] = (merged.index*20 + merged.years_index*20 + merged.nconst_index*20 + merged.ratings_index*20 + merged.votes_index*20) / (5*20)\n",
|
||||||
|
"merged"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/html": [
|
||||||
|
"<div>\n",
|
||||||
|
"<style scoped>\n",
|
||||||
|
" .dataframe tbody tr th:only-of-type {\n",
|
||||||
|
" vertical-align: middle;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe tbody tr th {\n",
|
||||||
|
" vertical-align: top;\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" .dataframe thead th {\n",
|
||||||
|
" text-align: right;\n",
|
||||||
|
" }\n",
|
||||||
|
"</style>\n",
|
||||||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
||||||
|
" <thead>\n",
|
||||||
|
" <tr style=\"text-align: right;\">\n",
|
||||||
|
" <th></th>\n",
|
||||||
|
" <th>tconst</th>\n",
|
||||||
|
" <th>years_index</th>\n",
|
||||||
|
" <th>nconst_index</th>\n",
|
||||||
|
" <th>ratings_index</th>\n",
|
||||||
|
" <th>votes_index</th>\n",
|
||||||
|
" <th>average</th>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </thead>\n",
|
||||||
|
" <tbody>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>8695</th>\n",
|
||||||
|
" <td>tt2338151</td>\n",
|
||||||
|
" <td>7775</td>\n",
|
||||||
|
" <td>12586</td>\n",
|
||||||
|
" <td>23860</td>\n",
|
||||||
|
" <td>1151</td>\n",
|
||||||
|
" <td>10813.4</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>14</th>\n",
|
||||||
|
" <td>tt3659388</td>\n",
|
||||||
|
" <td>49654</td>\n",
|
||||||
|
" <td>98</td>\n",
|
||||||
|
" <td>25758</td>\n",
|
||||||
|
" <td>79</td>\n",
|
||||||
|
" <td>15120.6</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>8501</th>\n",
|
||||||
|
" <td>tt1754656</td>\n",
|
||||||
|
" <td>30993</td>\n",
|
||||||
|
" <td>116</td>\n",
|
||||||
|
" <td>46193</td>\n",
|
||||||
|
" <td>3247</td>\n",
|
||||||
|
" <td>17810.0</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>7374</th>\n",
|
||||||
|
" <td>tt2103281</td>\n",
|
||||||
|
" <td>11453</td>\n",
|
||||||
|
" <td>27910</td>\n",
|
||||||
|
" <td>49618</td>\n",
|
||||||
|
" <td>347</td>\n",
|
||||||
|
" <td>19340.4</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>7549</th>\n",
|
||||||
|
" <td>tt2358592</td>\n",
|
||||||
|
" <td>54985</td>\n",
|
||||||
|
" <td>17633</td>\n",
|
||||||
|
" <td>12668</td>\n",
|
||||||
|
" <td>9182</td>\n",
|
||||||
|
" <td>20403.4</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>...</th>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" <td>...</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>758545</th>\n",
|
||||||
|
" <td>tt13334656</td>\n",
|
||||||
|
" <td>700841</td>\n",
|
||||||
|
" <td>672174</td>\n",
|
||||||
|
" <td>500543</td>\n",
|
||||||
|
" <td>500543</td>\n",
|
||||||
|
" <td>626529.2</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>758577</th>\n",
|
||||||
|
" <td>tt13336544</td>\n",
|
||||||
|
" <td>700845</td>\n",
|
||||||
|
" <td>672184</td>\n",
|
||||||
|
" <td>500576</td>\n",
|
||||||
|
" <td>500576</td>\n",
|
||||||
|
" <td>626551.6</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>758587</th>\n",
|
||||||
|
" <td>tt13335546</td>\n",
|
||||||
|
" <td>700843</td>\n",
|
||||||
|
" <td>672231</td>\n",
|
||||||
|
" <td>500564</td>\n",
|
||||||
|
" <td>500564</td>\n",
|
||||||
|
" <td>626557.8</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>758590</th>\n",
|
||||||
|
" <td>tt13335152</td>\n",
|
||||||
|
" <td>700842</td>\n",
|
||||||
|
" <td>672247</td>\n",
|
||||||
|
" <td>500557</td>\n",
|
||||||
|
" <td>500557</td>\n",
|
||||||
|
" <td>626558.6</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" <tr>\n",
|
||||||
|
" <th>777387</th>\n",
|
||||||
|
" <td>tt9916754</td>\n",
|
||||||
|
" <td>39373</td>\n",
|
||||||
|
" <td>777387</td>\n",
|
||||||
|
" <td>777387</td>\n",
|
||||||
|
" <td>777387</td>\n",
|
||||||
|
" <td>629784.2</td>\n",
|
||||||
|
" </tr>\n",
|
||||||
|
" </tbody>\n",
|
||||||
|
"</table>\n",
|
||||||
|
"<p>777388 rows × 6 columns</p>\n",
|
||||||
|
"</div>"
|
||||||
|
],
|
||||||
|
"text/plain": [
|
||||||
|
" tconst years_index nconst_index ratings_index votes_index \n",
|
||||||
|
"8695 tt2338151 7775 12586 23860 1151 \\\n",
|
||||||
|
"14 tt3659388 49654 98 25758 79 \n",
|
||||||
|
"8501 tt1754656 30993 116 46193 3247 \n",
|
||||||
|
"7374 tt2103281 11453 27910 49618 347 \n",
|
||||||
|
"7549 tt2358592 54985 17633 12668 9182 \n",
|
||||||
|
"... ... ... ... ... ... \n",
|
||||||
|
"758545 tt13334656 700841 672174 500543 500543 \n",
|
||||||
|
"758577 tt13336544 700845 672184 500576 500576 \n",
|
||||||
|
"758587 tt13335546 700843 672231 500564 500564 \n",
|
||||||
|
"758590 tt13335152 700842 672247 500557 500557 \n",
|
||||||
|
"777387 tt9916754 39373 777387 777387 777387 \n",
|
||||||
|
"\n",
|
||||||
|
" average \n",
|
||||||
|
"8695 10813.4 \n",
|
||||||
|
"14 15120.6 \n",
|
||||||
|
"8501 17810.0 \n",
|
||||||
|
"7374 19340.4 \n",
|
||||||
|
"7549 20403.4 \n",
|
||||||
|
"... ... \n",
|
||||||
|
"758545 626529.2 \n",
|
||||||
|
"758577 626551.6 \n",
|
||||||
|
"758587 626557.8 \n",
|
||||||
|
"758590 626558.6 \n",
|
||||||
|
"777387 629784.2 \n",
|
||||||
|
"\n",
|
||||||
|
"[777388 rows x 6 columns]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"merged.sort_values(by='average')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": ".venv",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.12"
|
||||||
|
},
|
||||||
|
"orig_nbformat": 4,
|
||||||
|
"vscode": {
|
||||||
|
"interpreter": {
|
||||||
|
"hash": "08dff0a1cb2e37beec5bc340112a669cde11fa0a1a1e2fde92884d26090bd6fc"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
23
main.py
Normal file
23
main.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
from IMDB_DTO import DTO
|
||||||
|
from time import time
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    start_time = time()
    dto = DTO()

    # Build the list of wanted title ids and persist it.
    tconst_list = dto.filter_tconst(name='title.basics.tsv')
    dto.df2csv(df=pd.DataFrame({'tconst': tconst_list}), name='tconst.csv')

    tconst_list = dto.get_tconst('tconst.csv')

    # Filter each raw dump down to the selected titles one at a time,
    # dropping each DataFrame before loading the next to limit memory use.
    df = dto.filter_basic(name='title.basics.tsv', tconst_list=tconst_list)
    dto.df2csv(df=df, name='basics.csv')
    del df
    df = dto.filter_principal(name='title.principals.tsv', tconst_list=tconst_list)
    dto.df2csv(df=df, name='principals_comma.csv', overwrite=True)
    del df
    df = dto.filter_rating(name='title.ratings.tsv', tconst_list=tconst_list)
    # was overwrite=1 — use a real bool, matching the parameter's annotation
    dto.df2csv(df=df, name='ratings.csv', overwrite=True)
    del df

    # start_time was captured but never reported before — print total runtime.
    print(f"Total run took {time() - start_time} seconds.")
|
Loading…
x
Reference in New Issue
Block a user