From b60673ad25023727eba0f348175cab8bb1487e5a Mon Sep 17 00:00:00 2001
From: Aykhan <aykhan.shahs0@gmail.com>
Date: Sun, 6 Aug 2023 22:48:47 +0400
Subject: [PATCH] First commit

---
 .gitignore                |   4 +
 IMDB_DTO.py               | 332 ++++++++++++++++++++++++
 exceptions.py             |   3 +
 imdb_recommendation.ipynb | 517 ++++++++++++++++++++++++++++++++++++++
 main.py                   |  23 ++
 5 files changed, 879 insertions(+)
 create mode 100644 .gitignore
 create mode 100755 IMDB_DTO.py
 create mode 100644 exceptions.py
 create mode 100755 imdb_recommendation.ipynb
 create mode 100644 main.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..ee25eca
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+IMDB_data_sets
+.venv
+.ipynb_checkpoints
+__pycache__
\ No newline at end of file
diff --git a/IMDB_DTO.py b/IMDB_DTO.py
new file mode 100755
index 0000000..9a4abac
--- /dev/null
+++ b/IMDB_DTO.py
@@ -0,0 +1,332 @@
+from pathlib import Path
+import pandas as pd
+import numpy as np
+from time import time
+from exceptions import (
+    FileExistException,
+    FileNotExistException
+)
+
+
+BASE_DIR = Path(__file__).resolve().parent
+
+class DTO:
+    def __init__(
+            self,
+            save_dir=(BASE_DIR / 'IMDB_data_sets/filtered/'),
+            read_dir=(BASE_DIR / 'IMDB_data_sets/'),
+            default_chunksize: int=3_000_000
+        ) -> None:
+        """
+            Parameters
+            ----------
+            save_dir : pathlib.Path, optional
+                Folder location to save files (default is BASE_DIR / 'IMDB_data_sets/filtered/')
+            read_dir : pathlib.Path, optional
+                Folder location to read files from (default is BASE_DIR / 'IMDB_data_sets/')
+            default_chunksize : int, optional
+                Default value to be used when chunksize is not given in methods that take
+                chunksize parameters (default is 3_000_000)
+        """
+
+        self.save_dir = save_dir
+        self.save_dir.mkdir(parents=True, exist_ok=True)
+        self.read_dir = read_dir
+        self.default_chunksize = default_chunksize
+
+    def timing_decorator(func):
+        def wrapper(*args, **kwargs):
+            start_time = time()
+            result = func(*args, **kwargs)
+            print(f"Function {func.__name__} took {time() - start_time} seconds to run.")
+            return result
+        return wrapper
+
+    def is_exist(self, file_dir: Path) -> None:
+        """
+            Parameters
+            ----------
+            file_dir : pathlib.Path
+                File path
+
+            Raises
+            ------
+            FileExistException
+                If the file exists
+        """
+        
+        if file_dir.is_file():
+            raise FileExistException(f"file is exist: {file_dir}")
+
+    def is_not_exist(self, file_dir: Path) -> None:
+        """
+            Parameters
+            ----------
+            file_dir : pathlib.Path
+                File path
+
+            Raises
+            ------
+            FileNotExistException
+                If the file does not exist
+        """
+
+        if not file_dir.is_file():
+            raise FileNotExistException(f"file is not exist: {file_dir}")
+
+    def df2csv(
+            self,
+            df: pd.DataFrame,
+            name: str,
+            overwrite: bool=False,
+            index: bool=False
+        ) -> None:
+        """
+            Parameters
+            ----------
+            df : DataFrame 
+                DataFrame object you want to save
+            name : str
+                File name under which the DataFrame object is saved (inside save_dir)
+            overwrite : bool, optional
+                When True, overwrite if file exists (default is False)
+            index : bool, optional
+                Whether to write the index column (default is False)
+
+            Raises
+            ------
+            FileExistException
+                If the overwrite parameter is false and the file exists
+        """
+
+        if not overwrite:
+            self.is_exist(self.save_dir / name)
+        df.to_csv(self.save_dir / name, index=index)
+
+    @timing_decorator
+    def filter_tconst(
+            self,
+            name: str,
+            title_types: list[str]=['movie', 'tvMovie'],
+            chunksize: int=None
+        ) -> list[str]:
+        """
+            Parameters
+            ----------
+            name : str
+                Name of the basics file to be read
+            title_types : list, optional
+                'titleType' values of the rows to keep (default is ['movie', 'tvMovie'])
+            chunksize : int
+                Chunk size for reading data (default is self.default_chunksize (default is 3_000_000)).
+
+            Returns
+            -------
+            list
+                A list of tconst
+
+            Raises
+            ------
+            FileNotExistException
+                If the file does not exist
+        """
+
+        self.is_not_exist(self.read_dir / name)
+        if chunksize is None:
+            chunksize = self.default_chunksize
+
+        tconst_list = []
+
+        with pd.read_csv(
+                    self.read_dir / name,
+                    sep=r'\t',
+                    chunksize=chunksize,
+                    engine='python',
+                    usecols=['tconst', 'titleType'],
+                    dtype={'tconst': str, 'titleType': str},
+                    na_values='\\N') as reader:
+
+            for i, r in enumerate(reader):
+                tconst_list += list(r[r.titleType.isin(title_types)]['tconst'])
+        return tconst_list
+
+    def get_tconst(self, name: str) -> list[str]:
+        """
+            Parameters
+            ----------
+            name : str
+                Name of the tconst file to be read (looked up in save_dir)
+
+            Returns
+            -------
+            list
+                A list of tconst
+
+            Raises
+            ------
+            FileNotExistException
+                If the file does not exist
+        """
+        
+        self.is_not_exist(self.save_dir / name)
+        return list(pd.read_csv(self.save_dir / name, usecols=['tconst'], dtype={'tconst': str})['tconst'])
+
+    @timing_decorator
+    def filter_principal(
+            self,
+            name: str,
+            tconst_list: list[str],
+            category_list: list[str]=['actress', 'actor', 'director', 'writer'],
+            chunksize: int=None
+        ) -> pd.DataFrame:
+        """
+            Parameters
+            ----------
+            name : str
+                Name of the principals file to be read
+            tconst_list : list
+                List of tconst (It can be obtained by the filter_tconst or get_tconst method).
+            category_list : list, optional
+                List of categories of rows to be selected (default is ['actress', 'actor', 'director', 'writer']).
+            chunksize : int
+                Chunk size for reading data (default is self.default_chunksize (default is 3_000_000)).
+
+            Returns
+            -------
+            DataFrame
+                A DataFrame object with columns tconst, nconst, and category.
+
+            Raises
+            ------
+            FileNotExistException
+                If the file does not exist
+        """
+
+        self.is_not_exist(self.read_dir / name)
+        if chunksize is None:
+            chunksize = self.default_chunksize
+
+        df = pd.DataFrame({
+                'tconst': tconst_list,
+                'nconst': np.empty((len(tconst_list), 0)).tolist(),
+                'category': np.empty((len(tconst_list), 0)).tolist()
+            })
+
+        # index = pd.Index(tconst_list, name='tconst')
+        # df = pd.DataFrame({
+        #     'nconst': pd.Series(dtype='object', index=index),
+        #     'category': pd.Series(dtype='object', index=index)
+        # })
+
+        cnt = 0
+
+        with pd.read_csv(self.read_dir / name,
+                        sep=r'\t',
+                        chunksize=chunksize,
+                        engine='python',
+                        usecols=['tconst', 'nconst', 'category']) as reader:
+
+            for i, r in enumerate(reader):
+                r = r.query(f"(tconst in @tconst_list) and (category in @category_list)")
+                r_group = r.groupby('tconst', as_index=0).agg({'nconst': lambda x: list(x), 'category': lambda x: list(x)})
+                df = pd.concat([df, r_group]).groupby('tconst', as_index=0).agg(sum)
+
+                # r_group.index.name = 'tconst'
+                # df.update(r_group)
+                del r_group
+
+        print(cnt)
+        return df
+
+    @timing_decorator
+    def filter_rating(
+            self,
+            name: str,
+            tconst_list: list[str],
+            chunksize: int=None
+        ) -> pd.DataFrame:
+        """
+            Parameters
+            ----------
+            name : str
+                Name of the ratings file to be read
+            tconst_list : list
+                List of tconst (It can be obtained by the filter_tconst or get_tconst method).
+            chunksize : int
+                Chunk size for reading data (default is self.default_chunksize (default is 3_000_000)).
+
+            Returns
+            -------
+            DataFrame
+                A DataFrame object with columns tconst, averageRating and numVotes.
+
+            Raises
+            ------
+            FileNotExistException
+                If the file does not exist
+        """
+
+        self.is_not_exist(self.read_dir / name)
+        if chunksize is None:
+            chunksize = self.default_chunksize
+
+        df = pd.DataFrame({'tconst': tconst_list})
+
+        with pd.read_csv(
+                self.read_dir / name,
+                sep=r'\t',
+                chunksize=chunksize,
+                engine='python',
+                usecols=['tconst', 'averageRating', 'numVotes'],
+                dtype={'tconst': str, 'averageRating': np.float16, 'numVotes': int},
+                na_values='\\N') as reader:
+
+            for i, r in enumerate(reader):
+                df = pd.concat([df, r.query("tconst in @tconst_list")]).groupby('tconst', as_index=0).first()
+        return df
+
+    @timing_decorator
+    def filter_basic(
+            self,
+            name: str,
+            tconst_list: list[str],
+            chunksize: int=None
+        ) -> pd.DataFrame:
+        """
+            Parameters
+            ----------
+            name : str
+                Name of the basics file to be read
+            tconst_list : list
+                List of tconst (It can be obtained by the filter_tconst or get_tconst method).
+            chunksize : int
+                Chunk size for reading data (default is self.default_chunksize (default is 3_000_000)).
+
+            Returns
+            -------
+            DataFrame
+                A DataFrame object with columns tconst, startYear and genres.
+
+            Raises
+            ------
+            FileNotExistException
+                If the file does not exist
+        """
+
+        self.is_not_exist(self.read_dir / name)
+        if chunksize is None:
+            chunksize = self.default_chunksize
+
+        df = pd.DataFrame({'tconst': tconst_list})
+
+        with pd.read_csv(self.read_dir / name,
+                        sep=r'\t',
+                        chunksize=chunksize,
+                        engine='python',
+                        usecols=['tconst', 'startYear', 'genres'],
+                        dtype={'tconst': str, 'startYear': 'Int16', 'genres': str},
+                        na_values='\\N') as reader:
+
+            for i, r in enumerate(reader):
+                df = pd.concat([df, r.query("tconst in @tconst_list")]).groupby('tconst', as_index=0).first()
+        return df
\ No newline at end of file
diff --git a/exceptions.py b/exceptions.py
new file mode 100644
index 0000000..654dd5c
--- /dev/null
+++ b/exceptions.py
@@ -0,0 +1,3 @@
+class FileExistException(Exception): ...
+
+class FileNotExistException(Exception): ...
\ No newline at end of file
diff --git a/imdb_recommendation.ipynb b/imdb_recommendation.ipynb
new file mode 100755
index 0000000..b1c7872
--- /dev/null
+++ b/imdb_recommendation.ipynb
@@ -0,0 +1,517 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from sklearn.feature_extraction.text import CountVectorizer\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "from ast import literal_eval\n",
+    "from functools import reduce"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "input_film = 'tt0816692'\n",
+    "\n",
+    "trained = {'basics': {}, 'principals': {}, 'ratings': {}}\n",
+    "\n",
+    "def drop_by_tconst(df, tconst: str, inplace=True) -> pd.DataFrame:\n",
+    "    return df.drop(df[df['tconst'] == tconst].index, inplace=inplace)\n",
+    "\n",
+    "# Basics\n",
+    "\n",
+    "df = pd.read_csv('./IMDB_data_sets/filtered/basics.csv', dtype={'tconst': str, 'startYear': 'Int16', 'genres': str})\n",
+    "df['genres'].fillna('', inplace=True)\n",
+    "\n",
+    "cv = CountVectorizer(dtype=np.int8, token_pattern=\"(?u)[\\w'-]+\")\n",
+    "count_matrix = cv.fit_transform(df['genres'])\n",
+    "\n",
+    "trained['basics']['genres'] = pd.DataFrame(\n",
+    "        {\n",
+    "            'genres': cosine_similarity(count_matrix[df[df['tconst'] == input_film].index[0]], count_matrix)[0],\n",
+    "            'tconst': df['tconst']\n",
+    "        }\n",
+    "    )\n",
+    "\n",
+    "drop_by_tconst(trained['basics']['genres'], input_film)\n",
+    "\n",
+    "trained['basics']['genres'].sort_values(ascending=False, by='genres', inplace=True, ignore_index=True)\n",
+    "trained['basics']['genres'].drop('genres', axis=1, inplace=True)\n",
+    "\n",
+    "\n",
+    "year = int(df[df['tconst'] == input_film].startYear.iloc[0])\n",
+    "\n",
+    "trained['basics']['years'] = pd.DataFrame(\n",
+    "    {\n",
+    "        'years': df['startYear'],\n",
+    "        'tconst': df['tconst']\n",
+    "    }\n",
+    ")\n",
+    "\n",
+    "drop_by_tconst(trained['basics']['years'], input_film)\n",
+    "trained['basics']['years'].sort_values(by='years', key=lambda x: abs(year-x), inplace=True, ignore_index=True)\n",
+    "trained['basics']['years'].drop('years', axis=1, inplace=True)\n",
+    "trained['basics']['years'].reset_index(names='years_index', inplace=True)\n",
+    "\n",
+    "# Principals\n",
+    "\n",
+    "df = pd.read_csv('./IMDB_data_sets/filtered/principals.csv', dtype={'tconst': str, 'nconst': str}, usecols=['tconst', 'nconst'])\n",
+    "df.nconst = df.nconst.apply(lambda n: ','.join(literal_eval(n)))\n",
+    "\n",
+    "cv = CountVectorizer(dtype=np.int8, token_pattern=\"(?u)[\\w'-]+\")\n",
+    "count_matrix = cv.fit_transform(df['nconst'])\n",
+    "\n",
+    "trained['principals']['nconst'] = pd.DataFrame(\n",
+    "        {\n",
+    "            'nconst': cosine_similarity(count_matrix[df[df['tconst'] == input_film].index[0]], count_matrix)[0],\n",
+    "            'tconst': df['tconst']\n",
+    "        }\n",
+    "    )\n",
+    "\n",
+    "drop_by_tconst(trained['principals']['nconst'], input_film)\n",
+    "trained['principals']['nconst'].sort_values(ascending=False, by='nconst', inplace=True, ignore_index=True)\n",
+    "trained['principals']['nconst'].drop('nconst', axis=1, inplace=True)\n",
+    "trained['principals']['nconst'].reset_index(names='nconst_index', inplace=True)\n",
+    "\n",
+    "# Ratings\n",
+    "\n",
+    "df = pd.read_csv('./IMDB_data_sets/filtered/ratings.csv', dtype={'tconst': str, 'averageRating': float, 'numVotes': 'Int64'})\n",
+    "\n",
+    "rating = float(df[df['tconst'] == input_film].averageRating.iloc[0])\n",
+    "votes = int(df[df['tconst'] == input_film].numVotes.iloc[0])\n",
+    "\n",
+    "drop_by_tconst(df, input_film)\n",
+    "\n",
+    "trained['ratings']['ratings'] = df.sort_values(by='averageRating', key=lambda x: abs(rating-x), ignore_index=True)\n",
+    "trained['ratings']['ratings'].drop(['averageRating', 'numVotes'], axis=1, inplace=True)\n",
+    "trained['ratings']['ratings'].reset_index(names='ratings_index', inplace=True)\n",
+    "\n",
+    "df.drop('averageRating', axis=1, inplace=True)\n",
+    "\n",
+    "trained['ratings']['votes'] = df.sort_values(by='numVotes', key=lambda x: abs(votes-x), ignore_index=True)\n",
+    "trained['ratings']['votes'].drop('numVotes', axis=1, inplace=True)\n",
+    "trained['ratings']['votes'].reset_index(names='votes_index', inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "merged = reduce(lambda  left, right: pd.merge(\n",
+    "                                left,\n",
+    "                                right,\n",
+    "                                on=['tconst'],\n",
+    "                                how='outer'\n",
+    "                            ), [\n",
+    "                                trained['basics']['genres'],\n",
+    "                                trained['basics']['years'],\n",
+    "                                trained['principals']['nconst'],\n",
+    "                                trained['ratings']['ratings'],\n",
+    "                                trained['ratings']['votes']\n",
+    "                            ])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>tconst</th>\n",
+       "      <th>years_index</th>\n",
+       "      <th>nconst_index</th>\n",
+       "      <th>ratings_index</th>\n",
+       "      <th>votes_index</th>\n",
+       "      <th>average</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>tt4255564</td>\n",
+       "      <td>744690</td>\n",
+       "      <td>297616</td>\n",
+       "      <td>669670</td>\n",
+       "      <td>669670</td>\n",
+       "      <td>476329.2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>tt2203897</td>\n",
+       "      <td>27293</td>\n",
+       "      <td>9705</td>\n",
+       "      <td>602978</td>\n",
+       "      <td>602978</td>\n",
+       "      <td>248591.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>tt0355627</td>\n",
+       "      <td>344502</td>\n",
+       "      <td>708640</td>\n",
+       "      <td>318038</td>\n",
+       "      <td>205177</td>\n",
+       "      <td>315271.8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>tt15387378</td>\n",
+       "      <td>710214</td>\n",
+       "      <td>98486</td>\n",
+       "      <td>540358</td>\n",
+       "      <td>540358</td>\n",
+       "      <td>377883.8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>tt5155340</td>\n",
+       "      <td>72975</td>\n",
+       "      <td>386406</td>\n",
+       "      <td>103102</td>\n",
+       "      <td>152733</td>\n",
+       "      <td>143044.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>777383</th>\n",
+       "      <td>tt1230211</td>\n",
+       "      <td>189871</td>\n",
+       "      <td>599826</td>\n",
+       "      <td>278434</td>\n",
+       "      <td>25539</td>\n",
+       "      <td>374210.6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>777384</th>\n",
+       "      <td>tt12302076</td>\n",
+       "      <td>149946</td>\n",
+       "      <td>599825</td>\n",
+       "      <td>483066</td>\n",
+       "      <td>483066</td>\n",
+       "      <td>498657.4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>777385</th>\n",
+       "      <td>tt1230206</td>\n",
+       "      <td>189885</td>\n",
+       "      <td>599823</td>\n",
+       "      <td>301969</td>\n",
+       "      <td>25847</td>\n",
+       "      <td>378981.8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>777386</th>\n",
+       "      <td>tt1230179</td>\n",
+       "      <td>255769</td>\n",
+       "      <td>599809</td>\n",
+       "      <td>483065</td>\n",
+       "      <td>483065</td>\n",
+       "      <td>519818.8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>777387</th>\n",
+       "      <td>tt9916754</td>\n",
+       "      <td>39373</td>\n",
+       "      <td>777387</td>\n",
+       "      <td>777387</td>\n",
+       "      <td>777387</td>\n",
+       "      <td>629784.2</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>777388 rows × 6 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            tconst  years_index  nconst_index  ratings_index  votes_index   \n",
+       "0        tt4255564       744690        297616         669670       669670  \\\n",
+       "1        tt2203897        27293          9705         602978       602978   \n",
+       "2        tt0355627       344502        708640         318038       205177   \n",
+       "3       tt15387378       710214         98486         540358       540358   \n",
+       "4        tt5155340        72975        386406         103102       152733   \n",
+       "...            ...          ...           ...            ...          ...   \n",
+       "777383   tt1230211       189871        599826         278434        25539   \n",
+       "777384  tt12302076       149946        599825         483066       483066   \n",
+       "777385   tt1230206       189885        599823         301969        25847   \n",
+       "777386   tt1230179       255769        599809         483065       483065   \n",
+       "777387   tt9916754        39373        777387         777387       777387   \n",
+       "\n",
+       "         average  \n",
+       "0       476329.2  \n",
+       "1       248591.0  \n",
+       "2       315271.8  \n",
+       "3       377883.8  \n",
+       "4       143044.0  \n",
+       "...          ...  \n",
+       "777383  374210.6  \n",
+       "777384  498657.4  \n",
+       "777385  378981.8  \n",
+       "777386  519818.8  \n",
+       "777387  629784.2  \n",
+       "\n",
+       "[777388 rows x 6 columns]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "merged['average'] = (merged.index*20 + merged.years_index*20 + merged.nconst_index*20 + merged.ratings_index*20 + merged.votes_index*20) / (5*20)\n",
+    "merged"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>tconst</th>\n",
+       "      <th>years_index</th>\n",
+       "      <th>nconst_index</th>\n",
+       "      <th>ratings_index</th>\n",
+       "      <th>votes_index</th>\n",
+       "      <th>average</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>8695</th>\n",
+       "      <td>tt2338151</td>\n",
+       "      <td>7775</td>\n",
+       "      <td>12586</td>\n",
+       "      <td>23860</td>\n",
+       "      <td>1151</td>\n",
+       "      <td>10813.4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>14</th>\n",
+       "      <td>tt3659388</td>\n",
+       "      <td>49654</td>\n",
+       "      <td>98</td>\n",
+       "      <td>25758</td>\n",
+       "      <td>79</td>\n",
+       "      <td>15120.6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8501</th>\n",
+       "      <td>tt1754656</td>\n",
+       "      <td>30993</td>\n",
+       "      <td>116</td>\n",
+       "      <td>46193</td>\n",
+       "      <td>3247</td>\n",
+       "      <td>17810.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7374</th>\n",
+       "      <td>tt2103281</td>\n",
+       "      <td>11453</td>\n",
+       "      <td>27910</td>\n",
+       "      <td>49618</td>\n",
+       "      <td>347</td>\n",
+       "      <td>19340.4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7549</th>\n",
+       "      <td>tt2358592</td>\n",
+       "      <td>54985</td>\n",
+       "      <td>17633</td>\n",
+       "      <td>12668</td>\n",
+       "      <td>9182</td>\n",
+       "      <td>20403.4</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>758545</th>\n",
+       "      <td>tt13334656</td>\n",
+       "      <td>700841</td>\n",
+       "      <td>672174</td>\n",
+       "      <td>500543</td>\n",
+       "      <td>500543</td>\n",
+       "      <td>626529.2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>758577</th>\n",
+       "      <td>tt13336544</td>\n",
+       "      <td>700845</td>\n",
+       "      <td>672184</td>\n",
+       "      <td>500576</td>\n",
+       "      <td>500576</td>\n",
+       "      <td>626551.6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>758587</th>\n",
+       "      <td>tt13335546</td>\n",
+       "      <td>700843</td>\n",
+       "      <td>672231</td>\n",
+       "      <td>500564</td>\n",
+       "      <td>500564</td>\n",
+       "      <td>626557.8</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>758590</th>\n",
+       "      <td>tt13335152</td>\n",
+       "      <td>700842</td>\n",
+       "      <td>672247</td>\n",
+       "      <td>500557</td>\n",
+       "      <td>500557</td>\n",
+       "      <td>626558.6</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>777387</th>\n",
+       "      <td>tt9916754</td>\n",
+       "      <td>39373</td>\n",
+       "      <td>777387</td>\n",
+       "      <td>777387</td>\n",
+       "      <td>777387</td>\n",
+       "      <td>629784.2</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>777388 rows × 6 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "            tconst  years_index  nconst_index  ratings_index  votes_index   \n",
+       "8695     tt2338151         7775         12586          23860         1151  \\\n",
+       "14       tt3659388        49654            98          25758           79   \n",
+       "8501     tt1754656        30993           116          46193         3247   \n",
+       "7374     tt2103281        11453         27910          49618          347   \n",
+       "7549     tt2358592        54985         17633          12668         9182   \n",
+       "...            ...          ...           ...            ...          ...   \n",
+       "758545  tt13334656       700841        672174         500543       500543   \n",
+       "758577  tt13336544       700845        672184         500576       500576   \n",
+       "758587  tt13335546       700843        672231         500564       500564   \n",
+       "758590  tt13335152       700842        672247         500557       500557   \n",
+       "777387   tt9916754        39373        777387         777387       777387   \n",
+       "\n",
+       "         average  \n",
+       "8695     10813.4  \n",
+       "14       15120.6  \n",
+       "8501     17810.0  \n",
+       "7374     19340.4  \n",
+       "7549     20403.4  \n",
+       "...          ...  \n",
+       "758545  626529.2  \n",
+       "758577  626551.6  \n",
+       "758587  626557.8  \n",
+       "758590  626558.6  \n",
+       "777387  629784.2  \n",
+       "\n",
+       "[777388 rows x 6 columns]"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "merged.sort_values(by='average')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "08dff0a1cb2e37beec5bc340112a669cde11fa0a1a1e2fde92884d26090bd6fc"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..e042b40
--- /dev/null
+++ b/main.py
@@ -0,0 +1,23 @@
+from IMDB_DTO import DTO
+from time import time
+import pandas as pd
+
+
+if __name__ == '__main__':
+    start_time = time()
+    dto = DTO()
+
+    tconst_list = dto.filter_tconst(name='title.basics.tsv')
+    dto.df2csv(df=pd.DataFrame({'tconst': tconst_list}), name='tconst.csv')
+
+    tconst_list = dto.get_tconst('tconst.csv')
+
+    df = dto.filter_basic(name='title.basics.tsv', tconst_list=tconst_list)
+    dto.df2csv(df=df, name='basics.csv')
+    del df
+    df = dto.filter_principal(name='title.principals.tsv', tconst_list=tconst_list)
+    dto.df2csv(df=df, name='principals_comma.csv', overwrite=True)
+    del df
+    df = dto.filter_rating(name='title.ratings.tsv', tconst_list=tconst_list)
+    dto.df2csv(df=df, name='ratings.csv', overwrite=1)
+    del df
\ No newline at end of file