First commit

This commit is contained in:
Aykhan 2023-08-06 22:48:47 +04:00
commit b60673ad25
5 changed files with 879 additions and 0 deletions

4
.gitignore vendored Normal file

@@ -0,0 +1,4 @@
IMDB_data_sets
.venv
.ipynb_checkpoints
__pycache__

332
IMDB_DTO.py Executable file

@@ -0,0 +1,332 @@
from pathlib import Path
import pandas as pd
import numpy as np
from time import time
from exceptions import (
    FileExistException,
    FileNotExistException
)

BASE_DIR = Path(__file__).resolve().parent

class DTO:
    def __init__(
        self,
        save_dir=(BASE_DIR / 'IMDB_data_sets/filtered/'),
        read_dir=(BASE_DIR / 'IMDB_data_sets/'),
        default_chunksize: int = 3_000_000
    ) -> None:
        """
        Parameters
        ----------
        save_dir : pathlib.Path, optional
            Folder location to save files (default is BASE_DIR / 'IMDB_data_sets/filtered/')
        read_dir : pathlib.Path, optional
            Folder location to read files from (default is BASE_DIR / 'IMDB_data_sets/')
        default_chunksize : int, optional
            Value used when chunksize is not passed to methods that take a
            chunksize parameter (default is 3_000_000)
        """
        self.save_dir = save_dir
        self.save_dir.mkdir(parents=True, exist_ok=True)
        self.read_dir = read_dir
        self.default_chunksize = default_chunksize

    def timing_decorator(func):
        def wrapper(*args, **kwargs):
            start_time = time()
            result = func(*args, **kwargs)
            print(f"Function {func.__name__} took {time() - start_time} seconds to run.")
            return result
        return wrapper

    def is_exist(self, file_dir: Path) -> None:
        """
        Parameters
        ----------
        file_dir : pathlib.Path
            File path

        Raises
        ------
        FileExistException
            If the file exists
        """
        if file_dir.is_file():
            raise FileExistException(f"file already exists: {file_dir}")

    def is_not_exist(self, file_dir: Path) -> None:
        """
        Parameters
        ----------
        file_dir : pathlib.Path
            File path

        Raises
        ------
        FileNotExistException
            If the file does not exist
        """
        if not file_dir.is_file():
            raise FileNotExistException(f"file does not exist: {file_dir}")

    def df2csv(
        self,
        df: pd.DataFrame,
        name: str,
        overwrite: bool = False,
        index: bool = False
    ) -> None:
        """
        Parameters
        ----------
        df : DataFrame
            DataFrame object you want to save
        name : str
            The file name to save the DataFrame object under
        overwrite : bool, optional
            When True, overwrite the file if it exists (default is False)
        index : bool, optional
            Whether to save the index column (default is False)

        Raises
        ------
        FileExistException
            If the overwrite parameter is False and the file exists
        """
        if not overwrite:
            self.is_exist(self.save_dir / name)
        df.to_csv(self.save_dir / name, index=index)

    @timing_decorator
    def filter_tconst(
        self,
        name: str,
        title_types: list[str] = ['movie', 'tvMovie'],
        chunksize: int = None
    ) -> list[str]:
        """
        Parameters
        ----------
        name : str
            Name of the basics file to be read
        title_types : list, optional
            'titleType' values of the rows to be kept (default is ['movie', 'tvMovie'])
        chunksize : int, optional
            Chunk size for reading data (default is self.default_chunksize, which defaults to 3_000_000)

        Returns
        -------
        list
            A list of tconst

        Raises
        ------
        FileNotExistException
            If the file does not exist
        """
        self.is_not_exist(self.read_dir / name)
        if chunksize is None:
            chunksize = self.default_chunksize
        tconst_list = []
        with pd.read_csv(
                self.read_dir / name,
                sep=r'\t',
                chunksize=chunksize,
                engine='python',
                usecols=['tconst', 'titleType'],
                dtype={'tconst': str, 'titleType': str},
                na_values='\\N') as reader:
            for i, r in enumerate(reader):
                tconst_list += list(r[r.titleType.isin(title_types)]['tconst'])
        return tconst_list

    def get_tconst(self, name: str) -> list[str]:
        """
        Parameters
        ----------
        name : str
            Name of the tconst file to be read

        Returns
        -------
        list
            A list of tconst

        Raises
        ------
        FileNotExistException
            If the file does not exist
        """
        self.is_not_exist(self.save_dir / name)
        return list(pd.read_csv(self.save_dir / name, usecols=['tconst'], dtype={'tconst': str})['tconst'])

    @timing_decorator
    def filter_principal(
        self,
        name: str,
        tconst_list: list[str],
        category_list: list[str] = ['actress', 'actor', 'director', 'writer'],
        chunksize: int = None
    ) -> pd.DataFrame:
        """
        Parameters
        ----------
        name : str
            Name of the principals file to be read
        tconst_list : list
            List of tconst (can be obtained with the filter_tconst or get_tconst method)
        category_list : list, optional
            Categories of the rows to be selected (default is ['actress', 'actor', 'director', 'writer'])
        chunksize : int, optional
            Chunk size for reading data (default is self.default_chunksize, which defaults to 3_000_000)

        Returns
        -------
        DataFrame
            A DataFrame object with columns tconst, nconst, and category

        Raises
        ------
        FileNotExistException
            If the file does not exist
        """
        self.is_not_exist(self.read_dir / name)
        if chunksize is None:
            chunksize = self.default_chunksize
        df = pd.DataFrame({
            'tconst': tconst_list,
            'nconst': np.empty((len(tconst_list), 0)).tolist(),
            'category': np.empty((len(tconst_list), 0)).tolist()
        })
        # index = pd.Index(tconst_list, name='tconst')
        # df = pd.DataFrame({
        #     'nconst': pd.Series(dtype='object', index=index),
        #     'category': pd.Series(dtype='object', index=index)
        # })
        with pd.read_csv(self.read_dir / name,
                         sep=r'\t',
                         chunksize=chunksize,
                         engine='python',
                         usecols=['tconst', 'nconst', 'category']) as reader:
            for i, r in enumerate(reader):
                r = r.query("(tconst in @tconst_list) and (category in @category_list)")
                r_group = r.groupby('tconst', as_index=False).agg({'nconst': lambda x: list(x), 'category': lambda x: list(x)})
                df = pd.concat([df, r_group]).groupby('tconst', as_index=False).agg(sum)
                # r_group.index.name = 'tconst'
                # df.update(r_group)
                del r_group
                print(f"processed chunk {i}")
        return df

    @timing_decorator
    def filter_rating(
        self,
        name: str,
        tconst_list: list[str],
        chunksize: int = None
    ) -> pd.DataFrame:
        """
        Parameters
        ----------
        name : str
            Name of the ratings file to be read
        tconst_list : list
            List of tconst (can be obtained with the filter_tconst or get_tconst method)
        chunksize : int, optional
            Chunk size for reading data (default is self.default_chunksize, which defaults to 3_000_000)

        Returns
        -------
        DataFrame
            A DataFrame object with columns tconst, averageRating, and numVotes

        Raises
        ------
        FileNotExistException
            If the file does not exist
        """
        self.is_not_exist(self.read_dir / name)
        if chunksize is None:
            chunksize = self.default_chunksize
        df = pd.DataFrame({'tconst': tconst_list})
        with pd.read_csv(
                self.read_dir / name,
                sep=r'\t',
                chunksize=chunksize,
                engine='python',
                usecols=['tconst', 'averageRating', 'numVotes'],
                dtype={'tconst': str, 'averageRating': np.float16, 'numVotes': int},
                na_values='\\N') as reader:
            for i, r in enumerate(reader):
                df = pd.concat([df, r.query("tconst in @tconst_list")]).groupby('tconst', as_index=False).first()
        return df

    @timing_decorator
    def filter_basic(
        self,
        name: str,
        tconst_list: list[str],
        chunksize: int = None
    ) -> pd.DataFrame:
        """
        Parameters
        ----------
        name : str
            Name of the basics file to be read
        tconst_list : list
            List of tconst (can be obtained with the filter_tconst or get_tconst method)
        chunksize : int, optional
            Chunk size for reading data (default is self.default_chunksize, which defaults to 3_000_000)

        Returns
        -------
        DataFrame
            A DataFrame object with columns tconst, startYear, and genres

        Raises
        ------
        FileNotExistException
            If the file does not exist
        """
        self.is_not_exist(self.read_dir / name)
        if chunksize is None:
            chunksize = self.default_chunksize
        df = pd.DataFrame({'tconst': tconst_list})
        with pd.read_csv(self.read_dir / name,
                         sep=r'\t',
                         chunksize=chunksize,
                         engine='python',
                         usecols=['tconst', 'startYear', 'genres'],
                         dtype={'tconst': str, 'startYear': 'Int16', 'genres': str},
                         na_values='\\N') as reader:
            for i, r in enumerate(reader):
                df = pd.concat([df, r.query("tconst in @tconst_list")]).groupby('tconst', as_index=False).first()
        return df
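
Each filter_* method above uses the same chunked-read pattern: stream the raw IMDB TSV dump with pandas in fixed-size chunks, keep only the matching rows, and accumulate the result so the full file never has to be loaded at once. A minimal standalone sketch of that pattern, independent of the class (the path and chunk size below are illustrative, and the class itself uses engine='python' with a regex separator):

from pathlib import Path

import pandas as pd

# Sketch of the chunked filtering used by filter_tconst: stream the TSV in
# chunks and collect the tconst of every 'movie' / 'tvMovie' row.
# 'title.basics.tsv' and the 1_000_000 chunk size are example values.
basics_path = Path('IMDB_data_sets') / 'title.basics.tsv'

tconst_list = []
with pd.read_csv(
        basics_path,
        sep='\t',
        chunksize=1_000_000,
        usecols=['tconst', 'titleType'],
        dtype={'tconst': str, 'titleType': str},
        na_values='\\N') as reader:
    for chunk in reader:
        # keep only the rows whose titleType is in the wanted set
        tconst_list += list(chunk[chunk.titleType.isin(['movie', 'tvMovie'])]['tconst'])

print(f"kept {len(tconst_list)} titles")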

3
exceptions.py Normal file

@@ -0,0 +1,3 @@
class FileExistException(Exception): ...

class FileNotExistException(Exception): ...

517
imdb_recommendation.ipynb Executable file

@@ -0,0 +1,517 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"from ast import literal_eval\n",
"from functools import reduce"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"input_film = 'tt0816692'\n",
"\n",
"trained = {'basics': {}, 'principals': {}, 'ratings': {}}\n",
"\n",
"def drop_by_tconst(df, tconst: str, inplace=True) -> pd.DataFrame:\n",
" return df.drop(df[df['tconst'] == tconst].index, inplace=inplace)\n",
"\n",
"# Basics\n",
"\n",
"df = pd.read_csv('./IMDB_data_sets/filtered/basics.csv', dtype={'tconst': str, 'startYear': 'Int16', 'genres': str})\n",
"df['genres'].fillna('', inplace=True)\n",
"\n",
"cv = CountVectorizer(dtype=np.int8, token_pattern=\"(?u)[\\w'-]+\")\n",
"count_matrix = cv.fit_transform(df['genres'])\n",
"\n",
"trained['basics']['genres'] = pd.DataFrame(\n",
" {\n",
" 'genres': cosine_similarity(count_matrix[df[df['tconst'] == input_film].index[0]], count_matrix)[0],\n",
" 'tconst': df['tconst']\n",
" }\n",
" )\n",
"\n",
"drop_by_tconst(trained['basics']['genres'], input_film)\n",
"\n",
"trained['basics']['genres'].sort_values(ascending=False, by='genres', inplace=True, ignore_index=True)\n",
"trained['basics']['genres'].drop('genres', axis=1, inplace=True)\n",
"\n",
"\n",
"year = int(df[df['tconst'] == input_film].startYear.iloc[0])\n",
"\n",
"trained['basics']['years'] = pd.DataFrame(\n",
" {\n",
" 'years': df['startYear'],\n",
" 'tconst': df['tconst']\n",
" }\n",
")\n",
"\n",
"drop_by_tconst(trained['basics']['years'], input_film)\n",
"trained['basics']['years'].sort_values(by='years', key=lambda x: abs(year-x), inplace=True, ignore_index=True)\n",
"trained['basics']['years'].drop('years', axis=1, inplace=True)\n",
"trained['basics']['years'].reset_index(names='years_index', inplace=True)\n",
"\n",
"# Principals\n",
"\n",
"df = pd.read_csv('./IMDB_data_sets/filtered/principals.csv', dtype={'tconst': str, 'nconst': str}, usecols=['tconst', 'nconst'])\n",
"df.nconst = df.nconst.apply(lambda n: ','.join(literal_eval(n)))\n",
"\n",
"cv = CountVectorizer(dtype=np.int8, token_pattern=\"(?u)[\\w'-]+\")\n",
"count_matrix = cv.fit_transform(df['nconst'])\n",
"\n",
"trained['principals']['nconst'] = pd.DataFrame(\n",
" {\n",
" 'nconst': cosine_similarity(count_matrix[df[df['tconst'] == input_film].index[0]], count_matrix)[0],\n",
" 'tconst': df['tconst']\n",
" }\n",
" )\n",
"\n",
"drop_by_tconst(trained['principals']['nconst'], input_film)\n",
"trained['principals']['nconst'].sort_values(ascending=False, by='nconst', inplace=True, ignore_index=True)\n",
"trained['principals']['nconst'].drop('nconst', axis=1, inplace=True)\n",
"trained['principals']['nconst'].reset_index(names='nconst_index', inplace=True)\n",
"\n",
"# Ratings\n",
"\n",
"df = pd.read_csv('./IMDB_data_sets/filtered/ratings.csv', dtype={'tconst': str, 'averageRating': float, 'numVotes': 'Int64'})\n",
"\n",
"rating = float(df[df['tconst'] == input_film].averageRating.iloc[0])\n",
"votes = int(df[df['tconst'] == input_film].numVotes.iloc[0])\n",
"\n",
"drop_by_tconst(df, input_film)\n",
"\n",
"trained['ratings']['ratings'] = df.sort_values(by='averageRating', key=lambda x: abs(rating-x), ignore_index=True)\n",
"trained['ratings']['ratings'].drop(['averageRating', 'numVotes'], axis=1, inplace=True)\n",
"trained['ratings']['ratings'].reset_index(names='ratings_index', inplace=True)\n",
"\n",
"df.drop('averageRating', axis=1, inplace=True)\n",
"\n",
"trained['ratings']['votes'] = df.sort_values(by='numVotes', key=lambda x: abs(votes-x), ignore_index=True)\n",
"trained['ratings']['votes'].drop('numVotes', axis=1, inplace=True)\n",
"trained['ratings']['votes'].reset_index(names='votes_index', inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"merged = reduce(lambda left, right: pd.merge(\n",
" left,\n",
" right,\n",
" on=['tconst'],\n",
" how='outer'\n",
" ), [\n",
" trained['basics']['genres'],\n",
" trained['basics']['years'],\n",
" trained['principals']['nconst'],\n",
" trained['ratings']['ratings'],\n",
" trained['ratings']['votes']\n",
" ])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tconst</th>\n",
" <th>years_index</th>\n",
" <th>nconst_index</th>\n",
" <th>ratings_index</th>\n",
" <th>votes_index</th>\n",
" <th>average</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>tt4255564</td>\n",
" <td>744690</td>\n",
" <td>297616</td>\n",
" <td>669670</td>\n",
" <td>669670</td>\n",
" <td>476329.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>tt2203897</td>\n",
" <td>27293</td>\n",
" <td>9705</td>\n",
" <td>602978</td>\n",
" <td>602978</td>\n",
" <td>248591.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>tt0355627</td>\n",
" <td>344502</td>\n",
" <td>708640</td>\n",
" <td>318038</td>\n",
" <td>205177</td>\n",
" <td>315271.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>tt15387378</td>\n",
" <td>710214</td>\n",
" <td>98486</td>\n",
" <td>540358</td>\n",
" <td>540358</td>\n",
" <td>377883.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>tt5155340</td>\n",
" <td>72975</td>\n",
" <td>386406</td>\n",
" <td>103102</td>\n",
" <td>152733</td>\n",
" <td>143044.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>777383</th>\n",
" <td>tt1230211</td>\n",
" <td>189871</td>\n",
" <td>599826</td>\n",
" <td>278434</td>\n",
" <td>25539</td>\n",
" <td>374210.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>777384</th>\n",
" <td>tt12302076</td>\n",
" <td>149946</td>\n",
" <td>599825</td>\n",
" <td>483066</td>\n",
" <td>483066</td>\n",
" <td>498657.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>777385</th>\n",
" <td>tt1230206</td>\n",
" <td>189885</td>\n",
" <td>599823</td>\n",
" <td>301969</td>\n",
" <td>25847</td>\n",
" <td>378981.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>777386</th>\n",
" <td>tt1230179</td>\n",
" <td>255769</td>\n",
" <td>599809</td>\n",
" <td>483065</td>\n",
" <td>483065</td>\n",
" <td>519818.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>777387</th>\n",
" <td>tt9916754</td>\n",
" <td>39373</td>\n",
" <td>777387</td>\n",
" <td>777387</td>\n",
" <td>777387</td>\n",
" <td>629784.2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>777388 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" tconst years_index nconst_index ratings_index votes_index \n",
"0 tt4255564 744690 297616 669670 669670 \\\n",
"1 tt2203897 27293 9705 602978 602978 \n",
"2 tt0355627 344502 708640 318038 205177 \n",
"3 tt15387378 710214 98486 540358 540358 \n",
"4 tt5155340 72975 386406 103102 152733 \n",
"... ... ... ... ... ... \n",
"777383 tt1230211 189871 599826 278434 25539 \n",
"777384 tt12302076 149946 599825 483066 483066 \n",
"777385 tt1230206 189885 599823 301969 25847 \n",
"777386 tt1230179 255769 599809 483065 483065 \n",
"777387 tt9916754 39373 777387 777387 777387 \n",
"\n",
" average \n",
"0 476329.2 \n",
"1 248591.0 \n",
"2 315271.8 \n",
"3 377883.8 \n",
"4 143044.0 \n",
"... ... \n",
"777383 374210.6 \n",
"777384 498657.4 \n",
"777385 378981.8 \n",
"777386 519818.8 \n",
"777387 629784.2 \n",
"\n",
"[777388 rows x 6 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged['average'] = (merged.index*20 + merged.years_index*20 + merged.nconst_index*20 + merged.ratings_index*20 + merged.votes_index*20) / (5*20)\n",
"merged"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>tconst</th>\n",
" <th>years_index</th>\n",
" <th>nconst_index</th>\n",
" <th>ratings_index</th>\n",
" <th>votes_index</th>\n",
" <th>average</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8695</th>\n",
" <td>tt2338151</td>\n",
" <td>7775</td>\n",
" <td>12586</td>\n",
" <td>23860</td>\n",
" <td>1151</td>\n",
" <td>10813.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>tt3659388</td>\n",
" <td>49654</td>\n",
" <td>98</td>\n",
" <td>25758</td>\n",
" <td>79</td>\n",
" <td>15120.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8501</th>\n",
" <td>tt1754656</td>\n",
" <td>30993</td>\n",
" <td>116</td>\n",
" <td>46193</td>\n",
" <td>3247</td>\n",
" <td>17810.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7374</th>\n",
" <td>tt2103281</td>\n",
" <td>11453</td>\n",
" <td>27910</td>\n",
" <td>49618</td>\n",
" <td>347</td>\n",
" <td>19340.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7549</th>\n",
" <td>tt2358592</td>\n",
" <td>54985</td>\n",
" <td>17633</td>\n",
" <td>12668</td>\n",
" <td>9182</td>\n",
" <td>20403.4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>758545</th>\n",
" <td>tt13334656</td>\n",
" <td>700841</td>\n",
" <td>672174</td>\n",
" <td>500543</td>\n",
" <td>500543</td>\n",
" <td>626529.2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>758577</th>\n",
" <td>tt13336544</td>\n",
" <td>700845</td>\n",
" <td>672184</td>\n",
" <td>500576</td>\n",
" <td>500576</td>\n",
" <td>626551.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>758587</th>\n",
" <td>tt13335546</td>\n",
" <td>700843</td>\n",
" <td>672231</td>\n",
" <td>500564</td>\n",
" <td>500564</td>\n",
" <td>626557.8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>758590</th>\n",
" <td>tt13335152</td>\n",
" <td>700842</td>\n",
" <td>672247</td>\n",
" <td>500557</td>\n",
" <td>500557</td>\n",
" <td>626558.6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>777387</th>\n",
" <td>tt9916754</td>\n",
" <td>39373</td>\n",
" <td>777387</td>\n",
" <td>777387</td>\n",
" <td>777387</td>\n",
" <td>629784.2</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>777388 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" tconst years_index nconst_index ratings_index votes_index \n",
"8695 tt2338151 7775 12586 23860 1151 \\\n",
"14 tt3659388 49654 98 25758 79 \n",
"8501 tt1754656 30993 116 46193 3247 \n",
"7374 tt2103281 11453 27910 49618 347 \n",
"7549 tt2358592 54985 17633 12668 9182 \n",
"... ... ... ... ... ... \n",
"758545 tt13334656 700841 672174 500543 500543 \n",
"758577 tt13336544 700845 672184 500576 500576 \n",
"758587 tt13335546 700843 672231 500564 500564 \n",
"758590 tt13335152 700842 672247 500557 500557 \n",
"777387 tt9916754 39373 777387 777387 777387 \n",
"\n",
" average \n",
"8695 10813.4 \n",
"14 15120.6 \n",
"8501 17810.0 \n",
"7374 19340.4 \n",
"7549 20403.4 \n",
"... ... \n",
"758545 626529.2 \n",
"758577 626551.6 \n",
"758587 626557.8 \n",
"758590 626558.6 \n",
"777387 629784.2 \n",
"\n",
"[777388 rows x 6 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged.sort_values(by='average')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "08dff0a1cb2e37beec5bc340112a669cde11fa0a1a1e2fde92884d26090bd6fc"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}
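
The notebook above builds the recommendation by ranking every candidate title separately on genre similarity (cosine similarity over CountVectorizer features), release-year distance, shared principals, rating distance and vote-count distance, merging the five rankings on tconst, and averaging the rank positions; since the *20 weights in the 'average' formula are identical, the score is simply the mean of the five positions, and the smallest average wins. A condensed sketch of that rank-averaging step on toy data (the three small frames below are made up; column names follow the notebook, and reset_index(names=...) needs pandas 1.5 or newer, as used there):

from functools import reduce

import pandas as pd

# Toy per-feature rankings: each frame lists tconst in similarity order for
# one feature (row 0 = most similar to the input film).
genres = pd.DataFrame({'tconst': ['tt1', 'tt2', 'tt3']})
years = pd.DataFrame({'tconst': ['tt2', 'tt1', 'tt3']}).reset_index(names='years_index')
ratings = pd.DataFrame({'tconst': ['tt3', 'tt1', 'tt2']}).reset_index(names='ratings_index')

# Merge the rankings on tconst; the merged index keeps the genres order,
# so it doubles as the genres rank, exactly as in the notebook.
merged = reduce(lambda left, right: pd.merge(left, right, on='tconst', how='outer'),
                [genres, years, ratings])

# Average the rank positions; the smallest average is the best recommendation.
merged['average'] = (merged.index + merged.years_index + merged.ratings_index) / 3
print(merged.sort_values(by='average'))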

23
main.py Normal file

@@ -0,0 +1,23 @@
from IMDB_DTO import DTO
from time import time
import pandas as pd

if __name__ == '__main__':
    start_time = time()
    dto = DTO()

    tconst_list = dto.filter_tconst(name='title.basics.tsv')
    dto.df2csv(df=pd.DataFrame({'tconst': tconst_list}), name='tconst.csv')

    tconst_list = dto.get_tconst('tconst.csv')

    df = dto.filter_basic(name='title.basics.tsv', tconst_list=tconst_list)
    dto.df2csv(df=df, name='basics.csv')
    del df

    df = dto.filter_principal(name='title.principals.tsv', tconst_list=tconst_list)
    dto.df2csv(df=df, name='principals_comma.csv', overwrite=True)
    del df

    df = dto.filter_rating(name='title.ratings.tsv', tconst_list=tconst_list)
    dto.df2csv(df=df, name='ratings.csv', overwrite=True)
    del df
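
For reference, the notebook's first cell reads these outputs back from IMDB_data_sets/filtered/ (note that main.py saves the principals file as 'principals_comma.csv', while the notebook expects 'principals.csv'). A minimal sketch of loading the filtered files with the dtypes the notebook uses:

import pandas as pd

FILTERED = 'IMDB_data_sets/filtered/'

# dtypes mirror the ones used in imdb_recommendation.ipynb
basics = pd.read_csv(FILTERED + 'basics.csv',
                     dtype={'tconst': str, 'startYear': 'Int16', 'genres': str})
ratings = pd.read_csv(FILTERED + 'ratings.csv',
                      dtype={'tconst': str, 'averageRating': float, 'numVotes': 'Int64'})
# main.py writes this file as 'principals_comma.csv'; rename or adjust the path as needed
principals = pd.read_csv(FILTERED + 'principals_comma.csv',
                         dtype={'tconst': str, 'nconst': str}, usecols=['tconst', 'nconst'])

print(basics.shape, ratings.shape, principals.shape)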