# %%
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval
from functools import reduce

# %%
# Build one ranking of candidate films per feature (genres, release year,
# principals, average rating, vote count). Every ranking is ordered from the
# film most similar to `input_film` down to the least similar one, and
# `input_film` itself is removed from each of them.

input_film = 'tt0816692'

# Tokens are runs of word characters, apostrophes and hyphens. Written as a raw
# string because '\w' is not a valid escape sequence in a normal string literal.
TOKEN_PATTERN = r"(?u)[\w'-]+"

trained = {'basics': {}, 'principals': {}, 'ratings': {}}


def drop_by_tconst(df, tconst: str, inplace=True) -> 'pd.DataFrame | None':
    """Remove every row of ``df`` whose ``tconst`` column equals ``tconst``.

    Args:
        df: Frame with a ``tconst`` column.
        tconst: Identifier of the film to remove.
        inplace: When true (the default) ``df`` is modified and ``None`` is
            returned; when false ``df`` is left untouched and the filtered
            copy is returned.

    Returns:
        The filtered frame, or ``None`` when ``inplace`` is true.
    """
    return df.drop(df[df['tconst'] == tconst].index, inplace=inplace)


def _film_label(df, tconst: str, source: str):
    """Return the index label of the first row of ``df`` matching ``tconst``.

    Raises:
        IndexError: If ``tconst`` does not occur in ``df``. The type matches
            what the bare ``.index[0]`` lookup raised, with a usable message.
    """
    labels = df[df['tconst'] == tconst].index
    if len(labels) == 0:
        raise IndexError(f'{tconst!r} not found in {source}')
    return labels[0]


def _rank(scores, by, drop_cols, rank_name=None, **sort_kwargs):
    """Turn a score frame into a ranking.

    Sorts ``scores`` by ``by`` (extra keyword arguments go to ``sort_values``),
    renumbers the rows from 0 and removes ``drop_cols``. When ``rank_name`` is
    given, the new 0-based position is exposed as a column of that name.
    ``scores`` itself is not modified.
    """
    ranked = scores.sort_values(by=by, ignore_index=True, **sort_kwargs).drop(columns=drop_cols)
    if rank_name is None:
        return ranked
    return ranked.reset_index(names=rank_name)


# Basics

df = pd.read_csv('./IMDB_data_sets/filtered/basics.csv', dtype={'tconst': str, 'startYear': 'Int16', 'genres': str})
# Assign the result instead of calling fillna(inplace=True) on the column
# selection: that chained form stops updating `df` under pandas Copy-on-Write.
df['genres'] = df['genres'].fillna('')

# read_csv leaves the default 0..n-1 index, so the label is also the row
# position inside `count_matrix`.
film_label = _film_label(df, input_film, 'basics.csv')

cv = CountVectorizer(dtype=np.int8, token_pattern=TOKEN_PATTERN)
count_matrix = cv.fit_transform(df['genres'])

genre_scores = pd.DataFrame(
    {
        'genres': cosine_similarity(count_matrix[film_label], count_matrix)[0],
        'tconst': df['tconst']
    }
)

# NOTE(review): unlike the other rankings this one gets no '<feature>_index'
# column, so the genre rank is carried only by row order. Left unchanged to
# keep the frame's columns as they were.
trained['basics']['genres'] = _rank(
    drop_by_tconst(genre_scores, input_film, inplace=False),
    by='genres',
    drop_cols=['genres'],
    ascending=False,
)

year = int(df.loc[film_label, 'startYear'])

year_values = pd.DataFrame(
    {
        'years': df['startYear'],
        'tconst': df['tconst']
    }
)

# Smallest absolute distance to the input film's year comes first.
trained['basics']['years'] = _rank(
    drop_by_tconst(year_values, input_film, inplace=False),
    by='years',
    drop_cols=['years'],
    rank_name='years_index',
    key=lambda col: abs(year - col),
)

# Principals

df = pd.read_csv('./IMDB_data_sets/filtered/principals.csv', dtype={'tconst': str, 'nconst': str}, usecols=['tconst', 'nconst'])
# Each cell is parsed as a Python literal and its items are re-joined with
# commas so the vectorizer sees one token per item.
df['nconst'] = df['nconst'].apply(lambda raw: ','.join(literal_eval(raw)))

film_label = _film_label(df, input_film, 'principals.csv')

cv = CountVectorizer(dtype=np.int8, token_pattern=TOKEN_PATTERN)
count_matrix = cv.fit_transform(df['nconst'])

principal_scores = pd.DataFrame(
    {
        'nconst': cosine_similarity(count_matrix[film_label], count_matrix)[0],
        'tconst': df['tconst']
    }
)

trained['principals']['nconst'] = _rank(
    drop_by_tconst(principal_scores, input_film, inplace=False),
    by='nconst',
    drop_cols=['nconst'],
    rank_name='nconst_index',
    ascending=False,
)

# Ratings

df = pd.read_csv('./IMDB_data_sets/filtered/ratings.csv', dtype={'tconst': str, 'averageRating': float, 'numVotes': 'Int64'})

film_label = _film_label(df, input_film, 'ratings.csv')
rating = float(df.loc[film_label, 'averageRating'])
votes = int(df.loc[film_label, 'numVotes'])

df = drop_by_tconst(df, input_film, inplace=False)

# Closest average rating first.
trained['ratings']['ratings'] = _rank(
    df,
    by='averageRating',
    drop_cols=['averageRating', 'numVotes'],
    rank_name='ratings_index',
    key=lambda col: abs(rating - col),
)

# `df` ends the cell without 'averageRating', exactly as it did before.
df = df.drop(columns='averageRating')

# Closest vote count first.
trained['ratings']['votes'] = _rank(
    df,
    by='numVotes',
    drop_cols=['numVotes'],
    rank_name='votes_index',
    key=lambda col: abs(votes - col),
)

# %%
# Combine the five rankings into one frame keyed by 'tconst'.


def _merge_on_tconst(left, right):
    """Full outer join of two ranking frames on their ``tconst`` column."""
    return pd.merge(left, right, on=['tconst'], how='outer')


# The genre ranking has no rank column of its own; its information is the row
# order. pandas documents that how='outer' sorts the join keys
# lexicographically, so that order cannot be trusted to survive the merges.
# Carry it through as an explicit temporary column, restore it afterwards and
# drop the column again so `merged` keeps the same columns as before.
genres_ranked = trained['basics']['genres'].reset_index(names='genres_index')

merged = reduce(_merge_on_tconst, [
    genres_ranked,
    trained['basics']['years'],
    trained['principals']['nconst'],
    trained['ratings']['ratings'],
    trained['ratings']['votes']
])

# Rows without a genre rank (films present only in the other datasets) go last;
# the stable sort keeps them in the order the merge produced.
merged = (
    merged
    .sort_values(by='genres_index', kind='stable', na_position='last', ignore_index=True)
    .drop(columns='genres_index')
)
\n", " | tconst | \n", "years_index | \n", "nconst_index | \n", "ratings_index | \n", "votes_index | \n", "average | \n", "
---|---|---|---|---|---|---|
0 | \n", "tt4255564 | \n", "744690 | \n", "297616 | \n", "669670 | \n", "669670 | \n", "476329.2 | \n", "
1 | \n", "tt2203897 | \n", "27293 | \n", "9705 | \n", "602978 | \n", "602978 | \n", "248591.0 | \n", "
2 | \n", "tt0355627 | \n", "344502 | \n", "708640 | \n", "318038 | \n", "205177 | \n", "315271.8 | \n", "
3 | \n", "tt15387378 | \n", "710214 | \n", "98486 | \n", "540358 | \n", "540358 | \n", "377883.8 | \n", "
4 | \n", "tt5155340 | \n", "72975 | \n", "386406 | \n", "103102 | \n", "152733 | \n", "143044.0 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
777383 | \n", "tt1230211 | \n", "189871 | \n", "599826 | \n", "278434 | \n", "25539 | \n", "374210.6 | \n", "
777384 | \n", "tt12302076 | \n", "149946 | \n", "599825 | \n", "483066 | \n", "483066 | \n", "498657.4 | \n", "
777385 | \n", "tt1230206 | \n", "189885 | \n", "599823 | \n", "301969 | \n", "25847 | \n", "378981.8 | \n", "
777386 | \n", "tt1230179 | \n", "255769 | \n", "599809 | \n", "483065 | \n", "483065 | \n", "519818.8 | \n", "
777387 | \n", "tt9916754 | \n", "39373 | \n", "777387 | \n", "777387 | \n", "777387 | \n", "629784.2 | \n", "
777388 rows × 6 columns
\n", "\n", " | tconst | \n", "years_index | \n", "nconst_index | \n", "ratings_index | \n", "votes_index | \n", "average | \n", "
---|---|---|---|---|---|---|
8695 | \n", "tt2338151 | \n", "7775 | \n", "12586 | \n", "23860 | \n", "1151 | \n", "10813.4 | \n", "
14 | \n", "tt3659388 | \n", "49654 | \n", "98 | \n", "25758 | \n", "79 | \n", "15120.6 | \n", "
8501 | \n", "tt1754656 | \n", "30993 | \n", "116 | \n", "46193 | \n", "3247 | \n", "17810.0 | \n", "
7374 | \n", "tt2103281 | \n", "11453 | \n", "27910 | \n", "49618 | \n", "347 | \n", "19340.4 | \n", "
7549 | \n", "tt2358592 | \n", "54985 | \n", "17633 | \n", "12668 | \n", "9182 | \n", "20403.4 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
758545 | \n", "tt13334656 | \n", "700841 | \n", "672174 | \n", "500543 | \n", "500543 | \n", "626529.2 | \n", "
758577 | \n", "tt13336544 | \n", "700845 | \n", "672184 | \n", "500576 | \n", "500576 | \n", "626551.6 | \n", "
758587 | \n", "tt13335546 | \n", "700843 | \n", "672231 | \n", "500564 | \n", "500564 | \n", "626557.8 | \n", "
758590 | \n", "tt13335152 | \n", "700842 | \n", "672247 | \n", "500557 | \n", "500557 | \n", "626558.6 | \n", "
777387 | \n", "tt9916754 | \n", "39373 | \n", "777387 | \n", "777387 | \n", "777387 | \n", "629784.2 | \n", "
777388 rows × 6 columns
\n", "