In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from ast import literal_eval
from functools import reduce

In [2]:
input_film = 'tt0816692'

trained = {'basics': {}, 'principals': {}, 'ratings': {}}

def drop_by_tconst(df, tconst: str, inplace=True) -> pd.DataFrame:
    return df.drop(df[df['tconst'] == tconst].index, inplace=inplace)

# Basics

df = pd.read_csv('./IMDB_data_sets/filtered/basics.csv', dtype={'tconst': str, 'startYear': 'Int16', 'genres': str})
df['genres'].fillna('', inplace=True)

cv = CountVectorizer(dtype=np.int8, token_pattern="(?u)[\w'-]+")
count_matrix = cv.fit_transform(df['genres'])

trained['basics']['genres'] = pd.DataFrame(
        {
            'genres': cosine_similarity(count_matrix[df[df['tconst'] == input_film].index[0]], count_matrix)[0],
            'tconst': df['tconst']
        }
    )

drop_by_tconst(trained['basics']['genres'], input_film)

trained['basics']['genres'].sort_values(ascending=False, by='genres', inplace=True, ignore_index=True)
trained['basics']['genres'].drop('genres', axis=1, inplace=True)


year = int(df[df['tconst'] == input_film].startYear.iloc[0])

trained['basics']['years'] = pd.DataFrame(
    {
        'years': df['startYear'],
        'tconst': df['tconst']
    }
)

drop_by_tconst(trained['basics']['years'], input_film)
trained['basics']['years'].sort_values(by='years', key=lambda x: abs(year-x), inplace=True, ignore_index=True)
trained['basics']['years'].drop('years', axis=1, inplace=True)
trained['basics']['years'].reset_index(names='years_index', inplace=True)

# Principals

df = pd.read_csv('./IMDB_data_sets/filtered/principals.csv', dtype={'tconst': str, 'nconst': str}, usecols=['tconst', 'nconst'])
df.nconst = df.nconst.apply(lambda n: ','.join(literal_eval(n)))

cv = CountVectorizer(dtype=np.int8, token_pattern="(?u)[\w'-]+")
count_matrix = cv.fit_transform(df['nconst'])

trained['principals']['nconst'] = pd.DataFrame(
        {
            'nconst': cosine_similarity(count_matrix[df[df['tconst'] == input_film].index[0]], count_matrix)[0],
            'tconst': df['tconst']
        }
    )

drop_by_tconst(trained['principals']['nconst'], input_film)
trained['principals']['nconst'].sort_values(ascending=False, by='nconst', inplace=True, ignore_index=True)
trained['principals']['nconst'].drop('nconst', axis=1, inplace=True)
trained['principals']['nconst'].reset_index(names='nconst_index', inplace=True)

# Ratings

df = pd.read_csv('./IMDB_data_sets/filtered/ratings.csv', dtype={'tconst': str, 'averageRating': float, 'numVotes': 'Int64'})

rating = float(df[df['tconst'] == input_film].averageRating.iloc[0])
votes = int(df[df['tconst'] == input_film].numVotes.iloc[0])

drop_by_tconst(df, input_film)

trained['ratings']['ratings'] = df.sort_values(by='averageRating', key=lambda x: abs(rating-x), ignore_index=True)
trained['ratings']['ratings'].drop(['averageRating', 'numVotes'], axis=1, inplace=True)
trained['ratings']['ratings'].reset_index(names='ratings_index', inplace=True)

df.drop('averageRating', axis=1, inplace=True)

trained['ratings']['votes'] = df.sort_values(by='numVotes', key=lambda x: abs(votes-x), ignore_index=True)
trained['ratings']['votes'].drop('numVotes', axis=1, inplace=True)
trained['ratings']['votes'].reset_index(names='votes_index', inplace=True)

In [3]:
merged = reduce(lambda  left, right: pd.merge(
                                left,
                                right,
                                on=['tconst'],
                                how='outer'
                            ), [
                                trained['basics']['genres'],
                                trained['basics']['years'],
                                trained['principals']['nconst'],
                                trained['ratings']['ratings'],
                                trained['ratings']['votes']
                            ])

In [4]:
merged['average'] = (merged.index*20 + merged.years_index*20 + merged.nconst_index*20 + merged.ratings_index*20 + merged.votes_index*20) / (5*20)
merged

Unnamed: 0,tconst,years_index,nconst_index,ratings_index,votes_index,average
0,tt4255564,744690,297616,669670,669670,476329.2
1,tt2203897,27293,9705,602978,602978,248591.0
2,tt0355627,344502,708640,318038,205177,315271.8
3,tt15387378,710214,98486,540358,540358,377883.8
4,tt5155340,72975,386406,103102,152733,143044.0
...,...,...,...,...,...,...
777383,tt1230211,189871,599826,278434,25539,374210.6
777384,tt12302076,149946,599825,483066,483066,498657.4
777385,tt1230206,189885,599823,301969,25847,378981.8
777386,tt1230179,255769,599809,483065,483065,519818.8


In [5]:
merged.sort_values(by='average')

Unnamed: 0,tconst,years_index,nconst_index,ratings_index,votes_index,average
8695,tt2338151,7775,12586,23860,1151,10813.4
14,tt3659388,49654,98,25758,79,15120.6
8501,tt1754656,30993,116,46193,3247,17810.0
7374,tt2103281,11453,27910,49618,347,19340.4
7549,tt2358592,54985,17633,12668,9182,20403.4
...,...,...,...,...,...,...
758545,tt13334656,700841,672174,500543,500543,626529.2
758577,tt13336544,700845,672184,500576,500576,626551.6
758587,tt13335546,700843,672231,500564,500564,626557.8
758590,tt13335152,700842,672247,500557,500557,626558.6
