"""Predict Movies

Calculate the similarity between movies and find
the 10 most similar movies.
"""

import pathlib
import pandas as pd
import numpy as np
import json
from scipy import spatial
import operator
# Resolve the data files relative to this script so it can be run from any
# working directory.
DIR = pathlib.Path(__file__).resolve().parent
movies = pd.read_csv(DIR.joinpath('data/tmdb_5000_movies.csv'))
credits = pd.read_csv(DIR.joinpath('data/tmdb_5000_credits.csv'))
def convert_json(df, col):
    """Replace the JSON text in ``df[col]`` with the string form of its names.

    Each cell is parsed with ``json.loads`` and must be a list of objects
    that carry a ``'name'`` key. The cell is overwritten, in place, with
    ``str()`` of the list of those names. Nothing is returned.
    """
    def names_as_text(raw):
        parsed = json.loads(raw)
        return str([entry['name'] for entry in parsed])

    df[col] = df[col].apply(names_as_text)
# Flatten the JSON columns to the string form of their name lists.
convert_json(movies, 'genres')
convert_json(movies, 'keywords')
convert_json(credits, 'cast')

# Attach the cast to each movie; how='left' keeps every movie row.
movies = movies.merge(credits, left_on='id', right_on='movie_id', how='left')

# .copy() makes the subset an independent frame, so the column assignments
# below (and later in the script) do not raise SettingWithCopyWarning.
movies = movies[
    ['id', 'original_title', 'genres', 'cast', 'vote_average', 'keywords']
].copy()

# Turn "['Science Fiction', 'Action']" into ['ScienceFiction', 'Action']:
# drop the brackets, every space and every single quote, then split on commas.
# regex=False pins literal matching regardless of the pandas default.
for col in ('genres', 'cast', 'keywords'):
    movies[col] = (
        movies[col]
        .str.strip('[]')
        .str.replace(' ', '', regex=False)
        .str.replace("'", '', regex=False)
        .str.split(',')
    )
# Vocabulary of distinct genre names, in order of first appearance.
# dict.fromkeys de-duplicates in linear time while keeping insertion order
# (guaranteed for dicts since Python 3.7), avoiding a list scan per value.
genreList = list(dict.fromkeys(
    genre for movie_genres in movies['genres'] for genre in movie_genres
))
def binary_genres(movie_genres):
    """Encode ``movie_genres`` as a 0/1 list aligned with ``genreList``.

    Position ``i`` is 1 when ``genreList[i]`` occurs in ``movie_genres``
    and 0 otherwise, so the result always has ``len(genreList)`` entries.
    """
    return [1 if genre in movie_genres else 0 for genre in genreList]
# Keep only the first four cast members of every movie. Slicing the list
# directly avoids writing each row through .loc and re-parsing the list's
# repr(), which was slow and mangled names whose repr contains escapes.
movies['cast'] = movies['cast'].apply(lambda names: names[:4])
# Vocabulary of distinct cast names, in order of first appearance.
# dict.fromkeys de-duplicates in linear time while keeping insertion order
# (guaranteed for dicts since Python 3.7), avoiding a list scan per value.
castList = list(dict.fromkeys(
    actor for movie_cast in movies['cast'] for actor in movie_cast
))
def binary_cast(movie_actors):
    """Encode ``movie_actors`` as a 0/1 list aligned with ``castList``.

    Position ``i`` is 1 when ``castList[i]`` occurs in ``movie_actors``
    and 0 otherwise, so the result always has ``len(castList)`` entries.
    """
    return [int(actor in movie_actors) for actor in castList]
# Vocabulary of distinct keywords, in order of first appearance.
# dict.fromkeys de-duplicates in linear time while keeping insertion order
# (guaranteed for dicts since Python 3.7), avoiding a list scan per value.
keywordsList = list(dict.fromkeys(
    keyword for movie_keywords in movies['keywords'] for keyword in movie_keywords
))
def binary_keywords(movie_keywords):
    """Encode ``movie_keywords`` as a 0/1 list aligned with ``keywordsList``.

    Position ``i`` is 1 when ``keywordsList[i]`` occurs in
    ``movie_keywords`` and 0 otherwise, so the result always has
    ``len(keywordsList)`` entries.
    """
    flags = []
    for keyword in keywordsList:
        flags.append(1 if keyword in movie_keywords else 0)
    return flags
# Add one 0/1 vector column per feature, each aligned with its vocabulary list.
movies['genres_bin'] = movies['genres'].apply(binary_genres)
movies['cast_bin'] = movies['cast'].apply(binary_cast)
movies['keywords_bin'] = movies['keywords'].apply(binary_keywords)
def similarity(movieId1, movieId2):
    """Return the summed cosine distance between two movies.

    Both arguments are row positions in the module-level ``movies`` frame.
    The genre, cast and keyword 0/1 vectors are compared separately with
    ``scipy.spatial.distance.cosine`` and the three distances are added,
    so a smaller result means the two movies are more alike.
    """
    first = movies.iloc[movieId1]
    second = movies.iloc[movieId2]
    return sum(
        spatial.distance.cosine(first[column], second[column])
        for column in ('genres_bin', 'cast_bin', 'keywords_bin')
    )
# Number the rows 0..n-1; `similarity` and `predict_movies` use this value
# as a row position for ``movies.iloc``.
new_id = list(range(len(movies)))
# Add the id column and keep only what the recommender needs.
movies = movies.assign(new_id=new_id)[[
    'original_title', 'genres', 'vote_average',
    'genres_bin', 'cast_bin', 'keywords_bin', 'new_id',
]]
def predict_movies(name):
    """Print the ten movies closest to the first title matching ``name``.

    ``name`` is handed to ``Series.str.contains``, so pandas treats it as a
    regular expression. The first matching row of the module-level
    ``movies`` frame becomes the reference movie. Every other movie is
    scored with ``similarity`` and the ten smallest distances are printed
    with their genres and rating.

    Raises:
        IndexError: if no title matches ``name``.
    """
    import operator

    matches = movies[movies['original_title'].str.contains(name, na=False)]
    if matches.empty:
        # Fail with a clear message instead of an opaque positional-indexer
        # error from ``iloc[0]`` on an empty frame.
        raise IndexError(f"no movie title matches {name!r}")

    base = matches.iloc[0]
    base_id = base['new_id']
    print('\nSelected Movie: ', base['original_title'], "\n")

    def get_neighbors(reference_id, k_neighbors):
        """Return up to ``k_neighbors`` (new_id, distance) pairs, nearest first."""
        distances = [
            (other_id, similarity(reference_id, other_id))
            for other_id in movies['new_id']
            if other_id != reference_id
        ]
        distances.sort(key=operator.itemgetter(1))
        # Slicing tolerates fewer than k candidates; indexing would overrun.
        return distances[:k_neighbors]

    for neighbor_id, _ in get_neighbors(base_id, k_neighbors=10):
        row = movies.iloc[neighbor_id]
        # Look columns up by name: integer keys on a label-indexed Series
        # rely on deprecated positional fallback.
        genres = ", ".join(row['genres'])
        print(
            f"{row['original_title']} | Genres: {genres}"
            f" | Rating: {row['vote_average']}"
        )
# Demo run, executed whenever this module is loaded: list the ten nearest
# neighbours of the first title containing 'Godfather'.
predict_movies('Godfather')
"""Count Vectorizer

Represent texts as vectors using
the scikit-learn library.
"""

import pathlib
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Resolve the dataset relative to this script so it can be run from any
# working directory.
DIR = pathlib.Path(__file__).resolve().parent
movies = pd.read_csv(DIR.joinpath('data/movies_dataset2.csv'))
def combine_features(row):
    """Join a row's genres, cast and keywords into one space-separated string.

    Each of the three values is passed through ``str()`` first, so
    non-string cells are included as their string form.
    """
    parts = (row['genres'], row['cast'], row['keywords'])
    return " ".join(str(part) for part in parts)
# Build one text "document" per movie from its genres, cast and keywords.
movies['combined_features'] = movies.apply(combine_features, axis=1)
# Vectorise the documents into a matrix of token counts.
cv = CountVectorizer()
count_matrix = cv.fit_transform(movies['combined_features'])
# Pairwise cosine similarity between all rows of the count matrix.
# NOTE(review): this rebinds the imported `cosine_similarity` function to the
# resulting matrix, so the function is no longer reachable under that name;
# `predict_movies` below indexes this matrix by the same name.
cosine_similarity = cosine_similarity(count_matrix)
def predict_movies(name):
    """Print the ten movies most similar to the first title matching ``name``.

    ``name`` is handed to ``Series.str.contains``, so pandas treats it as a
    regular expression. The index label of the first match is used to pick
    a row of the module-level ``cosine_similarity`` matrix, and the ten
    highest-scoring other movies are printed with genres and rating.

    Raises:
        IndexError: if no title matches ``name``.
    """
    matches = movies[movies['original_title'].str.contains(name, na=False)]
    if matches.empty:
        # Fail with a clear message instead of an opaque out-of-bounds error.
        raise IndexError(f"no movie title matches {name!r}")
    # NOTE(review): the label is used as a row position in the matrix, which
    # assumes `movies` still has its default 0..n-1 index - confirm.
    index = matches.index[0]

    # Exclude the query movie explicitly. Dropping the first sorted entry
    # instead is wrong when another movie ties with it at the top score.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_similarity[index])
        if position != index
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    for movie_index, _ in candidates[:10]:
        data = movies.iloc[movie_index]
        # f-string formatting tolerates non-string cells (e.g. NaN genres),
        # where string concatenation would raise TypeError.
        print(
            f"{data['original_title']} | {data['genres']}"
            f" | Rating: {data['vote_average']}"
        )
# Demo run, executed whenever this module is loaded: list the ten movies most
# similar to the first title containing 'Avatar'.
predict_movies('Avatar')