4 min readAug 16, 2021
Movie Recommendation System with Sentiment Analysis of the reviews(Part 2)
File_2 of Preprocessing
In this preprocessing file, I will import two datasets containing movies of 2017. Important features will be extracted from these datasets which will be required in the future processing
import numpy as np
import pandas as pd
import ast
import warnings
# Load the two 2017 source files, derive a release year for every row of the
# second file, keep only the 2017 rows and merge them with the first file on
# the shared 'id' column.
warnings.filterwarnings("ignore")

df1 = pd.read_csv('2017_1.csv')
df2 = pd.read_csv('2017_2.csv')

df1.head(5)
df2.head(5)

# errors='coerce' keeps going when a release_date cannot be parsed instead of
# raising, so the derived year column may contain missing values.
df2['release_date'] = pd.to_datetime(df2['release_date'], errors='coerce')
df2['year'] = df2['release_date'].dt.year
df2['year'].value_counts().sort_index()
# Output:
# 1874.0 1
# 1878.0 1
# 1883.0 1
# 1887.0 1
# 1888.0 2
# ...
# 2015.0 1905
# 2016.0 1604
# 2017.0 532
# 2018.0 5
# 2020.0 1
# Name: year, Length: 135, dtype: int64

# We will select 2017 movies as we already have movies up to the year 2016 in 2016data.csv file.
# We don't have enough data for the movies from 2018, 2019 and 2020.
# The explicit copy makes df3 an independent frame, so the 'id' assignment
# below does not write into a selection of df2.
df3 = df2.loc[df2.year == 2017, ['genres', 'id', 'title', 'year']].copy()
df3

# Cast the ids to int before merging with df1 on 'id'.
df3['id'] = df3['id'].astype(int)
df4 = pd.merge(df3, df1, on='id')

pd.set_option('display.max_colwidth', 75)
df4.head(2)
Here the values of the dataset are in string format, So I create an abstract syntax tree and save them as list type using python inbuilt ‘lambda function
# Turn the stringified 'genres', 'cast' and 'crew' cells back into Python
# objects by evaluating each cell as a literal.
for column in ('genres', 'cast', 'crew'):
    df4[column] = df4[column].map(ast.literal_eval)
# this function selects only the genres name from genres column
def create_genres(x):
    """Join the genre names of one movie into a single string.

    Args:
        x: Iterable of genre dicts; each entry is read via ``.get('name')``.

    Returns:
        The genre names joined by a single space, with 'Science Fiction'
        shortened to 'Sci-Fi', or ``np.nan`` when ``x`` yields no entries.
    """
    names = [
        'Sci-Fi' if genre.get('name') == 'Science Fiction' else genre.get('name')
        for genre in x
    ]
    if not names:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    return " ".join(names)


df4['genres_list'] = df4['genres'].map(create_genres)
df4['genres_list']
# Output:
# 0 Adventure Action Fantasy Comedy
# 1 Action Adventure Fantasy Sci-Fi
# 2 Action Adventure Fantasy Sci-Fi
# 3 Action Adventure Comedy Sci-Fi
# 4 Fantasy Action Adventure
# ...
# 526 Romance Comedy
# 527 Crime Comedy Action Family
# 528 Family Animation Romance Comedy
# 529 Crime Drama Thriller
# 530 NaN
# Name: genres_list, Length: 531, dtype: object  (last line of the preceding output)


def actor1(x):
    """Return the name of the first cast entry of one movie.

    Args:
        x: Iterable of cast dicts; each entry is read via ``.get('name')``.

    Returns:
        The first entry's name, or ``np.nan`` when ``x`` yields no entries.
    """
    names = [member.get('name') for member in x]
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    return names[0] if names else np.nan


df4['actor_1_name'] = df4['cast'].map(actor1)
df4['actor_1_name']
# Output:
# 0 Johnny Depp
# 1 Ben Affleck
# 2 Chris Hemsworth
# 3 Chris Pratt
# 4 Pierce Brosnan
# ...
# 526 Inka Haapamäki
# 527 Lou Diamond Phillips
# 528 NaN
# 529 Sridevi Kapoor
# 530 NaN
# Name: actor_1_name, Length: 531, dtype: object  (last line of the preceding output)


def actor2(x):
    """Return the name of the second cast entry of one movie.

    Args:
        x: Iterable of cast dicts; each entry is read via ``.get('name')``.

    Returns:
        The second entry's name, or ``np.nan`` when ``x`` yields fewer than
        two entries.
    """
    names = [member.get('name') for member in x]
    # A single length check also covers the empty case.
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    return names[1] if len(names) > 1 else np.nan


df4['actor_2_name'] = df4['cast'].map(actor2)
df4['actor_2_name']
# Output:
# 0 Javier Bardem
# 1 Henry Cavill
# 2 Tom Hiddleston
# 3 Zoe Saldana
# 4 William Hurt
# ...
# 526 Rosa Honkonen
# 527 Wallace Shawn
# 528 NaN
# 529 Sajal Ali
# 530 NaN
# Name: actor_2_name, Length: 531, dtype: object  (last line of the preceding output)


def actor3(x):
    """Return the name of the third cast entry of one movie.

    Args:
        x: Iterable of cast dicts; each entry is read via ``.get('name')``.

    Returns:
        The third entry's name, or ``np.nan`` when ``x`` yields fewer than
        three entries.
    """
    names = [member.get('name') for member in x]
    # A single length check also covers the empty case.
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    return names[2] if len(names) > 2 else np.nan


df4['actor_3_name'] = df4['cast'].map(actor3)
df4['actor_3_name']
# Output:
# 0 Geoffrey Rush
# 1 Gal Gadot
# 2 Cate Blanchett
# 3 Dave Bautista
# 4 Benjamin Walker
# ...
# 526 Tiitus Rantala
# 527 Gina Holden
# 528 NaN
# 529 Akshaye Khanna
# 530 NaN
# Name: actor_3_name, Length: 531, dtype: object  (last line of the preceding output)


def directors(x):
    """Join the names of all crew entries whose job is 'Director'.

    Args:
        x: Iterable of crew dicts; each entry is read via ``.get('job')``
            and ``.get('name')``.

    Returns:
        The matching names joined by a single space, or ``np.nan`` when no
        entry has the job 'Director'.
    """
    names = [member.get('name') for member in x if member.get('job') == 'Director']
    if not names:
        # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
        return np.nan
    return " ".join(names)


df4['director_name'] = df4['crew'].map(directors)
df4['director_name']
# Output:
# 0 Joachim Rønning Espen Sandberg
# 1 Zack Snyder
# 2 Taika Waititi
# 3 James Gunn
# 4 Sean McNamara
# ...
# 526 Hannaleena Hauru
# 527 Jonathan A. Rosenbaum
# 528 Beth David Esteban Bravo
# 529 Ravi Udyawar
# 530 Daisy Asquith
# Name: director_name, Length: 531, dtype: object  (last line of the preceding output)

# selecting only those features which will be required for future processing
df5 = df4.loc[:, ['director_name', 'actor_1_name', 'actor_2_name',
                  'actor_3_name', 'genres_list', 'title']]
df5

# to check for NULL values
df5.isna().sum()
# Output:
# director_name 4
# actor_1_name 22
# actor_2_name 55
# actor_3_name 70
# genres_list 7
# title 0
# dtype: int64

# Drop every row that has a missing value in any of the selected columns.
df5 = df5.dropna(how='any')
df5.isna().sum()
# Output:
# director_name 0
# actor_1_name 0
# actor_2_name 0
# actor_3_name 0
# genres_list 0
# title 0
# dtype: int64

# Rename the columns to the names used for the 2016 frame below.
df5 = df5.rename(columns={'genres_list': 'genres', 'title': 'movie_title'})
df5['movie_title'] = df5['movie_title'].str.lower()

# 'comb' concatenates the three actor names, the director and the genres into
# one space-separated string per movie.
df5['comb'] = (
    df5['actor_1_name'] + ' ' + df5['actor_2_name'] + ' '
    + df5['actor_3_name'] + ' ' + df5['director_name'] + ' ' + df5['genres']
)
df5

df2016 = pd.read_csv('2016data.csv')
df2016
df2016['comb'] = (
    df2016['actor_1_name'] + ' ' + df2016['actor_2_name'] + ' '
    + df2016['actor_3_name'] + ' ' + df2016['director_name'] + ' ' + df2016['genres']
)
df2016

# here we merge 2016 and 2017 movies
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows in the
# same order and keeps the original index labels.
df201617 = pd.concat([df2016, df5])
df201617

# if there are any duplicate movies then we drop one of them
# keep='last' retains the later occurrence of each movie_title.
df201617 = df201617.drop_duplicates(subset="movie_title", keep='last')
df201617

df201617.to_csv('2017data.csv', index=False)