Mintu Konwar
4 min readAug 16, 2021

Movie Recommendation System with Sentiment Analysis of the Reviews (Part 2)

File_2 of Preprocessing

In this preprocessing file, I will import two datasets containing movies from 2017. The important features needed for later processing will be extracted from these datasets.

import numpy as np
import pandas as pd
import ast
import warnings
warnings.filterwarnings("ignore")
df1 = pd.read_csv('2017_1.csv')
df2 = pd.read_csv('2017_2.csv')
df1.head(5)
png
df2.head(5)
png
df2['release_date'] = pd.to_datetime(df2['release_date'], errors='coerce')
df2['year'] = df2['release_date'].dt.year
df2['year'].value_counts().sort_index()
1874.0 1
1878.0 1
1883.0 1
1887.0 1
1888.0 2
...
2015.0 1905
2016.0 1604
2017.0 532
2018.0 5
2020.0 1
Name: year, Length: 135, dtype: int64
# We will select 2017 movies as we already have movies up to the year 2016 in 2016data.csv file.
# We don't have enough data for the movies from 2018, 2019 and 2020.
df3 = df2.loc[df2.year == 2017,['genres','id','title','year']]
df3
png
# Cast the merge key to int before joining — presumably so that it matches
# the dtype of df1['id'] (confirm against 2017_1.csv).
df3['id'] = df3['id'].astype(int)
# Inner join on 'id': keeps only the movies that appear in both datasets.
df4 = pd.merge(df3, df1, on='id')
# Limit the displayed column width so the long string columns stay readable.
pd.set_option('display.max_colwidth', 75)
df4.head(2)
png

Here the values in these columns are stored as strings, so I use Python's built-in `ast.literal_eval`, applied to each row with `map`, to parse them and save them as list-type values.

# The 'genres', 'cast' and 'crew' cells are stored as strings; parse each one
# into the Python object it spells out. ast.literal_eval only accepts literal
# syntax (lists, dicts, strings, numbers, ...), so no arbitrary code is run.
for col in ('genres', 'cast', 'crew'):
    df4[col] = df4[col].map(ast.literal_eval)
# this function selects only the genres name from genres column
def create_genres(x):
    """Join the genre names of one movie into a space-separated string.

    Args:
        x: Iterable of genre dicts; the ``'name'`` entry of each is used.

    Returns:
        The genre names joined by single spaces, with ``'Science Fiction'``
        written as ``'Sci-Fi'``, or ``np.nan`` when there is no named genre.
    """
    gen = []
    for i in x:
        name = i.get('name')
        if name is None:
            # An entry without a name would make str.join raise below.
            continue
        gen.append('Sci-Fi' if name == 'Science Fiction' else name)
    if not gen:
        # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
        return np.nan
    return " ".join(gen)
df4['genres_list'] = df4['genres'].map(lambda x: create_genres(x))
df4['genres_list']
0 Adventure Action Fantasy Comedy
1 Action Adventure Fantasy Sci-Fi
2 Action Adventure Fantasy Sci-Fi
3 Action Adventure Comedy Sci-Fi
4 Fantasy Action Adventure
...
526 Romance Comedy
527 Crime Comedy Action Family
528 Family Animation Romance Comedy
529 Crime Drama Thriller
530 NaN
Name: genres_list, Length: 531, dtype: object
def actor1(x):
    """Return the name of the first cast member of one movie.

    Args:
        x: Iterable of cast dicts; the ``'name'`` entry of each is used.

    Returns:
        The first entry's name, or ``np.nan`` when the cast is empty.
    """
    casts = [i.get('name') for i in x]
    if not casts:
        # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
        return np.nan
    return casts[0]
df4['actor_1_name'] = df4['cast'].map(lambda x: actor1(x))
df4['actor_1_name']
0 Johnny Depp
1 Ben Affleck
2 Chris Hemsworth
3 Chris Pratt
4 Pierce Brosnan
...
526 Inka Haapamäki
527 Lou Diamond Phillips
528 NaN
529 Sridevi Kapoor
530 NaN
Name: actor_1_name, Length: 531, dtype: object
def actor2(x):
    """Return the name of the second cast member of one movie.

    Args:
        x: Iterable of cast dicts; the ``'name'`` entry of each is used.

    Returns:
        The second entry's name, or ``np.nan`` when the cast has fewer than
        two entries.
    """
    casts = [i.get('name') for i in x]
    if len(casts) < 2:
        # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
        return np.nan
    return casts[1]
df4['actor_2_name'] = df4['cast'].map(lambda x: actor2(x))
df4['actor_2_name']
0 Javier Bardem
1 Henry Cavill
2 Tom Hiddleston
3 Zoe Saldana
4 William Hurt
...
526 Rosa Honkonen
527 Wallace Shawn
528 NaN
529 Sajal Ali
530 NaN
Name: actor_2_name, Length: 531, dtype: object
def actor3(x):
    """Return the name of the third cast member of one movie.

    Args:
        x: Iterable of cast dicts; the ``'name'`` entry of each is used.

    Returns:
        The third entry's name, or ``np.nan`` when the cast has fewer than
        three entries.
    """
    casts = [i.get('name') for i in x]
    if len(casts) < 3:
        # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
        return np.nan
    return casts[2]
df4['actor_3_name'] = df4['cast'].map(lambda x: actor3(x))
df4['actor_3_name']
0 Geoffrey Rush
1 Gal Gadot
2 Cate Blanchett
3 Dave Bautista
4 Benjamin Walker
...
526 Tiitus Rantala
527 Gina Holden
528 NaN
529 Akshaye Khanna
530 NaN
Name: actor_3_name, Length: 531, dtype: object
def directors(x):
    """Join the names of all directors of one movie into a single string.

    Args:
        x: Iterable of crew dicts; entries whose ``'job'`` is ``'Director'``
            contribute their ``'name'``.

    Returns:
        The director names joined by single spaces (co-directors end up in
        one string), or ``np.nan`` when no named director is found.
    """
    dt = []
    for i in x:
        if i.get('job') != 'Director':
            continue
        name = i.get('name')
        if name is not None:
            # A director entry without a name would make str.join raise.
            dt.append(name)
    if not dt:
        # np.nan instead of the np.NaN alias, which NumPy 2.0 removed.
        return np.nan
    return " ".join(dt)
df4['director_name'] = df4['crew'].map(lambda x: directors(x))
df4['director_name']
0 Joachim Rønning Espen Sandberg
1 Zack Snyder
2 Taika Waititi
3 James Gunn
4 Sean McNamara
...
526 Hannaleena Hauru
527 Jonathan A. Rosenbaum
528 Beth David Esteban Bravo
529 Ravi Udyawar
530 Daisy Asquith
Name: director_name, Length: 531, dtype: object
# selecting only those features which will be required for future processing
df5 = df4.loc[:,['director_name','actor_1_name','actor_2_name','actor_3_name','genres_list','title']]
df5
png
# to check for NULL values
df5.isna().sum()
director_name 4
actor_1_name 22
actor_2_name 55
actor_3_name 70
genres_list 7
title 0
dtype: int64
df5 = df5.dropna(how='any')
df5.isna().sum()
director_name 0
actor_1_name 0
actor_2_name 0
actor_3_name 0
genres_list 0
title 0
dtype: int64
df5 = df5.rename(columns={'genres_list':'genres'})
df5 = df5.rename(columns={'title':'movie_title'})
# Lower-case the titles.
df5['movie_title'] = df5['movie_title'].str.lower()
# Combine the three actors, the director(s) and the genres into one
# space-separated text column.
df5['comb'] = (
    df5['actor_1_name'] + ' ' + df5['actor_2_name'] + ' '
    + df5['actor_3_name'] + ' ' + df5['director_name'] + ' '
    + df5['genres']
)
df5
png
# Load the movie data saved in 2016data.csv.
df2016 = pd.read_csv('2016data.csv')
df2016
png
# Combine the three actors, the director(s) and the genres of the 2016 data
# into one space-separated text column.
df2016['comb'] = (
    df2016['actor_1_name'] + ' ' + df2016['actor_2_name'] + ' '
    + df2016['actor_3_name'] + ' ' + df2016['director_name'] + ' '
    + df2016['genres']
)
df2016
png
#here we merge 2016 and 2017 movies
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way, keeping each frame's original index labels.
df201617 = pd.concat([df2016, df5])
df201617
png
#if there is any duplicate movies then we drop one of them
# For repeated titles, keep only the last occurrence.
df201617 = df201617.drop_duplicates(subset="movie_title", keep="last")
df201617
png
df201617.to_csv('2017data.csv',index=False)
Mintu Konwar
Mintu Konwar

Written by Mintu Konwar

A third-year MCA student at Dibrugarh University with an interest in cybersecurity, software development, IT, and Machine Learning.

No responses yet