3 min read · Aug 22, 2021
Movie Recommendation System with Sentiment Analysis of the reviews(Part 4)
File_4 of preprocessing
In this preprocessing file, movie data from Wikipedia is fetched using Beautiful Soup, a Python web-scraping library.
import pandas as pd
import numpy as np
from tmdbv3api import TMDb
from tmdbv3api import Movie
import json
import requests
import bs4 as bs
import urllib.request
import warnings
warnings.filterwarnings("ignore")
Web-scraping the features of 2020 movies from Wikipedia
# --- Scrape the four quarterly tables of 2020 American films from Wikipedia ---
movielink = "https://en.wikipedia.org/wiki/List_of_American_films_of_2020"
moviesource = urllib.request.urlopen(movielink).read()
moviedata = bs.BeautifulSoup(moviesource, 'lxml')

# All four quarterly film tables on the page share the 'wikitable sortable' class.
movietables = moviedata.find_all('table', class_='wikitable sortable')
df1 = pd.read_html(str(movietables[0]))[0]
df2 = pd.read_html(str(movietables[1]))[0]
df3 = pd.read_html(str(movietables[2]))[0]
# The replace() works around a malformed cell in the fourth table that raised
# "ValueError: invalid literal for int() with base 10: '1\"'".
df4 = pd.read_html(str(movietables[3]).replace("'1\"\'", '"1"'))[0]

# pd.concat replaces the nested DataFrame.append chain; DataFrame.append was
# deprecated in pandas 1.4 and removed in pandas 2.0.
df = pd.concat([df1, df2, df3, df4], ignore_index=True)

# Keep only the columns used downstream.
df_2020 = df[['Title', 'Cast and crew']]
tmdb = TMDb()
# NOTE(review): hard-coded API key committed to source — move it to an
# environment variable or config file before publishing.
tmdb.api_key = 'f82d3658e4fe4c21487f2c409f868517'
tmdb_movie = Movie()


def get_genre(x):
    """Return the TMDB genres of movie title *x* as a space-separated string.

    Searches TMDB for the title, fetches the first match's details, and joins
    its genre names with spaces.  Returns ``np.nan`` (float NaN, as in the
    original ``np.NaN``, removed in NumPy 2.0) when the title is not found
    or the movie has no genres listed.
    """
    result = tmdb_movie.search(x)
    if not result:
        return np.nan  # title not found on TMDB
    movie_id = result[0].id
    # Use the configured key via a placeholder: the original URL embedded the
    # key as a literal while passing tmdb.api_key as an ignored second
    # format() argument (the string had only one '{}').
    response = requests.get(
        'https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id, tmdb.api_key))
    data_json = response.json()
    if not data_json.get('genres'):
        return np.nan  # no genre metadata for this movie
    return " ".join(genre['name'] for genre in data_json['genres'])


df_2020['genres'] = df_2020['Title'].map(lambda x: get_genre(str(x)))
In the dataframe df_2020, the director and actor names are combined in the single 'Cast and crew' column, so I will split them apart and store them as new features.
def get_director(x):
    """Extract the director's name from a Wikipedia 'Cast and crew' string.

    The crew text looks like 'Name (director); ...', 'Name (directors); ...'
    or 'Name (director/screenplay); ...'; everything before the first marker
    is the director.  If no marker is present the whole string is returned,
    matching the original behaviour.
    """
    for marker in (" (director)", " (directors)"):
        if marker in x:
            return x.split(marker)[0]
    return x.split(" (director/screenplay)")[0]


df_2020['director_name'] = df_2020['Cast and crew'].map(lambda x: get_director(str(x)))


def _cast_names(x):
    """Return the comma-separated actor names following the crew prefix.

    Shared helper for get_actor1/2/3, which previously each repeated the
    same split("screenplay); ") + split(", ") pipeline.
    """
    return x.split("screenplay); ")[-1].split(", ")


def get_actor1(x):
    # First billed actor; the crew prefix always ends before the cast list.
    return _cast_names(x)[0]


def get_actor2(x):
    # np.nan replaces np.NaN (alias removed in NumPy 2.0); same float NaN value.
    names = _cast_names(x)
    return names[1] if len(names) >= 2 else np.nan


def get_actor3(x):
    names = _cast_names(x)
    return names[2] if len(names) >= 3 else np.nan


df_2020['actor_1_name'] = df_2020['Cast and crew'].map(lambda x: get_actor1(str(x)))
df_2020['actor_2_name'] = df_2020['Cast and crew'].map(lambda x: get_actor2(str(x)))
df_2020['actor_3_name'] = df_2020['Cast and crew'].map(lambda x: get_actor3(str(x)))
# Align the column name with the historical datasets and keep only the
# features consumed by the recommender.
df_2020 = df_2020.rename(columns={'Title': 'movie_title'})
feature_cols = ['director_name', 'actor_1_name', 'actor_2_name',
                'actor_3_name', 'genres', 'movie_title']
new_df20 = df_2020.loc[:, feature_cols]

# Build the combined text feature.  A NaN in any part propagates to 'comb',
# so the dropna() below also removes rows with missing metadata.
comb = new_df20['actor_1_name']
for col in ('actor_2_name', 'actor_3_name', 'director_name', 'genres'):
    comb = comb + ' ' + new_df20[col]
new_df20['comb'] = comb

new_df20 = new_df20.dropna(how='any')
# Lower-case titles for case-insensitive lookup in the app.
new_df20['movie_title'] = new_df20['movie_title'].str.lower()
# Previously processed movies (2016-2019) produced by the earlier
# preprocessing notebooks.
old_df = pd.read_csv('final_16171819.csv')
Combining all the movies datasets together
# Stack the historical and 2020 frames.  pd.concat replaces DataFrame.append,
# which was deprecated in pandas 1.4 and removed in pandas 2.0.
final_df = pd.concat([old_df, new_df20], ignore_index=True)
This is the final dataset that contains all the movies till 2020
# Persist the combined dataset; presumably consumed by the recommendation
# app in a later part of the series — confirm against the serving code.
final_df.to_csv('main_data.csv',index=False)