movie_recommendation / models /data_preprocessing.py
shoukaku's picture
refactor code and fix imports
b75eb47
import ast
import pandas as pd
from recommendation_model import Model
"""
The data comes from the TMDB 5000 Movie Dataset, available on Kaggle:
https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata
"""
def get_name(x):
    """Return up to five lower-cased ``name`` values as one string.

    ``x`` is a string holding a Python-literal list of dicts. It is parsed
    with ``ast.literal_eval``, the ``name`` entry of every element is
    lower-cased, and the first five results are joined with ``', '``.
    """
    entries = ast.literal_eval(x)
    lowered = [entry['name'].lower() for entry in entries]
    return ', '.join(lowered[:5])
def get_director(x):
    """Return the lower-cased names of all entries whose job is 'director'.

    ``x`` is a string holding a Python-literal list of dicts, parsed with
    ``ast.literal_eval``. The ``job`` comparison is case-insensitive, and
    matching names are joined with ``', '`` in their original order.
    """
    directors = []
    for member in ast.literal_eval(x):
        if member['job'].lower() == 'director':
            directors.append(member['name'].lower())
    return ', '.join(directors)
def get_year(x):
    """Return the first four characters of the string form of ``x``."""
    text = str(x)
    return text[:4]
def normalize_data(x):
    """Min-max scale ``x`` so its smallest value maps to 0 and its largest to 1.

    ``x`` must provide ``min()`` and ``max()`` and support element-wise
    subtraction and division (in this file it is called on DataFrame columns).
    """
    lowest = x.min()
    value_range = x.max() - lowest
    return (x - lowest) / value_range
# --- Load and merge the two raw TMDB tables ---------------------------------
raw1 = pd.read_csv('tmdb_5000_movies.csv')
raw2 = pd.read_csv('tmdb_5000_credits.csv')
# The credits table calls its key 'movie_id'; rename it so both tables can be
# joined on a common 'id' column.
raw2 = raw2.rename(columns={'movie_id': 'id'})
df = pd.merge(raw1, raw2, on='id')

# --- Flatten the stringified list columns into plain text -------------------
# genres / keywords / cast keep at most five lower-cased names each; crew is
# reduced to the names of its directors; release_date is cut to its first
# four characters.
for column in ('genres', 'keywords', 'cast'):
    df[column] = df[column].map(get_name)
df['crew'] = df['crew'].map(get_director)
df['release_date'] = df['release_date'].map(get_year)

df = df.rename(columns={
    'original_language': 'language',
    'original_title': 'title',
    'release_date': 'year',
    'vote_average': 'rating',
    'crew': 'director',
})

# Keep only the columns handed to Model, in a fixed order. Every other column
# (budget, homepage, overview, ...) is discarded by this selection, so no
# separate drop() is needed. The explicit copy makes the column assignments
# below operate on an independent frame rather than on a selection of the
# merged one (avoids pandas' SettingWithCopyWarning).
df = df[[
    'id',
    'title',
    'genres',
    'keywords',
    'director',
    'cast',
    'year',
    'language',
    'runtime',
    'popularity',
    'rating',
]].copy()

# Replace the TMDB ids with each row's position (0..n-1), stored as strings.
# One column assignment replaces the former row-by-row .loc loop.
df['id'] = [str(position) for position in range(len(df))]
# 'year' needs no further conversion: get_year already returns strings.

# Scale the numeric features to the [0, 1] range.
for column in ('runtime', 'popularity', 'rating'):
    df[column] = normalize_data(df[column])

# NOTE(review): df_trim is not used anywhere in this script; it is kept in
# case other modules import it -- confirm before removing.
df_trim = df[['title', 'genres', 'keywords', 'director', 'cast']]

# Build the recommendation model from the cleaned frame. save=True presumably
# persists the fitted model -- confirm in recommendation_model.
model = Model(df)
model.fit(save=True)