"""
Prepare the TMDB 5000 movie data and fit the recommendation model.

The dataset is obtained from TMDB 5000 Movie Dataset
https://www.kaggle.com/datasets/tmdb/tmdb-movie-metadata
"""
import ast

import pandas as pd

from recommendation_model import Model


def get_name(x: str) -> str:
    """Return up to five lower-cased ``name`` values joined by ``', '``.

    ``x`` is a string that ``ast.literal_eval`` parses into a sequence of
    dicts, each carrying a ``'name'`` key. Only the first five entries are
    used.
    """
    # Slice before building the names so the remaining entries are never
    # touched.
    entries = ast.literal_eval(x)[:5]
    return ', '.join(entry['name'].lower() for entry in entries)


def get_director(x: str) -> str:
    """Return the lower-cased names of every entry whose job is 'director'.

    ``x`` is a string that ``ast.literal_eval`` parses into a sequence of
    dicts with ``'name'`` and ``'job'`` keys. The job comparison is
    case-insensitive; matching names are joined by ``', '``.
    """
    return ', '.join(
        member['name'].lower()
        for member in ast.literal_eval(x)
        if member['job'].lower() == 'director'
    )


def get_year(x) -> str:
    """Return the first four characters of ``str(x)``.

    NOTE(review): a missing value (float NaN) comes back as the string
    ``'nan'`` -- confirm that is acceptable to the model.
    """
    return str(x)[:4]


def normalize_data(x):
    """Min-max scale ``x`` into the range [0, 1].

    When every value is identical the range is zero; in that case the result
    is all zeros instead of the NaN that a 0/0 division would give.
    """
    low = x.min()
    span = x.max() - low
    if span == 0:
        return (x - low) * 0.0
    return (x - low) / span


# Input files are resolved relative to the current working directory.
raw1 = pd.read_csv('tmdb_5000_movies.csv')
raw2 = pd.read_csv('tmdb_5000_credits.csv')

# Align the key column name of the credits table with the movies table.
raw2 = raw2.rename(columns={'movie_id': 'id'})
df = pd.merge(raw1, raw2, on='id')

df = df.drop(columns=[
    'budget',
    'homepage',
    'overview',
    'tagline',
    'status',
    'production_companies',
    'production_countries',
    'revenue',
    'spoken_languages',
    'title_x',
    'title_y',
    'vote_count',
])

# Flatten the serialized list columns into comma-separated, lower-cased text.
df['genres'] = df['genres'].map(get_name)
df['keywords'] = df['keywords'].map(get_name)
df['cast'] = df['cast'].map(get_name)
df['crew'] = df['crew'].map(get_director)
df['release_date'] = df['release_date'].map(get_year)

# Replace the original ids with positional row numbers (0..n-1) in a single
# column assignment instead of one .loc write per row.
df['id'] = range(len(df))

df = df.rename(columns={
    'original_language': 'language',
    'original_title': 'title',
    'release_date': 'year',
    'vote_average': 'rating',
    'crew': 'director',
})

# Explicit copy so the column assignments below act on an independent frame
# rather than on a selection of the merged one.
df = df[[
    'id',
    'title',
    'genres',
    'keywords',
    'director',
    'cast',
    'year',
    'language',
    'runtime',
    'popularity',
    'rating',
]].copy()

df['id'] = df['id'].map(str)
df['year'] = df['year'].map(str)

# Bring the numeric features onto a common [0, 1] scale.
df['runtime'] = normalize_data(df['runtime'])
df['popularity'] = normalize_data(df['popularity'])
df['rating'] = normalize_data(df['rating'])

# Text-only view of the data; not referenced again in this script.
df_trim = df[['title', 'genres', 'keywords', 'director', 'cast']]

model = Model(df)
model.fit(save=True)