"""CineScope: content-based movie recommender served through a Gradio UI.

Builds a bag-of-words "tags" corpus from the TMDB 5000 dataset
(overview + genres + keywords + top-3 cast + director), vectorizes it
with a 5000-feature CountVectorizer, and recommends the 5 most
cosine-similar titles for a user-supplied movie name.
"""

import ast

import gradio as gr
import nltk  # kept from original; PorterStemmer is imported below
import numpy as np  # kept from original imports
import pandas as pd
from datasets import concatenate_datasets, load_dataset
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# --- Load and join the two TMDB CSVs -------------------------------------
movies = load_dataset(
    "jyshbgde/cinescopeDataset", data_files="tmdb_5000_movies.csv"
)["train"]
credits = load_dataset(
    "jyshbgde/cinescopeDataset", data_files="tmdb_5000_credits.csv"
)["train"]

# Both splits carry a "title" column; drop the movies-side copy so the
# column-wise concat does not produce a duplicate column name.
movies = movies.remove_columns("title")
movies = concatenate_datasets([movies, credits], axis=1).to_pandas()

movies = movies[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]]
# BUGFIX: reset_index after dropna. The original kept the gapped index,
# so `recommend` later used an index *label* to subscript the positional
# NumPy `similarity` matrix — wrong row for every movie below a dropped
# one. With a fresh 0..n-1 index, labels and positions coincide.
movies = movies.dropna().reset_index(drop=True)


def convert(obj):
    """Parse a JSON-like list of dicts (stored as a string) into its 'name' values."""
    return [entry["name"] for entry in ast.literal_eval(obj)]


def convert3(obj):
    """Like `convert`, but keep only the first three names (top-billed cast)."""
    return convert(obj)[:3]


def fetch_director(obj):
    """Return the first crew member whose job is 'Director', as a 1-element list.

    Returns an empty list when no director entry is present.
    """
    for entry in ast.literal_eval(obj):
        if entry["job"] == "Director":
            return [entry["name"]]
    return []


movies["genres"] = movies["genres"].apply(convert)
movies["keywords"] = movies["keywords"].apply(convert)
movies["cast"] = movies["cast"].apply(convert3)
movies["crew"] = movies["crew"].apply(fetch_director)


def _collapse(tokens):
    """Join multi-word names into single tokens ("Science Fiction" -> "ScienceFiction")."""
    return [t.replace(" ", "") for t in tokens]


# Build one token list per movie: plot words plus collapsed metadata names.
overview = movies["overview"].apply(str.split)
genres = movies["genres"].apply(_collapse)
keywords = movies["keywords"].apply(_collapse)
cast = movies["cast"].apply(_collapse)
crew = movies["crew"].apply(_collapse)
movies["tags"] = overview + genres + keywords + cast + crew

# .copy() avoids pandas SettingWithCopyWarning (the original assigned
# into a view of `movies`, which may silently fail to write).
new_movies_df = movies[["movie_id", "title", "tags"]].copy()
new_movies_df["tags"] = new_movies_df["tags"].apply(" ".join).str.lower()
new_movies_df["lower_title"] = new_movies_df["title"].str.lower()

ps = PorterStemmer()


def stem(text):
    """Porter-stem every whitespace-separated word in `text`."""
    return " ".join(ps.stem(word) for word in text.split())


new_movies_df["tags"] = new_movies_df["tags"].apply(stem)

# --- Vectorize the corpus and precompute pairwise cosine similarity ------
cv = CountVectorizer(max_features=5000, stop_words="english")
vectors = cv.fit_transform(new_movies_df["tags"]).toarray()
similarity = cosine_similarity(vectors)


def recommend(movie):
    """Return the 5 movies most similar to `movie` (case-insensitive title match).

    Returns a list of dicts with id/title/overview/genres/casts/crew, or a
    single-element list with an "error" key when the title is unknown.
    """
    movie = movie.lower()
    matches = new_movies_df.index[new_movies_df["lower_title"] == movie]
    if len(matches) == 0:
        return [{"error": "movie not found"}]

    movie_index = matches[0]  # positional == label thanks to reset_index above
    distances = similarity[movie_index]
    # [1:6]: skip rank 0, which is the queried movie itself (similarity 1.0).
    top_matches = sorted(enumerate(distances), reverse=True, key=lambda x: x[1])[1:6]

    recommended_movies = []
    for idx, _score in top_matches:
        row = movies.iloc[idx]
        recommended_movies.append(
            {
                "_id": str(row["movie_id"]),
                "title": row["title"],
                "overview": row["overview"],
                "genres": row["genres"],
                "casts": row["cast"],
                "crew": row["crew"],
            }
        )
    return recommended_movies


iface = gr.Interface(
    fn=recommend,
    inputs="text",
    outputs="json",
    title="Movie Recommendation",
    description="Get movies recommendation",
)
iface.launch(debug=True)