import random

import gradio as gr
import numpy as np
import pandas as pd
import requests
from datasketch import MinHash, MinHashLSHForest
from sklearn.feature_extraction.text import TfidfVectorizer


class ContentBasedRecommender:
    def __init__(self, movies_metadata_path):
        self.movies_metadata_path = movies_metadata_path
        self.indices = None
        self.df2 = None
        self.forest = None
        self.minhashes = None

    def get_recommendations(self, title, top_k):
        # Look up the movie's row index, then query the LSH Forest for its
        # approximate nearest neighbours (excluding the movie itself).
        idx = self.indices[title]
        query_minhash = self.minhashes[idx][1]
        nearest_neighbors = self.forest.query(query_minhash, top_k)
        movie_indices = [int(nn) for nn in nearest_neighbors if nn != idx][:top_k]
        return self.df2['title'].iloc[movie_indices]

    def final_recommends(self, movies, result_number):
        # Pool up to 9 neighbours per seed movie, shuffle, and truncate.
        res = []
        for movie in movies:
            recommendations = self.get_recommendations(movie, top_k=10)
            res.extend(recommendations.iloc[:9].tolist())
        random.shuffle(res)
        return res[:result_number]

    def content_based_recommendation(self):
        self.df2 = pd.read_csv(self.movies_metadata_path)
        tfidf = TfidfVectorizer(stop_words='english')
        self.df2['overview'] = self.df2['overview'].fillna('')
        tfidf_matrix = tfidf.fit_transform(self.df2['overview'])

        # One MinHash per movie, built from the indices of its non-zero TF-IDF
        # terms, i.e. an approximation of Jaccard similarity between the sets
        # of terms appearing in two overviews.
        self.minhashes = []
        for i in range(tfidf_matrix.shape[0]):
            vector = tfidf_matrix[i]
            doc_id = self.df2.index[i]
            minhash = MinHash(num_perm=128)
            for token in vector.nonzero()[1]:
                minhash.update(str(token).encode('utf-8'))
            self.minhashes.append((doc_id, minhash))

        # Index every MinHash in an LSH Forest for fast approximate top-k queries.
        self.forest = MinHashLSHForest(num_perm=128)
        for doc_id, minhash in self.minhashes:
            self.forest.add(doc_id, minhash)
        self.forest.index()

        self.indices = pd.Series(self.df2.index, index=self.df2['title']).drop_duplicates()


def collaborative(user_movies, ret_number):
    # User-based collaborative filtering: inject the new user's ratings,
    # find users with similar taste, and average their weighted ratings.
    user_df = {'userId': [], 'movieId': [], 'rating': []}
    ratings = pd.read_csv('/kaggle/input/the-movies-dataset/ratings.csv')
    ratings['rating'] = (ratings['rating'] * 2).astype(int)

    # Drop movies with too few ratings to correlate on.
    comment_counts = ratings["movieId"].value_counts()
    rare_movies = comment_counts[comment_counts <= 1000].index
    common_movies = ratings[~ratings["movieId"].isin(rare_movies)]

    # Register the new user's ratings under a synthetic userId.
    counter = 0
    for user_movie in user_movies:
        if user_movie[0] in common_movies["movieId"].values:
            user_df['userId'].append(300000)
            user_df['movieId'].append(user_movie[0])
            user_df['rating'].append(user_movie[1])
            counter += 1
    if counter < 3:  # need at least 3 usable ratings
        return []

    ratings = pd.concat([ratings, pd.DataFrame(user_df)], ignore_index=True)
    comment_counts = ratings["movieId"].value_counts()
    rare_movies = comment_counts[comment_counts <= 1000].index
    common_movies = ratings[~ratings["movieId"].isin(rare_movies)]

    user_movie_df = common_movies.pivot_table(index=["userId"], columns=["movieId"], values='rating')
    random_user = 300000
    random_user_df = user_movie_df[user_movie_df.index == random_user]
    movies_watched = random_user_df.columns[random_user_df.notna().any()].tolist()
    movies_watched_df = user_movie_df[movies_watched]

    # Keep only users who rated more than 60% of the same movies.
    user_movie_count = movies_watched_df.T.notnull().sum().reset_index()
    user_movie_count.columns = ["userid", "movie_count"]
    perc = len(movies_watched) * 60 / 100
    user_same_movies = user_movie_count[user_movie_count["movie_count"] > perc]["userid"]
    final_df = movies_watched_df[movies_watched_df.index.isin(user_same_movies)]

    corr_df = final_df.T.corr().unstack().sort_values().drop_duplicates()
    corr_df = pd.DataFrame(corr_df, columns=["corr"])
    corr_df.index.names = ["userid_1", "userid_2"]
    corr_df = corr_df.reset_index()

    # Users whose rating vectors correlate strongly with the new user's.
    top_users = corr_df[(corr_df["userid_1"] == random_user) & (corr_df["corr"] > 0.65)][["userid_2", "corr"]]
    top_users.columns = ["userId", "corr"]
    top_users_score = top_users.merge(ratings[["userId", "movieId", "rating"]], how="inner")
    top_users_score["weighted_rating"] = top_users_score["corr"] * top_users_score["rating"]

    recommendation_df = top_users_score.groupby("movieId").agg({"weighted_rating": "mean"}).reset_index()
    movies_to_be_recommended = recommendation_df[recommendation_df["weighted_rating"] > 3.5] \
        .sort_values("weighted_rating", ascending=False)
    return movies_to_be_recommended["movieId"][:ret_number]
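
# A minimal, self-contained sketch of the MinHash + LSH Forest idea used in
# ContentBasedRecommender above (the documents and expected output here are
# illustrative only): two overviews that share most of their tokens should
# retrieve each other as approximate nearest neighbours.
def _minhash_demo():
    docs = [
        "toy story of a cowboy doll",
        "story of a cowboy doll and a space ranger",
        "a grumpy old man",
    ]
    forest = MinHashLSHForest(num_perm=128)
    hashes = []
    for doc_id, text in enumerate(docs):
        m = MinHash(num_perm=128)
        for token in text.split():
            m.update(token.encode('utf-8'))
        hashes.append(m)
        forest.add(doc_id, m)
    forest.index()
    # Expect document 1 among the top-2 neighbours of document 0.
    print(forest.query(hashes[0], 2))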
def get_video_address(row):
    # Collect the YouTube keys of every 'Trailer' entry in a TMDB videos response.
    inner = []
    for dictionary in row:
        if dictionary.get('type') == 'Trailer':
            inner.append(dictionary['key'])
    return inner


css_code = '''
.bdy{
  color: #eee7d8;
  margin: 0 0 60px 0;
  font-family: 'Gill Sans', 'Gill Sans MT', Calibri, 'Trebuchet MS', sans-serif;
}
.contents{
  display: flex;
  align-content: space-around;
  justify-content: center;
}
.img {
  display: block;
  width: calc(34% - 0.125rem);
  height: auto;
}
.video{
  width: calc(66% - 0.125rem);
  height: auto;
  margin-left: 0.25rem;
}
.center {
  border: 3px solid green;
}
.top{
  display: flex;
  justify-content: space-between;
  margin: 0;
}
.inline{
  display: flex;
  margin: 0;
}
.time{
  padding: 0 0 0 15px;
}
.topleft{
  padding: 0px;
  margin: 0px;
  line-height: 0;
}
.topright{
  line-height: 0;
  margin: 0;
  padding: 0;
}
.name{
  margin: 20px 0 5px 0;
}
'''


def combine(x):
    # Join the 'name' fields of a list of dicts with '|'; NaN for anything else.
    if isinstance(x, list):
        return '|'.join(i['name'] for i in x)
    return np.nan


headers = {
    "accept": "application/json",
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJmMTNkMmNiMTY4MWM3ZWEyMjNiYjgxZTUyYmMzMDUyMCIsInN1YiI6IjY0YTY5MTU1YzNiZmZlMDBjODZiYzRlZSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.H6zk_gSiudNdZTsgwYHFQnJiVXfR_BX5DfYNIoIsRv8"
}

recommender = ContentBasedRecommender('movies_metadata.csv')
recommender.content_based_recommendation()
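
# Illustrative calls (the titles and movie ids are hypothetical; they must
# exist in movies_metadata.csv and ratings.csv respectively):
#   recommender.final_recommends(movies=['Toy Story', 'Jumanji', 'Heat'], result_number=5)
#   collaborative([(862, 9), (8844, 7), (949, 8), (1572, 10)], ret_number=5)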
def recommend_movie(movie_name, Number_of_Recommendation):
    movies = pd.read_csv('movies_metadata.csv')
    Number_of_Recommendation = int(Number_of_Recommendation)
    names = recommender.final_recommends(movies=list(movie_name['Name']),
                                         result_number=Number_of_Recommendation)

    # Map each recommended title back to its IMDb id; TMDB's movie endpoints
    # also resolve IMDb ids such as 'tt0114709'.
    outputIds = []
    for name in names:
        outputIds.append(movies[movies['title'] == name]['imdb_id'].iloc[0])

    html = ''
    for i in range(min(Number_of_Recommendation, len(outputIds))):
        url = "https://api.themoviedb.org/3/movie/{id}?language=en-US".format(id=outputIds[i])
        data = requests.get(url, headers=headers).json()

        url = "https://api.themoviedb.org/3/movie/{id}/videos?language=en-US".format(id=outputIds[i])
        video = requests.get(url, headers=headers).json()
        video = get_video_address(video['results'])
        if len(video) == 0:
            video = ['']

        url = "https://api.themoviedb.org/3/movie/{id}/images".format(id=outputIds[i])
        image = requests.get(url, headers=headers).json()
        if len(image['backdrops']) == 0:
            image['backdrops'] = [{'file_path': ''}]
        # Per-movie card; the markup follows the classes defined in css_code.
        # The image base URL and embed URL are the standard TMDB and YouTube patterns.
        html += '''
        <div class="bdy">
          <div class="top">
            <div class="topleft">
              <h1 class="name">''' + data['title'] + '''</h1>
              <div class="inline">
                <p>''' + str(data.get('release_date', ''))[:4] + '''</p>
                <p class="time">''' + str(data['runtime'] // 60) + 'h ' + str(data['runtime'] % 60) + '''m</p>
              </div>
            </div>
            <div class="topright">
              <p>IMDb RATING</p>
              <p>⭐''' + str(round(data['vote_average'], 2)) + '''/10&nbsp;&nbsp;''' + str(data['vote_count']) + '''</p>
            </div>
          </div>
          <div class="contents">
            <img class="img" src="https://image.tmdb.org/t/p/w500''' + image['backdrops'][0]['file_path'] + '''">
            <iframe class="video" src="https://www.youtube.com/embed/''' + video[0] + '''" allowfullscreen></iframe>
          </div>
          <p>''' + data['overview'] + '''</p>
        </div>
        '''
    return html


iface = gr.Interface(
    fn=recommend_movie,
    inputs=[
        gr.Dataframe(headers=["Name", "Rate"],
                     datatype=["str", "number"],
                     row_count=3,
                     col_count=(2, "fixed")),
        "number",
    ],
    outputs=gr.HTML(),
    title="Movie Recommender",
    description="Enter movie names and your rating (out of 10) for each one. "
                "You must enter at least 3 movies, and every word in a title must "
                "start with a capital letter, e.g. Grumpier Old Men.",
    css=css_code,
    theme='taithrah/Minimal',
)

iface.launch()
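
# Example interaction (hypothetical titles; they must match the 'title' column
# of movies_metadata.csv exactly):
#
#     Name               Rate
#     Toy Story          9
#     Grumpier Old Men   7
#     Jumanji            8
#
# with the number input set to, e.g., 5.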