#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 7 14:09:23 2022

@author: ilkayisik

Streamlit app for user based movie recommendations

Changes to the first version:
1. put all the widgets to the sidebar
2. add the time period option in the user id based recommendation
"""
# imports
import importlib.util
import os
import re
import subprocess
import sys
from datetime import datetime
from urllib.parse import quote_plus


def install(package):
    """Install *package* with pip into the interpreter running this script."""
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])


def _ensure_installed(module_name, pip_name):
    """Install *pip_name* only when *module_name* cannot be imported yet."""
    if importlib.util.find_spec(module_name) is None:
        install(pip_name)


# Streamlit re-executes this script on every interaction, so pip is only
# invoked when a module is really missing.  The importable module is called
# ``sklearn`` but the distribution to install is ``scikit-learn``.
_ensure_installed('sklearn', 'scikit-learn')
_ensure_installed('bs4', 'bs4')

# NOTE(review): ``title`` is not used anywhere in this script and importing
# ``turtle`` needs tkinter; kept only because other code may rely on it.
from turtle import title  # noqa: E402,F401

import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402
import requests  # noqa: E402
import streamlit as st  # noqa: E402
from bs4 import BeautifulSoup  # noqa: E402
from PIL import Image  # noqa: E402
from sklearn.metrics.pairwise import cosine_similarity  # noqa: E402

# Work from the script's own directory so the relative image paths resolve.
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)

# %% load data
_DATA_URL = ('https://raw.githubusercontent.com/sherwan-m/'
             'WBSFLIX_Recommender_System/main/ml-latest-small/')
movie_df = pd.read_csv(_DATA_URL + 'movies.csv')
rating_df = pd.read_csv(_DATA_URL + 'ratings.csv')
links_df = pd.read_csv(_DATA_URL + 'links.csv')
tags_df = pd.read_csv(_DATA_URL + 'tags.csv')

# %% format dataframes
# MOVIE DF:
movie_df = (
    movie_df
    # pull the "(YYYY)" part out of the title; titles without one stay as-is
    .assign(year=lambda df_: df_['title'].replace(r'(.*)\((\d{4})\)', r'\2',
                                                  regex=True))
    # replace with 0 if there is no year
    .assign(year=lambda df_: np.where(df_['year'].str.len() <= 5,
                                      df_['year'], 0))
)
# convert the year column to int
movie_df['year'] = movie_df['year'].astype(int)
movie_df['title'] = movie_df['title'].str.replace(r'\'', '', regex=True)

# create a genre list: every distinct genre, sorted, with 'Any' in front
genre_list = sorted(
    {genre for genres in movie_df['genres'] for genre in genres.split('|')}
    - {"(no genres listed)"}
)
genre_list.insert(0, 'Any')

# every release year present in the data, without the 0 "unknown" placeholder
year_list = sorted(set(movie_df['year']) - {0})

# create a list of movies
movie_list = sorted(set(movie_df['title']))

# %% RATING DF
# convert timestamp to datetime format
rating_df['datetime'] = rating_df['timestamp'].apply(datetime.fromtimestamp)
# drop the timestamp column
rating_df.drop(columns=['timestamp'], inplace=True)

# %% DEFINE FUNCTIONS
_RESULT_COLUMNS = ['movieId', 'title', 'genres', 'year']


def transform_genre_to_regex(genres):
    """Return a regex that matches strings containing every genre in *genres*.

    Each genre becomes one look-ahead, so the genres may appear in any order.
    An empty *genres* yields an empty pattern, which matches everything.
    """
    return "".join(f"(?=.*{re.escape(genre)})" for genre in genres)


# to make the the dataframe look nicer
def make_pretty(styler):
    """Attach the recommendation caption to *styler* and return it."""
    styler.set_caption("Top movie recommendations for you")
    return styler


def _empty_recommendations():
    """Return the empty frame used when no recommendation can be made."""
    return pd.DataFrame(columns=_RESULT_COLUMNS)


def _apply_filters(frame, genres, time_period):
    """Keep rows inside *time_period* (inclusive) that have all *genres*.

    'Any' is ignored as a genre.  The caller's *genres* list is not modified.
    """
    first_year, last_year = time_period[0], time_period[1]
    wanted = [genre for genre in genres if genre != 'Any']
    in_period = (frame['year'] >= first_year) & (frame['year'] <= last_year)
    has_genres = frame['genres'].str.contains(
        transform_genre_to_regex(wanted), na=False)
    return frame.loc[in_period & has_genres]


def _label_rows(frame):
    """Return a copy of *frame* indexed 'movie-1', 'movie-2', ..."""
    labelled = frame.copy()
    labelled.index = [f'movie-{i}' for i in range(1, len(labelled) + 1)]
    return labelled


def _similarity_to(matrix, target):
    """Return the cosine similarity of every row of *matrix* to row *target*.

    Only the one needed column of the similarity matrix is computed; the
    target row itself is left out of the result.
    """
    scores = cosine_similarity(matrix, matrix.loc[[target]]).ravel()
    return pd.Series(scores, index=matrix.index).drop(target)


_IMDB_URL = "https://www.imdb.com"
_PLACEHOLDER_COVER = 'https://i.stack.imgur.com/6M513.png'
_REQUEST_TIMEOUT = 10  # seconds
_SEARCH_RESULT_SELECTOR = (
    "div.article table.findList tr.findResult.odd td.primary_photo a")
_POSTER_LINK_SELECTOR = (
    "#__next > main > div > "
    "section.ipc-page-background.ipc-page-background--base"
    ".sc-ca85a21c-0.efoFqn > section > div:nth-child(4) > section > "
    "section > div.sc-2a827f80-2.kqTacj > div.sc-2a827f80-3.dhWlsy > div > "
    "div.sc-77a2c808-2.mcnrT > div > div > a")
_POSTER_IMAGE_SELECTOR = "div.sc-7c0a9e7c-2.bkptFa img"


def _imdb_search_url(title):
    """Return the IMDb search URL for *title* with the query URL-encoded."""
    return f"{_IMDB_URL}/find?q={quote_plus(str(title))}"


def _fetch_soup(url):
    """Download *url* and return it parsed as a BeautifulSoup document."""
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    return BeautifulSoup(response.content, "html.parser")


def _first_attr(soup, selector, attr):
    """Return *attr* of the first element matching *selector*, or None."""
    matches = soup.select(selector)
    return matches[0].get(attr) if matches else None


def _lookup_imdb(title):
    """Return ``(cover_url, imdb_link)`` for *title* on a best-effort basis.

    Whenever a step fails (no search hit, selector no longer matching,
    network error) the placeholder cover is returned together with the most
    specific IMDb link found so far.
    """
    link = _imdb_search_url(title)
    try:
        movie_page = _first_attr(_fetch_soup(link),
                                 _SEARCH_RESULT_SELECTOR, 'href')
        if movie_page is None:
            return _PLACEHOLDER_COVER, link
        link = _IMDB_URL + movie_page
        pic_page = _first_attr(_fetch_soup(link),
                               _POSTER_LINK_SELECTOR, 'href')
        if pic_page is None:
            return _PLACEHOLDER_COVER, link
        pic_link = _first_attr(_fetch_soup(_IMDB_URL + pic_page),
                               _POSTER_IMAGE_SELECTOR, 'src')
    except requests.RequestException:
        return _PLACEHOLDER_COVER, link
    return pic_link or _PLACEHOLDER_COVER, link


def add_image_link(movies):
    """Add 'cover_pic' and 'imdb_link' columns to *movies* and return it.

    Parameters
    ----------
    movies : pandas.DataFrame
        Must have a 'title' column.  The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame; each row gets a poster URL (or a placeholder image)
        and an IMDb link, both scraped from imdb.com.
    """
    lookups = [_lookup_imdb(movie.title) for _, movie in movies.iterrows()]
    movies['cover_pic'] = [cover for cover, _ in lookups]
    movies['imdb_link'] = [link for _, link in lookups]
    return movies


def test(movie="Toy Story (1995)"):
    """Debug helper: return the poster anchor tags found on a movie's page.

    Returns an empty list when the IMDb search gives no result for *movie*.
    """
    movie_page = _first_attr(_fetch_soup(_imdb_search_url(movie)),
                             _SEARCH_RESULT_SELECTOR, 'href')
    if movie_page is None:
        return []
    movie_soup = _fetch_soup(_IMDB_URL + movie_page)
    return movie_soup.select("div.sc-77a2c808-2.mcnrT div div a")


# population based
def popular_top_n(n, genres, time_period):
    """Return the *n* most popular movies matching the filters.

    Parameters
    ----------
    n : int
        Maximum number of movies to return.
    genres : list of str
        Genres a movie must all have; 'Any' is ignored.
    time_period : sequence of two ints
        First and last release year, both inclusive.

    Returns
    -------
    pandas.DataFrame
        Columns movieId, title, genres, year; index 'movie-1', 'movie-2', ...
    """
    popularity = (
        rating_df
        .groupby(by='movieId')
        .agg(rating_mean=('rating', 'mean'),
             rating_count=('rating', 'count'))
        # mean rating plus the rating count rescaled to the range 0..500
        .assign(overall_rating=lambda df_: (
            df_['rating_mean']
            + df_['rating_count'] * 5 * 100 / df_['rating_count'].max()))
        # keep movieId as a column: it is the merge key below
        .reset_index()
    )
    # Movies without any rating get NaN scores and therefore sort last.
    ranked = movie_df.merge(popularity, how='left', on='movieId')
    ranked = (
        _apply_filters(ranked, genres, time_period)
        .sort_values('overall_rating', ascending=False)
        .drop(columns=['rating_mean', 'rating_count', 'overall_rating'])
        .reset_index(drop=True)
        .head(n)
    )
    return _label_rows(ranked)


# movie/item based
def item_n_movies(target_name, n, genres, time_period):
    """Return up to *n* movies rated similarly to the movie *target_name*.

    Parameters
    ----------
    target_name : str
        Movie title, compared case-insensitively with ``movie_df['title']``.
    n : int
        Maximum number of movies to return.
    genres : list of str
        Genres a movie must all have; 'Any' is ignored.
    time_period : sequence of two ints
        First and last release year, both inclusive.

    Returns
    -------
    pandas.io.formats.style.Styler or pandas.DataFrame
        Styled table with columns movieId, title, genres, year, or an empty
        DataFrame with those columns when the title is unknown or unrated.
    """
    # check the movie input
    matches = movie_df.loc[
        movie_df['title'].str.lower() == target_name.lower(), 'movieId']
    if matches.empty:
        return _empty_recommendations()
    # several rows may share one title; use the first
    target_id = int(matches.iloc[0])

    movie_user_matrix = (
        rating_df
        .pivot_table(index='movieId', columns='userId', values='rating')
        .fillna(0)
    )
    if target_id not in movie_user_matrix.index:
        # the movie exists in the catalogue but nobody has rated it
        return _empty_recommendations()

    similarity = _similarity_to(movie_user_matrix, target_id)
    rating_count = rating_df.groupby(by='movieId')['userId'].count()
    # Both series are indexed by movieId, so they line up per movie.
    candidates = (
        pd.DataFrame({'similarity': similarity, 'rating_count': rating_count})
        .dropna(subset=['similarity'])
        .sort_values('similarity', ascending=False)
        .merge(movie_df, left_index=True, right_on='movieId')
    )
    # ignore movies with fewer than three ratings
    candidates = candidates.loc[candidates['rating_count'] >= 3]
    top_n = (
        _apply_filters(candidates, genres, time_period)
        .drop(columns=['similarity', 'rating_count'])
        .head(n)
    )
    return _label_rows(top_n).style.pipe(make_pretty)


# user based
def user_n_movies(user_id, n, genres, time_period):
    """Return up to *n* movies predicted to suit the user *user_id*.

    The 100 users most similar to *user_id* are selected; the prediction for
    each movie the user has not rated is their similarity-weighted rating.

    Parameters
    ----------
    user_id : int
        A value of ``rating_df['userId']``.
    n : int
        Maximum number of movies to return.
    genres : list of str
        Genres a movie must all have; 'Any' is ignored.
    time_period : sequence of two ints
        First and last release year, both inclusive.

    Returns
    -------
    pandas.io.formats.style.Styler or pandas.DataFrame
        Styled table with columns movieId, title, genres, year, or an empty
        DataFrame with those columns when *user_id* has no ratings.
    """
    # ``in`` on a Series looks at its index, so test the values explicitly
    if user_id not in rating_df["userId"].values:
        return _empty_recommendations()

    users_items = pd.pivot_table(data=rating_df, values='rating',
                                 index='userId', columns='movieId').fillna(0)
    similarity = _similarity_to(users_items, user_id)
    weights = similarity / similarity.sum()

    # restrict to the 100 nearest users (plus the user) and weigh them again
    new_userids = weights.sort_values(ascending=False).head(100).index.tolist()
    new_userids.append(user_id)
    new_ratings = rating_df.loc[rating_df['userId'].isin(new_userids)]
    new_users_items = pd.pivot_table(data=new_ratings, values='rating',
                                     index='userId',
                                     columns='movieId').fillna(0)
    new_similarity = _similarity_to(new_users_items, user_id)
    new_weights = new_similarity / new_similarity.sum()

    # ratings of the other users for the movies this user has not rated
    not_watched_movies = new_users_items.loc[
        new_users_items.index != user_id,
        new_users_items.loc[user_id, :] == 0]
    weighted_averages = (
        not_watched_movies.T.dot(new_weights)
        .rename("predicted_rating")
        .to_frame()
    )
    recommendations = (
        weighted_averages
        .merge(movie_df, left_index=True, right_on="movieId")
        .sort_values("predicted_rating", ascending=False)
    )
    top_n = (
        _apply_filters(recommendations, genres, time_period)
        .reset_index(drop=True)
        .drop(columns=['predicted_rating'])
        .head(n)
    )
    return _label_rows(top_n).style.pipe(make_pretty)


# %% STREAMLIT
# Set configuration
st.set_page_config(page_title="WBSFLIX",
                   page_icon="🎬",
                   initial_sidebar_state="expanded",
                   layout="wide"
                   )
# set colors: These has to be set on the setting menu online
# primary color: #FF4B4B, background color:#0E1117
# text color: #FAFAFA, secondary background color: #E50914

# Set the logo of app
st.sidebar.image("wbs_logo.png", width=300, clamp=True)
welcome_img = Image.open('welcome_page_img01.png')
st.image(welcome_img)
st.sidebar.markdown("""
# 🎬 Welcome to the next generation movie recommendation app
""")

# %% APP WORKFLOW
st.sidebar.markdown("""
### How may we help you?
"""
                    )


def _recommendation_controls(slider_key, default_genres=None):
    """Draw the genre, amount and year widgets shared by all three forms.

    *slider_key* is the key of the "Number of recommendations" slider; the
    other two widgets get keys derived from it so that every form has its own
    widget identities.  Returns ``(genre, nr_rec, time_period)``.
    """
    selected_genres = st.multiselect(
        "Genre",
        options=genre_list,
        help="Select the genre of the movie you would like to watch",
        default=default_genres,
        key=f"{slider_key}_genre")
    amount = st.slider(
        "Number of recommendations",
        min_value=1,
        max_value=20,
        value=5,
        step=1,
        key=slider_key,
        help="How many movie recommendations would you like to receive?",
    )
    years = st.slider('years:', min_value=1900, max_value=2018,
                      value=(2010, 2018), step=1, key=f"{slider_key}_years")
    return selected_genres, amount, years


def _spacer():
    """Put some space in between options."""
    for _ in range(3):
        st.write("")


# Popularity based recommender system
genre_default = ['Any']
pop_based_rec = st.sidebar.checkbox("Show me the all time favourites",
                                    False,
                                    help="Movies that are liked by many people")
if pop_based_rec:
    st.markdown("### Select the Genre and the Number of recommendations")
    with st.form(key="pop_form"):
        genre, nr_rec, time_period = _recommendation_controls(
            "n", genre_default)
        submit_button_pop = st.form_submit_button(label="Submit")

    if submit_button_pop:
        popular_movie_recs = popular_top_n(nr_rec, genre, time_period)
        st.table(popular_movie_recs)
        # show a cover picture and an IMDb link for every recommendation
        with_links = add_image_link(popular_movie_recs.reset_index(drop=True))
        for index, movie in with_links.iterrows():
            st.image(movie['cover_pic'], width=300)
            st.write(f"[imdb link for: {movie['title']}]({movie['imdb_link']})")

# to put some space in between options
_spacer()

item_based_rec = st.sidebar.checkbox("Show me a movie like this",
                                     False,
                                     help="Input some movies and we will show you similar ones")
if item_based_rec:
    st.markdown("### Tell us a movie you like:")
    with st.form(key="movie_form"):
        movie_name = st.multiselect(label="Movie name",
                                    options=pd.Series(movie_list),
                                    help="Select a movie you like",
                                    key='item_select',
                                    default='Toy Story 2 (1999)'
                                    )
        genre, nr_rec, time_period = _recommendation_controls(
            "nr_rec_movie", genre_default)
        submit_button_movie = st.form_submit_button(label="Submit")

    if submit_button_movie:
        if not movie_name:
            # the multiselect can be emptied, so there may be nothing to use
            st.warning("Please select a movie you like first.")
        else:
            # only the first selected movie is used
            liked_movie = movie_name[0]
            st.write('Because you like {}:'.format(liked_movie))
            item_movie_recs = item_n_movies(liked_movie, nr_rec, genre,
                                            time_period)
            st.table(item_movie_recs)

# to put some space in between options
_spacer()

user_based_rec = st.sidebar.checkbox("I want to get personalized recommendations",
                                     False,
                                     help="Login to get personalized recommendations")
if user_based_rec:
    st.markdown("### Please login to get customized recommendations just for you")
    with st.form(key="user_form"):
        user_id = st.number_input("Please enter your user id", step=1,
                                  min_value=1)
        # no genre is preselected in this form
        genre, nr_rec, time_period = _recommendation_controls("nr_rec")
        submit_button_user = st.form_submit_button(label="Submit")

    if submit_button_user:
        user_movie_recs = user_n_movies(user_id, nr_rec, genre, time_period)
        st.table(user_movie_recs)