new_web_app / app.py
sherwan's picture
Update app.py
4cb81eb
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 7 14:09:23 2022
@author: ilkayisik
Streamlit app for user based movie recommendations
Changes to the first version:
1. put all the widgets to the sidebar
2. add the time period option in the user id based recommendation
"""
# imports
import subprocess
import sys
def install(package):
    """Install ``package`` with pip into the environment running this script.

    pip is invoked as ``<sys.executable> -m pip install <package>``.
    ``subprocess.check_call`` raises ``subprocess.CalledProcessError`` when
    pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_command)
# Install the third-party packages imported further down, using their real
# PyPI distribution names.  "sklearn" is a deprecated placeholder name for
# scikit-learn, and "bs4" is only a stand-in for beautifulsoup4.
install('scikit-learn')
install('beautifulsoup4')
from turtle import title
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image
from datetime import datetime
import os
# Switch the working directory to the folder that contains this script so
# that relative paths (such as the image files opened later in the file)
# resolve no matter where the app was started from.
abspath = os.path.abspath(__file__)
dname = os.path.split(abspath)[0]
os.chdir(dname)
# %% load data
# Download the four "ml-latest-small" CSV tables.  The URLs are listed in the
# same order as the variables they are unpacked into.
movie_df, rating_df, links_df, tags_df = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/movies.csv',
        'https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/ratings.csv',
        'https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/links.csv',
        'https://raw.githubusercontent.com/sherwan-m/WBSFLIX_Recommender_System/main/ml-latest-small/tags.csv',
    )
)
# %% format dataframes
# MOVIE DF:
movie_df = (
    movie_df
    # Replace everything up to and including a parenthesised 4-digit group
    # with just those digits, e.g. "Toy Story (1995)" -> "1995".  Titles
    # without such a group are left unchanged by the replacement.
    .assign(year=lambda df_: df_['title'].replace(r'(.*)\((\d{4})\)', r'\2', regex=True))
    # Anything longer than 5 characters is treated as "no year found" and
    # replaced with 0 (5 rather than 4 tolerates one trailing character).
    .assign(year=lambda df_: np.where(df_['year'].str.len() <= 5, df_['year'], 0))
)
# convert the year column to int
movie_df['year'] = movie_df['year'].astype(int)
# strip apostrophes from the titles
movie_df['title'] = movie_df['title'].str.replace(r'\'', '', regex=True)

# Distinct genres in alphabetical order, with 'Any' as the first choice.
# The "(no genres listed)" placeholder is removed with a set difference so
# that a dataset without it does not raise.
genre_list = sorted(
    {genre for genres in movie_df['genres'] for genre in genres.split("|")}
    - {"(no genres listed)"}
)
genre_list.insert(0, 'Any')

# Distinct release years, excluding the 0 placeholder explicitly instead of
# relying on where it happens to land when a set is iterated.
year_list = sorted(set(movie_df['year']) - {0})

# Distinct movie titles, sorted so the selection widget has a stable order.
movie_list = sorted(set(movie_df['title']))
# %% RATING DF
# convert timestamp to datetime format
# Derive a 'datetime' column from the numeric 'timestamp' column
# (datetime.fromtimestamp interprets it in the machine's local timezone),
# then remove the original 'timestamp' column from the frame in place.
rating_df['datetime'] = rating_df['timestamp'].map(datetime.fromtimestamp)
del rating_df['timestamp']
# %% DEFINE FUNCTIONS
def transform_genre_to_regex(genres):
    """Build a regex that requires every entry of ``genres`` to be present.

    Each genre is turned into a ``(?=.*<genre>)`` lookahead and the
    lookaheads are concatenated in input order.  An empty iterable gives
    the empty pattern.  Genre names are inserted verbatim (not escaped).
    """
    return "".join(f"(?=.*{genre})" for genre in genres)
# make the dataframe look nicer
def make_pretty(styler):
    """Set the recommendations caption on ``styler`` and return it.

    Intended for use with ``Styler.pipe``: the object passed in is
    modified through ``set_caption`` and that same object is returned.
    """
    caption = "Top movie recommendations for you"
    styler.set_caption(caption)
    return styler
import requests
from bs4 import BeautifulSoup
def _fetch_soup(url):
    """Download ``url`` and parse it as HTML; return None if the request fails."""
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException:
        return None
    return BeautifulSoup(response.content, "html.parser")


def _first_attr(soup, selector, attr):
    """Return ``attr`` of the first element matching ``selector``, else None."""
    if soup is None:
        return None
    matches = soup.select(selector)
    return matches[0].get(attr) if matches else None


def _imdb_cover_and_link(title):
    """Look up one title on IMDb and return ``(cover_pic_url, imdb_link)``.

    Falls back to a placeholder picture whenever a page cannot be fetched
    or an expected element is missing, and to the IMDb search URL when no
    movie page is found.
    """
    from urllib.parse import urlencode

    imdb_url = "https://www.imdb.com"
    placeholder_pic = 'https://i.stack.imgur.com/6M513.png'

    # URL-encode the title so characters such as '&' or '#' do not cut the
    # query short.
    search_link = f"{imdb_url}/find?{urlencode({'q': title})}"
    movie_page = _first_attr(
        _fetch_soup(search_link),
        "div.article table.findList tr.findResult.odd td.primary_photo a",
        'href',
    )
    if movie_page is None:
        # No search hit: do not build a URL from a dummy path.
        return placeholder_pic, search_link

    movie_link = imdb_url + movie_page
    # NOTE(review): this selector depends on generated class names in the
    # page markup and presumably breaks when the site is redeployed.
    pic_page = _first_attr(
        _fetch_soup(movie_link),
        "#__next > main > div > section.ipc-page-background.ipc-page-background--base.sc-ca85a21c-0.efoFqn > section > div:nth-child(4) > section > section > div.sc-2a827f80-2.kqTacj > div.sc-2a827f80-3.dhWlsy > div > div.sc-77a2c808-2.mcnrT > div > div > a",
        'href',
    )
    if pic_page is None:
        return placeholder_pic, movie_link

    pic_link = _first_attr(
        _fetch_soup(imdb_url + pic_page),
        "div.sc-7c0a9e7c-2.bkptFa img",
        'src',
    )
    return (pic_link or placeholder_pic), movie_link


def add_image_link(movies):
    """Add 'cover_pic' and 'imdb_link' columns to ``movies`` by scraping IMDb.

    Parameters
    ----------
    movies : pandas.DataFrame
        Must have a 'title' column when it has rows.  The frame is
        modified in place (two columns are added) and also returned.

    Returns
    -------
    pandas.DataFrame
        The same ``movies`` object with the two extra columns.

    Network errors and missing page elements never raise; the affected
    row gets a placeholder picture instead.
    """
    cover_pic = []
    imdb_links = []
    for _, movie in movies.iterrows():
        pic_link, imdb_link = _imdb_cover_and_link(movie['title'])
        cover_pic.append(pic_link)
        imdb_links.append(imdb_link)
    movies['cover_pic'] = cover_pic
    movies['imdb_link'] = imdb_links
    return movies
def test(movie="Toy Story (1995)"):
    """Debug helper for the IMDb scraping selectors.

    Searches IMDb for ``movie``, opens the first search hit and returns
    the elements matched by the poster-link selector on that page.
    Returns an empty list when the search yields no usable hit.
    """
    imdb_url = "https://www.imdb.com"
    imdb_search_url = f"/find?q={movie}"
    imdb_r = requests.get(imdb_url + imdb_search_url, timeout=10)
    imdb_soup = BeautifulSoup(imdb_r.content, "html.parser")
    search_hits = imdb_soup.select(
        "div.article table.findList tr.findResult.odd td.primary_photo a")
    movie_page = search_hits[0].get('href') if search_hits else None
    if not movie_page:
        # No search result: there is no movie page to open, so stop here
        # instead of requesting a URL built from a dummy path.
        return []
    imdb_pic_r = requests.get(imdb_url + movie_page, timeout=10)
    imdb_pic_soup = BeautifulSoup(imdb_pic_r.content, "html.parser")
    # Only the matched anchors are returned; following one of them to the
    # full-size image is not done here.
    pic_page = imdb_pic_soup.select("div.sc-77a2c808-2.mcnrT div div a")
    return pic_page
# population based
def popular_top_n(n, genres, time_period):
    """Return the ``n`` most popular movies for the given genres and years.

    Parameters
    ----------
    n : int
        Maximum number of movies to return.
    genres : list of str
        Genres a movie must all have; the entry 'Any' is ignored.  The
        list is not modified.
    time_period : sequence of two ints
        Inclusive ``(first_year, last_year)`` range for the 'year' column.

    Returns
    -------
    pandas.DataFrame
        The ``movie_df`` columns of the selected movies, best first,
        indexed 'movie-1', 'movie-2', ...
    """
    popular_n = (
        rating_df
        .groupby(by='movieId')
        .agg(rating_mean=('rating', 'mean'), rating_count=('movieId', 'count'))
        # The count term is scaled so the most-rated movie gets 500 points,
        # so popularity dominates and the mean rating breaks near-ties.
        .assign(overall_rating=lambda df_: (
            df_['rating_mean']
            + df_['rating_count'] * 5 * 100 / df_['rating_count'].max()))
    )
    # popular_n must stay indexed by movieId: the merge below matches that
    # index against movie_df['movieId'].
    top_n = popular_n.merge(movie_df, how='right', left_index=True, right_on="movieId")
    top_n = top_n.loc[lambda df_: ((df_['year'] >= time_period[0])
                                   & (df_['year'] <= time_period[1]))]

    # Build a filtered copy rather than removing 'Any' from the caller's list.
    wanted_genres = [genre for genre in genres if genre != 'Any']
    genres_regex = transform_genre_to_regex(wanted_genres)
    top_n = top_n.loc[lambda df_: df_['genres'].str.contains(genres_regex)]

    top_n = (
        top_n
        # sort_values returns a new frame, so the result has to be kept;
        # movies without ratings (NaN score) end up last.
        .sort_values('overall_rating', ascending=False)
        .drop(columns=['rating_mean', 'rating_count', 'overall_rating'])
        .reset_index(drop=True)
        .head(n)
    )
    top_n.index = [f'movie-{rank}' for rank in range(1, len(top_n) + 1)]
    return top_n
# movie/item based
def item_n_movies(target_name, n, genres, time_period):
    """Recommend up to ``n`` movies rated similarly to ``target_name``.

    Parameters
    ----------
    target_name : str
        Movie title, matched case-insensitively against ``movie_df['title']``.
    n : int
        Maximum number of recommendations.
    genres : list of str
        Genres a movie must all have; the entry 'Any' is ignored.  The
        list is not modified.
    time_period : sequence of two ints
        Inclusive ``(first_year, last_year)`` range for the 'year' column.

    Returns
    -------
    pandas.DataFrame or pandas Styler
        An empty DataFrame when the title is unknown or the movie has no
        ratings; otherwise a styled frame indexed 'movie-1', 'movie-2', ...
        whose first column (labelled with the target movieId) holds the
        normalised similarity.
    """
    no_result = pd.DataFrame(columns=['movieId', 'title', 'genres', 'year'])

    # check the movie input
    matching_ids = movie_df.loc[
        lambda df_: df_['title'].str.lower() == target_name.lower(), 'movieId']
    if matching_ids.empty:
        return no_result
    # Several rows can share one title; take the first match instead of
    # calling int() on the whole Series.
    target_Id = int(matching_ids.iloc[0])

    movie_user_matrix = (
        rating_df
        .pivot_table(index='movieId', columns='userId', values='rating')
        .fillna(0)
    )
    if target_Id not in movie_user_matrix.index:
        # The movie exists but nobody rated it, so there is nothing to compare.
        return no_result

    # Only the target's row of the similarity matrix is needed, so compare
    # that single row against all movies instead of building the full matrix.
    target_similarities = pd.Series(
        cosine_similarity(movie_user_matrix.loc[[target_Id]], movie_user_matrix)[0],
        index=movie_user_matrix.index,
        name=target_Id,
    ).drop(target_Id)
    similarities = (
        (target_similarities / target_similarities.sum())
        .sort_values(ascending=False)
        .to_frame()
    )

    recommendations = similarities.merge(
        movie_df, how='left', left_index=True, right_on='movieId')
    rating_n = (
        rating_df
        .groupby(by='movieId')
        .agg(rating_count=('userId', 'count'))
        .reset_index()
    )
    # Match the rating counts on movieId; the two frames' row indexes are
    # unrelated, so an index-based join would pair the wrong rows.
    recommendations = recommendations.merge(rating_n, how='left', on='movieId')
    recommendations = recommendations.loc[lambda df_: df_['rating_count'] >= 3]
    recommendations = recommendations.loc[
        lambda df_: ((df_['year'] >= time_period[0])
                     & (df_['year'] <= time_period[1]))]

    # Build a filtered copy rather than removing 'Any' from the caller's list.
    wanted_genres = [genre for genre in genres if genre != 'Any']
    genres_regex = transform_genre_to_regex(wanted_genres)
    recommendations = recommendations.loc[
        lambda df_: df_['genres'].str.contains(genres_regex)]

    top_n = recommendations.head(n).copy()
    top_n.index = [f'movie-{rank}' for rank in range(1, len(top_n) + 1)]
    pretty_rec = top_n.style.pipe(make_pretty)
    return pretty_rec
# user based
def user_n_movies(user_id, n, genres, time_period):
    """Recommend up to ``n`` unseen movies for ``user_id``.

    The 100 users most similar to ``user_id`` (cosine similarity of their
    rating vectors) are selected, and each movie the user has not rated is
    scored by the similarity-weighted average of those users' ratings.

    Parameters
    ----------
    user_id : int
        A value of ``rating_df['userId']``.
    n : int
        Maximum number of recommendations.
    genres : list of str
        Genres a movie must all have; the entry 'Any' is ignored.  The
        list is not modified.
    time_period : sequence of two ints
        Inclusive ``(first_year, last_year)`` range for the 'year' column.

    Returns
    -------
    pandas.DataFrame or pandas Styler
        An empty DataFrame when ``user_id`` has no ratings; otherwise a
        styled frame indexed 'movie-1', 'movie-2', ...
    """
    # `x in series` tests the index labels, so compare against the values.
    if user_id not in rating_df["userId"].values:
        return pd.DataFrame(columns=['movieId', 'title', 'genres', 'year'])

    users_items = pd.pivot_table(data=rating_df,
                                 values='rating',
                                 index='userId',
                                 columns='movieId')
    users_items.fillna(0, inplace=True)
    user_similarities = pd.DataFrame(cosine_similarity(users_items),
                                     columns=users_items.index,
                                     index=users_items.index)
    other_users = user_similarities.query("userId!=@user_id")[user_id]
    weights = other_users / other_users.sum()

    # Keep the 100 most similar users plus the target user.
    new_userids = weights.sort_values(ascending=False).head(100).index.tolist()
    new_userids.append(user_id)
    new_ratings = rating_df.loc[lambda df_: df_['userId'].isin(new_userids)]
    new_users_items = pd.pivot_table(data=new_ratings,
                                     values='rating',
                                     index='userId',
                                     columns='movieId')
    new_users_items.fillna(0, inplace=True)
    new_user_similarities = pd.DataFrame(cosine_similarity(new_users_items),
                                         columns=new_users_items.index,
                                         index=new_users_items.index)
    neighbours = new_user_similarities.query("userId!=@user_id")[user_id]
    new_weights = neighbours / neighbours.sum()

    # Rows: every selected user except the target.
    # Columns: movies the target user has no rating for (filled with 0).
    not_watched_movies = new_users_items.loc[
        new_users_items.index != user_id,
        new_users_items.loc[user_id, :] == 0]
    weighted_averages = (
        not_watched_movies.T.dot(new_weights).to_frame("predicted_rating"))
    recommendations = (
        weighted_averages
        .merge(movie_df, left_index=True, right_on="movieId")
        .sort_values("predicted_rating", ascending=False)
    )
    recommendations = recommendations.loc[
        lambda df_: ((df_['year'] >= time_period[0])
                     & (df_['year'] <= time_period[1]))]

    # Build a filtered copy rather than removing 'Any' from the caller's list.
    wanted_genres = [genre for genre in genres if genre != 'Any']
    genres_regex = transform_genre_to_regex(wanted_genres)
    recommendations = recommendations.loc[
        lambda df_: df_['genres'].str.contains(genres_regex)]

    top_n = (
        recommendations
        .reset_index(drop=True)
        .drop(columns=['predicted_rating'])
        .head(n)
    )
    top_n.index = [f'movie-{rank}' for rank in range(1, len(top_n) + 1)]
    pretty_rec = top_n.style.pipe(make_pretty)
    return pretty_rec
# %% STREAMLIT
# Set configuration
# Page-wide settings: title, icon, wide layout and an expanded sidebar.
st.set_page_config(
    layout="wide",
    initial_sidebar_state="expanded",
    page_icon="🎬",
    page_title="WBSFLIX",
)
# Colors have to be set in the settings menu online:
#   primary color: #FF4B4B, background color: #0E1117
#   text color: #FAFAFA, secondary background color: #E50914

# App logo at the top of the sidebar
st.sidebar.image("wbs_logo.png", clamp=True, width=300)

# Welcome picture in the main area
welcome_img = Image.open('welcome_page_img01.png')
st.image(welcome_img)

st.sidebar.markdown("""
# 🎬 Welcome to the next generation movie recommendation app
""")

# %% APP WORKFLOW
st.sidebar.markdown("""
### How may we help you?
""")
# Popularity based recommender system
genre_default = None
pop_based_rec = st.sidebar.checkbox(
    "Show me the all time favourites",
    value=False,
    help="Movies that are liked by many people",
)

if pop_based_rec:
    st.markdown("### Select the Genre and the Number of recommendations")
    genre_default, n_default = None, 5

    # The widgets only report their values once the form is submitted.
    with st.form(key="pop_form"):
        genre_default = ['Any']
        genre = st.multiselect(
            "Genre",
            default=genre_default,
            options=genre_list,
            help="Select the genre of the movie you would like to watch",
        )
        nr_rec = st.slider(
            "Number of recommendations",
            key="n",
            min_value=1,
            max_value=20,
            value=5,
            step=1,
            help="How many movie recommendations would you like to receive?",
        )
        time_period = st.slider(
            'years:',
            min_value=1900,
            max_value=2018,
            value=(2010, 2018),
            step=1,
        )
        submit_button_pop = st.form_submit_button(label="Submit")

    if submit_button_pop:
        popular_movie_recs = popular_top_n(nr_rec, genre, time_period)
        st.table(popular_movie_recs)
        # Show a cover picture and an IMDb link for every recommended movie.
        for index, movie in add_image_link(popular_movie_recs.reset_index(drop=True)).iterrows():
            st.image(movie['cover_pic'], width=300)
            st.write(f"[imdb link for: {movie['title']}]({movie['imdb_link']})")

# empty lines as vertical spacing before the next option
st.write("")
st.write("")
st.write("")
item_based_rec = st.sidebar.checkbox("Show me a movie like this",
                                     False,
                                     help="Input some movies and we will show you similar ones")
if item_based_rec:
    st.markdown("### Tell us a movie you like:")
    with st.form(key="movie_form"):
        movie_name = st.multiselect(label="Movie name",
                                    options=pd.Series(movie_list),
                                    help="Select a movie you like",
                                    key='item_select',
                                    default='Toy Story 2 (1999)')
        genre_default = ['Any']
        genre = st.multiselect(
            "Genre",
            options=genre_list,
            help="Select the genre of the movie you would like to watch",
            default=genre_default)
        nr_rec = st.slider("Number of recommendations",
                           min_value=1,
                           max_value=20,
                           value=5,
                           step=1,
                           key="nr_rec_movie",
                           help="How many movie recommendations would you like to receive?",
                           )
        time_period = st.slider('years:', min_value=1900,
                                max_value=2018,
                                value=(2010, 2018),
                                step=1)
        submit_button_movie = st.form_submit_button(label="Submit")
    if submit_button_movie:
        if not movie_name:
            # The multiselect can be cleared completely; without this guard
            # movie_name[0] below would raise IndexError.
            st.write("Please select a movie first.")
        else:
            # Only the first selected movie is used as the reference.
            st.write('Because you like {}:'.format(movie_name[0]))
            item_movie_recs = item_n_movies(movie_name[0], nr_rec, genre, time_period)
            st.table(item_movie_recs)
# to put some space in between options
st.write("")
st.write("")
st.write("")
user_based_rec = st.sidebar.checkbox(
    "I want to get personalized recommendations",
    value=False,
    help="Login to get personalized recommendations",
)

if user_based_rec:
    st.markdown("### Please login to get customized recommendations just for you")
    genre_default, n_default = None, 5

    # The widgets only report their values once the form is submitted.
    with st.form(key="user_form"):
        user_id = st.number_input(
            "Please enter your user id",
            min_value=1,
            step=1,
        )
        genre_default = ['Any']
        # No default is passed here, so the genre selection starts empty.
        genre = st.multiselect(
            "Genre",
            options=genre_list,
            help="Select the genre of the movie you would like to watch",
        )
        nr_rec = st.slider(
            "Number of recommendations",
            key="nr_rec",
            min_value=1,
            max_value=20,
            value=5,
            step=1,
            help="How many movie recommendations would you like to receive?",
        )
        time_period = st.slider(
            'years:',
            min_value=1900,
            max_value=2018,
            value=(2010, 2018),
            step=1,
        )
        submit_button_user = st.form_submit_button(label="Submit")

    if submit_button_user:
        user_movie_recs = user_n_movies(user_id, nr_rec, genre, time_period)
        st.table(user_movie_recs)