Spaces:
Running
Running
Upload 3 files
Browse files- app.py +223 -0
- detailed_movies_top_250_embeds.pkl.xz +3 -0
- requirements.txt +6 -0
app.py
ADDED
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pandas import read_pickle
|
2 |
+
import streamlit as st
|
3 |
+
from sentence_transformers import SentenceTransformer
|
4 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
5 |
+
from streamlit_extras.add_vertical_space import add_vertical_space
|
6 |
+
from streamlit_extras.colored_header import colored_header
|
7 |
+
from streamlit_option_menu import option_menu
|
8 |
+
|
9 |
+
# --- Embedding model settings -------------------------------------------
# Model identifier handed to SentenceTransformer in load_data_model().
repo_id = "all-MiniLM-L6-v2"
# Value assigned to the model's ``max_seq_length`` attribute after loading.
max_seq_length = 256

# --- Dataset settings ---------------------------------------------------
# Compressed pickle read with pandas.read_pickle in load_data_model().
data_path = "detailed_movies_top_250_embeds.pkl.xz"

# Columns pulled for every result row. The order matters: grid_maker()
# unpacks the values positionally in exactly this sequence.
output_column_names = [
    "year", "duration", "genre", "stars",
    "summary", "poster_url", "trailer_url",
]
|
21 |
+
|
22 |
+
# Page chrome: wide layout, the title banner, and CSS that hides Streamlit's
# default main menu and footer.
st.set_page_config(layout="wide")

colored_header(
    label="SEARCH ENGINE&MOVIE RECOMMENDER: IMDB TOP 250 MOVIES",
    description=(
        "Discover the best movies from the IMDB Top 250 list with advanced semantic search engine and movie recommender.\n"
        "Simply enter a keyword, phrase, or even plot.\n"
        "It provides you with a personalized selection of top-rated films!"
    ),
    color_name="blue-70",
)

# Raw CSS injected through st.markdown; needs unsafe_allow_html to be applied.
hide_streamlit_style = (
    "\n"
    "<style>\n"
    "#MainMenu {visibility: hidden;}\n"
    "footer {visibility: hidden;}\n"
    "</style>\n"
)
st.markdown(hide_streamlit_style, unsafe_allow_html=True)
|
39 |
+
|
40 |
+
|
41 |
+
@st.cache(suppress_st_warning=True, allow_output_mutation=True)
def load_data_model():
    """Load the movie dataframe and the sentence-embedding model.

    The dataframe is read from the pickle at ``data_path``; the encoder is
    built from ``repo_id`` and its ``max_seq_length`` attribute is set from
    the module-level ``max_seq_length``.

    Returns:
        A ``(dataframe, embedding_model)`` tuple.
    """
    # NOTE(review): newer Streamlit releases deprecate st.cache in favour of
    # st.cache_data / st.cache_resource -- confirm the deployed version
    # before migrating the decorator.
    movies = read_pickle(data_path)

    encoder = SentenceTransformer(repo_id)
    encoder.max_seq_length = max_seq_length

    return movies, encoder
|
54 |
+
|
55 |
+
|
56 |
+
def top_n_retriever(titles: list[str], similarity_scores: object, n: int, query_type: str) -> list[str]:
    """Rank titles by similarity score and return the best ``n`` of them.

    Args:
        titles: Movie titles, aligned position-by-position with
            ``similarity_scores``.
        similarity_scores: Iterable of scores, one per title.
        n: Number of titles to return.
        query_type: ``"Search Engine"`` returns ranks ``0..n-1``;
            ``"Similar Movies"`` skips rank 0 and returns ranks ``1..n``.
            Any other value leaves the ranked list untruncated.

    Returns:
        Titles ordered from highest to lowest score. Titles with equal
        scores keep their input order because the sort is stable.
    """
    ranked = list(zip(titles, similarity_scores))
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    if query_type == "Search Engine":
        window = slice(None, n)
    elif query_type == "Similar Movies":
        # Rank 0 is dropped; callers use this mode when the query movie is
        # itself one of the scored candidates.
        window = slice(1, n + 1)
    else:
        window = slice(None)

    return [title for title, _ in ranked[window]]
|
82 |
+
|
83 |
+
|
84 |
+
def grid_maker(movie_recs: list[str], df: object):
    """Render one poster/details row per recommended movie.

    For every title a narrow column shows the poster with a trailer button
    and a wide column shows the title line, the stars and the summary.

    Args:
        movie_recs: Movie titles to display, in display order.
        df: Dataframe with a ``title`` column and every column listed in
            ``output_column_names``.

    Raises:
        IndexError: If a title in ``movie_recs`` has no row in ``df``.
    """
    import html  # local import: only needed to escape the trailer link

    for movie in movie_recs:
        poster_col, title_col = st.columns([1, 8])

        # Take the first matching row so a duplicated title cannot break the
        # positional unpacking below (it expects exactly seven values).
        details = df.loc[df.title == movie, output_column_names].iloc[0]
        (year, duration, genre, stars, summary, poster_url, trailer_url) = details

        poster_col.image(poster_url)
        # Quote and escape the URL: an unquoted attribute value is invalid
        # HTML as soon as the link contains "=", "&" or whitespace.
        trailer_href = html.escape(str(trailer_url), quote=True)
        poster_col.markdown(
            f'<a href="{trailer_href}"><button style="background-color:GreenYellow;">🎥Trailer</button></a>',
            unsafe_allow_html=True,
        )

        title_col.markdown(
            f""" #### **:blue[{movie}]** | {year} | {duration} | {genre} """
        )
        title_col.markdown(
            f""" <span style="background-color:rgba(0, 0, 0, 0.1);">{stars}</span>
<span style="word-wrap:break-word;font-family:roboto;font-weight: 700;">
<br>{summary}</span>""",
            unsafe_allow_html=True,
        )
|
114 |
+
|
115 |
+
|
116 |
+
def filter_df(df: object, selected_page: str):
    """Draw the filter widgets and return the user's choices.

    Renders a genre selectbox, a "Number of Recommendations" slider and,
    depending on the page, either a movie selectbox or a free-text query.

    Args:
        df: Movie dataframe; its ``genre`` column holds ", "-separated
            genre names.
        selected_page: ``"Similar Movies"`` or ``"Search Engine"``.

    Returns:
        ``(selected_movie_or_query, filtered_df, top_n)``. ``filtered_df``
        is a copy of ``df`` when the genre is "All", otherwise only the
        rows whose genre list contains the selected genre.

    Raises:
        ValueError: If ``selected_page`` is not one of the two known pages.
    """
    text_input, genre_box, top_n_rec = st.columns([3, 1, 2])
    with genre_box:
        selected_genre = st.selectbox("Genre", genres_list)
    with top_n_rec:
        top_n = st.slider("Number of Recommendations", 1, 15, 5)

    if selected_genre == "All":
        filtered_df = df.copy()
    else:
        # Match whole genre names. A substring/regex test would also return
        # "Musical" films when "Music" is selected.
        has_genre = df.genre.str.split(", ").apply(
            lambda genres: isinstance(genres, list) and selected_genre in genres
        )
        filtered_df = df[has_genre]

    if selected_page == "Similar Movies":
        with text_input:
            selected_movie = st.selectbox("Movie", movie_list)
        return selected_movie, filtered_df, top_n

    if selected_page == "Search Engine":
        with text_input:
            query = st.text_input("Query", value="Mafia")
        return query, filtered_df, top_n

    # Fail loudly instead of returning None, which callers would only hit
    # later as an opaque unpacking error.
    raise ValueError(f"Unknown page: {selected_page!r}")
|
147 |
+
|
148 |
+
|
149 |
+
def get_results_button():
    """Place a "Get Results ◀" button in the middle of five equal columns.

    Returns:
        Whatever ``button`` returns for that column; callers in this file
        use it as a truthy "was clicked" flag.
    """
    middle_col = st.columns(5)[2]
    return middle_col.button("Get Results ◀")
|
158 |
+
|
159 |
+
|
160 |
+
# Load the cached dataframe/model pair and derive the widget option lists.
df, embed_model = load_data_model()
# Trailer links are interpolated into HTML by grid_maker, so force them to str.
df["trailer_url"] = df["trailer_url"].astype(str)
movie_list = df["title"].values

# Every distinct genre name, sorted so the dropdown order is the same on
# every start (iterating a bare set of strings is not). explode() flattens
# the per-movie lists without the quadratic list concatenation of .sum().
genres_list = sorted(set(df["genre"].str.split(", ").explode().dropna()))
genres_list.insert(0, "All")
|
165 |
+
|
166 |
+
|
167 |
+
# CSS overrides for the horizontal page switcher, keyed by menu element.
_menu_styles = {
    "container": {"padding": "0!important", "background-color": "#fafafa"},
    "icon": {"color": "orange", "font-size": "25px"},
    "nav-link": {
        "font-size": "25px",
        "text-align": "left",
        "margin": "0px",
        "--hover-color": "#eee",
    },
    "nav-link-selected": {"background-color": "#0068C9"},
}

# Page switcher; the returned option label drives the two page branches below.
selected_page = option_menu(
    menu_title=None,
    options=["Search Engine", "Similar Movies"],
    icons=["search", "film"],
    menu_icon="cast",
    default_index=0,
    orientation="horizontal",
    styles=_menu_styles,
)
|
186 |
+
|
187 |
+
# "Search Engine" page: rank movies by similarity between the free-text
# query embedding and each movie's stored embedding.
if selected_page == "Search Engine":

    query, genre_df, top_n = filter_df(df, selected_page)

    bt = get_results_button()

    if bt:
        if not query.strip():
            st.warning("You should type something", icon="⚠️")
        else:
            # Encode only when a search is actually requested; doing it at
            # the top of the branch ran the model on every script rerun.
            query_embed = embed_model.encode(query)
            # One pairwise call yields a (1, n_movies) matrix; row 0 holds
            # the query's score against every candidate movie.
            semantic_sims = cosine_similarity(
                [query_embed], list(genre_df.embedding)
            )[0]
            movie_recs = top_n_retriever(
                genre_df.title, semantic_sims, top_n, selected_page
            )
            add_vertical_space(2)
            grid_maker(movie_recs, genre_df)
|
207 |
+
|
208 |
+
|
209 |
+
# "Similar Movies" page: rank movies by similarity between the selected
# movie's stored embedding and every other candidate's embedding.
if selected_page == "Similar Movies":
    st.info("Movies are recommended based on plot similarity!")
    selected_movie, genre_df, top_n = filter_df(df, selected_page)

    bt = get_results_button()
    if bt:
        # Looked up once, from the unfiltered frame, because the genre
        # filter may have removed the selected movie from genre_df.
        selected_embed = list(df.embedding[df.title == selected_movie])

        # Drop the selected movie by title rather than assuming it is the
        # top-ranked hit: when the genre filter excludes it, skipping the
        # first result would discard the best real recommendation.
        candidates = genre_df[genre_df.title != selected_movie]

        if candidates.empty:
            st.warning("No other movies found for the selected genre", icon="⚠️")
        else:
            # Row 0 of the pairwise matrix: selected movie vs. each candidate.
            movie_sims = cosine_similarity(
                selected_embed, list(candidates.embedding)
            )[0]
            # The query movie is already excluded, so ask for the plain
            # top-n slice ("Search Engine" mode) instead of skip-first.
            movie_recs = top_n_retriever(
                candidates.title, movie_sims, top_n, "Search Engine"
            )
            add_vertical_space(2)
            grid_maker(movie_recs, candidates)
|
detailed_movies_top_250_embeds.pkl.xz
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ed2c2da07b2a8a8f28c3b0b7969da829d6f837251729c8a284327431b2ba11db
|
3 |
+
size 434052
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
--find-links https://download.pytorch.org/whl/torch_stable.html
|
2 |
+
torch==1.13.1+cpu
|
3 |
+
sentence-transformers
|
4 |
+
pandas
|
5 |
+
streamlit-option-menu
|
6 |
+
streamlit-extras
|