nickmuchi commited on
Commit
fd6baf3
1 Parent(s): 6cb6a9b

Create new file

Browse files
Files changed (1) hide show
  1. app.py +109 -0
app.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import pandas as pd
import torch
from datasets import load_dataset
from IPython.display import display
from sentence_transformers import SentenceTransformer, util, CrossEncoder
5
+
6
# Netflix catalogue: fetch it from the Hub, move the train split into pandas
# and keep only the columns the app works with.
netflix = load_dataset('hugginglearners/netflix-shows', use_auth_token=True)
netflix_df = netflix['train'].to_pandas()
netflix_df = netflix_df[
    [
        'type',
        'title',
        'country',
        'cast',
        'release_year',
        'rating',
        'duration',
        'listed_in',
        'description',
    ]
]

# Bi-encoder (mpnet) used to embed the user's query at search time.
model = SentenceTransformer('all-mpnet-base-v2')

# Stored embeddings, converted to a float tensor.
# NOTE(review): presumably one row per catalogue entry, in the same order as
# netflix_df (semantic_search indexes netflix_df by the hit's corpus_id) --
# confirm against how the embeddings dataset was built.
flix_ds = load_dataset("nickmuchi/netflix-shows-mpnet-embeddings", use_auth_token=True)
dataset_embeddings = torch.from_numpy(
    flix_ds["train"].to_pandas().to_numpy()
).to(torch.float)

# Cross-encoder used to re-rank the bi-encoder hits.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
22
+
23
+ #function for generating similarity of query and netflix shows
24
# Display names for the catalogue columns; they match the headers declared on
# the two result tables in the UI.
_DISPLAY_NAMES = {
    'type': 'Type',
    'title': 'Title',
    'country': 'Country',
    'cast': 'Cast',
    'release_year': 'Release Year',
    'rating': 'Rating',
    'duration': 'Duration',
    'listed_in': 'Category Listing',
    'description': 'Description',
}


def display_df_as_table(hits, top_k, score='score', catalog=None):
    """Build the result table for the first ``top_k`` hits.

    Args:
        hits: Hit dicts, already sorted; each carries ``'corpus_id'`` (row
            position in the catalogue) and the key named by ``score``.
        top_k: Maximum number of rows to include.
        score: Key of the hit dict to show as the similarity score.
        catalog: Frame to look the shows up in; defaults to the module-level
            ``netflix_df``.

    Returns:
        A new ``pandas.DataFrame`` with a leading ``'Similarity Score'``
        column (rounded to 2 decimals) followed by the catalogue columns,
        renamed to their display names.
    """
    shows = netflix_df if catalog is None else catalog
    leading = hits[:int(top_k)]

    table = shows.iloc[[hit['corpus_id'] for hit in leading]].reset_index(drop=True)
    table = table.rename(columns=_DISPLAY_NAMES)
    # float() turns numpy scalars (cross-encoder output) into plain floats.
    table.insert(0, 'Similarity Score', [round(float(hit[score]), 2) for hit in leading])
    return table


def semantic_search(query, embeddings, top_k=5):
    """Retrieve the shows closest to ``query`` and re-rank them.

    The query is embedded with the bi-encoder ``model`` and compared against
    ``embeddings``; the retrieved hits are then scored again by
    ``cross_encoder`` on (query, description) pairs.

    Args:
        query: Free-text description of what the user wants to watch.
        embeddings: Tensor of corpus embeddings to search.
        top_k: Number of suggestions to return. Coerced to ``int`` because a
            UI slider may deliver the value as a float.

    Returns:
        ``(bi_df, cross_df)``: the hits ordered by bi-encoder score and the
        same hits ordered by cross-encoder score, as display tables.
    """
    top_k = int(top_k)

    # Nothing meaningful to search for: hand back two empty tables.
    if query is None or not str(query).strip():
        return display_df_as_table([], top_k), display_df_as_table([], top_k)

    question_embedding = model.encode(query, convert_to_tensor=True).cpu()
    hits = util.semantic_search(question_embedding, embeddings, top_k=top_k)[0]
    if not hits:
        return display_df_as_table([], top_k), display_df_as_table([], top_k)

    ##### Re-Ranking #####
    # Score every retrieved description against the query with the cross-encoder.
    descriptions = netflix_df['description']
    cross_inp = [[query, descriptions.iloc[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Bi-encoder ranking.
    by_bi_score = sorted(hits, key=lambda hit: hit['score'], reverse=True)
    bi_df = display_df_as_table(by_bi_score, top_k)

    # Cross-encoder ranking.
    by_cross_score = sorted(hits, key=lambda hit: hit['cross-score'], reverse=True)
    cross_df = display_df_as_table(by_cross_score, top_k, 'cross-score')

    return bi_df, cross_df
49
+
50
+
51
# Page heading; the id is what the stylesheet below targets.
title = """<h1 id="title">Netflix Shows Semantic Search</h1>"""

# Introductory markdown shown under the heading.
description = """
Semantic Search is a way to generate search results based on the actual meaning of the query instead of a standard keyword search. I believe this way of searching provides more meaningful results when trying to find a good show to watch on Netflix. For example, one could search for "Success, rags to riches story" as provided in the example below to generate shows or movies with a description that is semantically similar to the query.

- The App generates embeddings using [All-Mpnet-Base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) model from Sentence Transformers.
- The model encodes the query and the description field from the [Netflix-Shows](https://huggingface.co/datasets/hugginglearners/netflix-shows) dataset which contains 8800 shows and movies currently on Netflix scraped from the web using Selenium.
- Similarity scores are then generated, from highest to lowest. The user can select how many suggestions they need from the results.
- A Cross Encoder then re-ranks the top selections to further improve on the similarity scores.
- You will see 2 tables generated, one from the bi-encoder and the other from the cross encoder which further enhances the similarity score rankings

Enjoy and Search like you mean it!!
"""

# Sample queries offered as one-click examples.
example_queries = ["Success, rags to riches", "murder, crime scene investigation thriller"]

# Markdown badge linking to the author's Twitter profile.
twitter_link = """
[![](https://img.shields.io/twitter/follow/nickmuchi?label=@nickmuchi&style=social)](https://twitter.com/nickmuchi)
"""

# Stylesheet handed to gr.Blocks: centres the page heading.
css = '''
h1#title {
  text-align: center;
}
'''
75
+
76
# Column headers shared by the bi-encoder and cross-encoder result tables.
_RESULT_HEADERS = ['Similarity Score', 'Type', 'Title', 'Country', 'Cast', 'Release Year',
                   'Rating', 'Duration', 'Category Listing', 'Description']

# Initial slider position; also used for the headings before the first search.
_DEFAULT_SUGGESTIONS = 5


def _bi_heading(count):
    """Return the heading text for the bi-encoder table."""
    return f'''Top-{count} Bi-Encoder Retrieval hits'''


def _cross_heading(count):
    """Return the heading text for the cross-encoder table."""
    return f'''Top-{count} Cross-Encoder Re-ranker hits'''


def _run_search(query_text, num_suggestions):
    """Click handler: search the stored embeddings and refresh both tables.

    Event inputs must be UI components, so the embeddings tensor is taken
    from module scope here instead of being listed in ``inputs=``.

    Returns:
        Bi-encoder heading, bi-encoder table, cross-encoder heading and
        cross-encoder table, in the order of the ``outputs=`` list.
    """
    count = int(num_suggestions)
    bi_df, cross_df = semantic_search(query_text, dataset_embeddings, count)
    return _bi_heading(count), bi_df, _cross_heading(count), cross_df


demo = gr.Blocks(css=css)

with demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.Markdown(twitter_link)

    slider_input = gr.Slider(minimum=3, maximum=10, value=_DEFAULT_SUGGESTIONS, step=1,
                             label='Number of Suggestions to Generate')

    with gr.Row():
        query = gr.Textbox(lines=3, label='Describe the Netflix show or movie you would like to watch..')

    with gr.Row():
        # Headings are plain text at build time; the click handler rewrites
        # them with the number of suggestions actually requested.
        bi_heading = gr.Markdown(_bi_heading(_DEFAULT_SUGGESTIONS))
        bi_output = gr.DataFrame(headers=_RESULT_HEADERS)

    with gr.Row():
        cross_heading = gr.Markdown(_cross_heading(_DEFAULT_SUGGESTIONS))
        cross_output = gr.DataFrame(headers=_RESULT_HEADERS)

    with gr.Row():
        example_url = gr.Examples(examples=example_queries, inputs=[query])

    sem_but = gr.Button('Search')

    sem_but.click(
        _run_search,
        inputs=[query, slider_input],
        outputs=[bi_heading, bi_output, cross_heading, cross_output],
        queue=True,
    )

    gr.Markdown("![visitor badge](https://visitor-badge.glitch.me/badge?page_id=nickmuchi-netflix-shows-semantic-search)")

# NOTE(review): `enable_queue` is not accepted by launch() in newer Gradio
# releases -- confirm against the Gradio version this app is pinned to.
demo.launch(debug=True, enable_queue=True)