Spaces:

nickmuchi
/

Netflix-Semantic-Search-Whisperer

Runtime error

App Files Files Community

nickmuchi committed on Aug 29, 2022

Commit

fd6baf3

•

1 Parent(s): 6cb6a9b

Create new file

Browse files

Files changed (1) hide show

app.py +109 -0

app.py ADDED Viewed

	@@ -0,0 +1,109 @@

+import gradio as gr
+import pandas as pd
+import torch
+from datasets import load_dataset
+from IPython.display import display
+from sentence_transformers import SentenceTransformer, util, CrossEncoder
+#Get the netflix dataset
+netflix = load_dataset('hugginglearners/netflix-shows',use_auth_token=True)
+#Filter for relevant columns and convert to pandas
+netflix_df = netflix['train'].to_pandas()
+netflix_df = netflix_df[['type','title','country','cast','release_year','rating','duration','listed_in','description']]
+#load mpnet model
+model = SentenceTransformer('all-mpnet-base-v2')
+#load embeddings
+flix_ds = load_dataset("nickmuchi/netflix-shows-mpnet-embeddings", use_auth_token=True)
+dataset_embeddings = torch.from_numpy(flix_ds["train"].to_pandas().to_numpy()).to(torch.float)
+#load cross-encoder for reranking
+cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-12-v2')
+#function for generating similarity of query and netflix shows
+def semantic_search(query,embeddings,top_k=top_k):
+    '''Encode query and check similarity with embeddings'''
+    question_embedding = model.encode(query, convert_to_tensor=True).cpu()
+    hits = util.semantic_search(question_embedding, embeddings, top_k=top_k)
+    hits = hits[0]
+    ##### Re-Ranking #####
+    # Now, score all retrieved passages with the cross_encoder
+    cross_inp = [[query, netflix_df['description'].iloc[hit['corpus_id']]] for hit in hits]
+    cross_scores = cross_encoder.predict(cross_inp)
+    # Sort results by the cross-encoder scores
+    for idx in range(len(cross_scores)):
+        hits[idx]['cross-score'] = cross_scores[idx]
+    #Bi-encoder df
+    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
+    bi_df = display_df_as_table(hits,top_k)
+    #Cross encoder df
+    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
+    cross_df = display_df_as_table(hits,top_k,'cross-score')
+    return bi_df, cross_df
+# Page heading (HTML); its id is the target of the `h1#title` rule in `css` below.
+title = """<h1 id="title">Netflix Shows Semantic Search</h1>"""
+# Introductory Markdown rendered under the heading via gr.Markdown.
+description = """
+Semantic Search is a way to generate search results based on the actual meaning of the query instead of a standard keyword search. I believe this way of searching provides more meaning results when trying to find a good show to watch on Netflix. For example, one could search for "Success, rags to riches story" as provided in the example below to generate shows or movies with a description that is semantically similar to the query.
+- The App generates embeddings using [All-Mpnet-Base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) model from Sentence Transformers.
+- The model encodes the query and the discerption field from the [Netflix-Shows](https://huggingface.co/datasets/hugginglearners/netflix-shows) dataset which contains 8800 shows and movies currently on Netflix scraped from the web using Selenium.
+- Similarity scores are then generated, from highest to lowest. The user can select how many suggestions they need from the results.
+- A Cross Encoder then re-ranks the top selections to further improve on the similarity scores.
+- You will see 2 tables generated, one from the bi-encoder and the other from the cross encoder which further enhances the similarity score rankings
+Enjoy and Search like you mean it!!
+"""
+# Sample queries offered to the user through gr.Examples.
+example_queries = ["Success, rags to riches","murder, crime scene investigation thriller"]
+# Markdown for a Twitter follow badge that links to twitter.com/nickmuchi.
+twitter_link = """
+[![](https://img.shields.io/twitter/follow/nickmuchi?label=@nickmuchi&style=social)](https://twitter.com/nickmuchi)
+"""
+# Stylesheet handed to gr.Blocks; centres the page heading.
+css = '''
+h1#title {
+  text-align: center;
+}
+'''
+demo = gr.Blocks(css=css)
+with demo:
+    gr.Markdown(title)
+    gr.Markdown(description)
+    gr.Markdown(twitter_link)
+    slider_input = gr.Slider(minimum=3,maximum=10,value=5,step=1,label='Number of Suggestions to Generate')
+    with gr.Row():
+        query = gr.Textbox(lines=3,label='Describe the Netflix show or movie you would like to watch..')
+    with gr.Row():
+        gr.Markdown(f'''Top-{slider_input} Bi-Encoder Retrieval hits''')
+        bi_output = gr.DataFrame(headers=['Similarity Score','Type','Title','Country','Cast','Release Year','Rating','Duration','Category Listing','Description'])
+    with gr.Row():
+        gr.Markdown(f'''Top-{slider_input} Cross-Encoder Re-ranker hits''')
+        cross_output = gr.DataFrame(headers=['Similarity Score','Type','Title','Country','Cast','Release Year','Rating','Duration','Category Listing','Description'])
+    with gr.Row():
+        example_url = gr.Examples(examples=example_queries,inputs=[query])
+    sem_but = gr.Button('Search')
+    sem_but.click(semantic_search,inputs=[query,dataset_embeddings,img_input,slider_input],outputs=[bi_output,cross_output],queue=True)
+    gr.Markdown("![visitor badge](https://visitor-badge.glitch.me/badge?page_id=nickmuchi-netflix-shows-semantic-search)")
+demo.launch(debug=True,enable_queue=True)