Abdul-Ib committed on
Commit
050fb16
1 Parent(s): 67fdab2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -2
app.py CHANGED
@@ -1,8 +1,46 @@
1
  import pandas as pd
2
  import gradio
 
 
 
3
 
4
- def predict():
5
- return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  app = gr.Interface(
8
  fn = predict,
 
1
import pprint

import gradio
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer, CrossEncoder, util

from clean_data import text_normalizer
6
 
7
# Read data.
# The CSV is converted to a list of per-row dicts so that a search hit's
# 'corpus_id' can be used directly as an index into `df`.
df = pd.read_csv('./assets/final_combined.csv').to_dict(orient='records')
# Precomputed document embeddings that the query embedding is compared against.
# NOTE(review): allow_pickle=True lets numpy unpickle arbitrary objects from
# the file; keep this only while the .npy asset is a trusted, locally built file.
doc_embeddings = np.load('./assets/final_combined_embed.npy', allow_pickle=True)
10
+
11
def semantic_search(normalized_query):
    '''
    Perform a semantic search for a single search query.

    The query is encoded with the module-level `bi_encoder` and compared
    against the precomputed `doc_embeddings` using `util.semantic_search`.

    Args:
        normalized_query: the query text to embed.

    Returns:
        The hit list for this query (at most 50 entries). Each hit is a dict
        that carries at least a 'corpus_id' key identifying the matched
        document.
    '''
    # NOTE(review): `bi_encoder` is not defined in the visible part of the
    # file; it must be created at module level before this function runs.
    # Encode the parameter itself: the earlier code referenced an undefined
    # name `query` here.
    query_embedding = bi_encoder.encode(normalized_query)
    hits = util.semantic_search(query_embedding, doc_embeddings, top_k=50)
    # util.semantic_search returns one hit list per query; only one was sent.
    return hits[0]
18
+
19
def re_ranker(normalized_query, hits):
    '''
    Re-rank semantic search results using cross encoding.

    Every hit is scored against the query with the module-level
    `cross_encoder`; the score is stored on the hit under 'cross-score'
    and the hits are returned ordered from highest to lowest score.

    Args:
        normalized_query: the query text the hits were retrieved for.
        hits: list of hit dicts, each with a 'corpus_id' key. The dicts are
            updated in place with a 'cross-score' entry.

    Returns:
        A new list containing the same hit dicts, sorted by 'cross-score'
        in descending order. An empty `hits` yields an empty list.
    '''
    # Nothing to score: skip the model call entirely.
    if not hits:
        return []

    # NOTE(review): `cross_encoder` is not defined in the visible part of the
    # file. Also, the second element of each pair is the stored embedding of
    # the document; cross encoders normally score (query, document text)
    # pairs, so this probably should use the document's text column instead.
    # Confirm which field of `df` holds that text.
    cross_inp = [
        [normalized_query, doc_embeddings[hit['corpus_id']]] for hit in hits
    ]
    cross_scores = cross_encoder.predict(cross_inp)

    # Scores come back in the same order as the input pairs.
    for hit, score in zip(hits, cross_scores):
        hit['cross-score'] = score
    return sorted(hits, key=lambda hit: hit['cross-score'], reverse=True)
29
+
30
+
31
def print_results(hits, k_items):
    '''
    Format the top search results as a single printable string.

    Args:
        hits: list of hit dicts, each with a 'corpus_id' key that indexes
            into the module-level `df` list of records.
        k_items: maximum number of hits (taken from the front of `hits`)
            to include.

    Returns:
        The pretty-printed records of the first `k_items` hits, one record
        after another separated by a newline; an empty string when there
        is nothing to show.
    '''
    # Join with a newline so consecutive records do not run into each other
    # (the earlier version concatenated them with no separator at all).
    return "\n".join(
        pprint.pformat(df[hit['corpus_id']], indent=4)
        for hit in hits[:k_items]
    )
36
+
37
def predict(query):
    '''
    Run the full search pipeline for a raw user query.

    The query is normalized with `text_normalizer`, candidates are fetched
    with `semantic_search`, re-ordered with `re_ranker`, and the best ten
    are formatted with `print_results`.

    Args:
        query: the raw query text entered by the user.

    Returns:
        The formatted string produced by `print_results` for the top 10
        re-ranked hits.
    '''
    normalized_query = text_normalizer(query)

    bi_hits = semantic_search(normalized_query)
    # re_ranker needs the query as well as the hits; the earlier call passed
    # only the hits and therefore failed with a missing-argument error.
    reranked_hits = re_ranker(normalized_query, bi_hits)

    return print_results(reranked_hits, k_items=10)
44
 
45
  app = gr.Interface(
46
  fn = predict,