Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -5,9 +5,14 @@ import pprint
|
|
5 |
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
6 |
|
7 |
# read data
|
8 |
-
df = pd.read_csv('./assets/final_combined.csv')
|
|
|
9 |
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can execute arbitrary code -- only load trusted files.
doc_embeddings = np.load('./assets/final_combined_embed.npy', allow_pickle=True)
|
10 |
|
|
|
|
|
|
|
|
|
11 |
def semantic_search(normalized_query):
|
12 |
'''
|
13 |
function to perform semantic search given a search query
|
@@ -20,7 +25,7 @@ def re_ranker(normalized_query, hits):
|
|
20 |
'''
|
21 |
function to re-rank semantic search results using cross encoding
|
22 |
'''
|
23 |
-
cross_inp = [[query,
|
24 |
cross_scores = cross_encoder.predict(cross_inp)
|
25 |
|
26 |
for idx in range(len(cross_scores)):
|
@@ -31,7 +36,7 @@ def re_ranker(normalized_query, hits):
|
|
31 |
def print_results(hits, k_items):
|
32 |
results = ""
|
33 |
for hit in hits[:k_items]:
|
34 |
-
results += pprint.pformat(
|
35 |
return results
|
36 |
|
37 |
def predict(query):
|
|
|
5 |
from sentence_transformers import SentenceTransformer, CrossEncoder, util
|
6 |
|
7 |
# read data
|
8 |
+
df = pd.read_csv('./assets/final_combined.csv')
|
9 |
+
df_dict = df.to_dict(orient='records')
|
10 |
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can execute arbitrary code -- only load trusted files.
doc_embeddings = np.load('./assets/final_combined_embed.npy', allow_pickle=True)
|
11 |
|
12 |
+
# models
|
13 |
+
bi_encoder = SentenceTransformer("intfloat/multilingual-e5-base", cache_folder = "./assets")
|
14 |
+
cross_encoder = CrossEncoder('cross-encoder/nli-deberta-v3-base')
|
15 |
+
|
16 |
def semantic_search(normalized_query):
|
17 |
'''
|
18 |
function to perform semantic search given a search query
|
|
|
25 |
'''
|
26 |
function to re-rank semantic search results using cross encoding
|
27 |
'''
|
28 |
+
# Build one [query, document text] pair per hit for the cross-encoder; uses the function's own parameters (normalized_query, hits).
cross_inp = [[normalized_query, df['representation'][hit['corpus_id']]] for hit in hits]
|
29 |
cross_scores = cross_encoder.predict(cross_inp)
|
30 |
|
31 |
for idx in range(len(cross_scores)):
|
|
|
36 |
def print_results(hits, k_items):
|
37 |
results = ""
|
38 |
for hit in hits[:k_items]:
|
39 |
+
results += pprint.pformat(df_dict[hit['corpus_id']], indent=4)
|
40 |
return results
|
41 |
|
42 |
def predict(query):
|