lotrlol committed on
Commit
4793e50
·
1 Parent(s): 7d3bdc5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -19
app.py CHANGED
@@ -32,33 +32,35 @@ class DocumentSearch:
32
  # loading faiss index
33
  self.index = faiss.read_index(DocumentSearch.idx_path)
34
  # loading sbert cross_encoder
35
- # self.cross_encoder = CrossEncoder(DocumentSearch.cross_enc_path)
36
 
37
- def search(self, query, k=5):
38
- query_vec = self.vectorizer.transform([query]).toarray()[0]
39
- scores = np.dot(self.doc_vectors, query_vec)
40
- indeces = np.argpartition(scores, -k)[-k:]
41
- try:
42
- res_docs = [self.docs[i] for i in indeces]
43
- except IndexError:
44
- res_docs = self.docs
45
- return res_docs
46
 
47
-
48
- ###return[{'doc': doc[0], 'url': doc[1], 'score': dist} for doc, dist in zip(res_docs, dists)][:k]
49
- ##### OLD VERSION WITH CROSS-ENCODER #####
50
  # get answers by index
51
- #answers = [self.docs[i] for i in indeces[0]]
52
  # prepare inputs for cross encoder
53
- # model_inputs = [[query, pairs[0]] for pairs in answers]
54
- # urls = [pairs[1] for pairs in answers]
55
  # get similarity score between query and documents
56
- # scores = self.cross_encoder.predict(model_inputs, batch_size=1)
57
  # compose results into list of dicts
58
- # results = [{'doc': doc[1], 'url': url, 'score': score} for doc, url, score in zip(model_inputs, urls, scores)]
59
 
60
  # return results sorted by similarity scores
61
- # return sorted(results, key=lambda x: x['score'], reverse=True)[:k]
 
 
 
 
 
62
 
63
 
64
  if __name__ == "__main__":
 
32
  # loading faiss index
33
  self.index = faiss.read_index(DocumentSearch.idx_path)
34
  # loading sbert cross_encoder
35
+ self.cross_encoder = CrossEncoder(DocumentSearch.cross_enc_path)
36
 
37
+ def search(self, query: str, k: int) -> list:
38
+ # get vector representation of text query
39
+ query_vector = self.encoder.encode([query])
40
+ # perform search via faiss FlatIP index
41
+ distances, indeces = self.index.search(query_vector, k*10)
42
+ # get docs by index
43
+ res_docs = [self.docs[i] for i in indeces[0]]
44
+ # get scores by index
45
+ dists = [dist for dist in distances[0]]
46
 
 
 
 
47
  # get answers by index
48
+ answers = [self.docs[i] for i in indeces[0]]
49
  # prepare inputs for cross encoder
50
+ model_inputs = [[query, pairs[0]] for pairs in answers]
51
+ urls = [pairs[1] for pairs in answers]
52
  # get similarity score between query and documents
53
+ scores = self.cross_encoder.predict(model_inputs, batch_size=1)
54
  # compose results into list of dicts
55
+ results = [{'doc': doc[1], 'url': url, 'score': score} for doc, url, score in zip(model_inputs, urls, scores)]
56
 
57
  # return results sorted by similarity scores
58
+ return sorted(results, key=lambda x: x['score'], reverse=True)[:k]
59
+
60
+
61
+ if __name__ == "__main__":
62
+ # get instance of DocumentSearch class
63
+ surfer = DocumentSearch()
64
 
65
 
66
  if __name__ == "__main__":