Liyan06 committed
Commit
113a57e
1 Parent(s): 8aee497

add TFIDF ranking

Files changed (1)
  1. handler.py +28 -1
handler.py CHANGED
@@ -3,6 +3,9 @@ from web_retrieval import *
 from nltk.tokenize import sent_tokenize
 import evaluate
 
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
 
 def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
     '''
@@ -21,6 +24,29 @@ def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
     return ranked_docs, scores
 
 
+def rank_documents_TFIDF(claim, scraped_results):
+
+    """
+    each element in scraped_results is a tuple of (document, URL)
+    """
+
+    documents = [result[0] for result in scraped_results]
+    corpus = [claim] + documents
+
+    vectorizer = TfidfVectorizer()
+    tfidf_matrix = vectorizer.fit_transform(corpus)
+
+    claim_vector = tfidf_matrix[0]
+    similarity_scores = cosine_similarity(claim_vector, tfidf_matrix[1:])
+
+    ranked_results = [(scraped_results[i][0], scraped_results[i][1], score)
+                      for i, score in enumerate(similarity_scores[0])]
+    ranked_results.sort(key=lambda x: x[2], reverse=True)
+    ranked_documents = [(result[0], result[1]) for result in ranked_results]
+
+    return ranked_documents
+
+
 class EndpointHandler():
     def __init__(self, path="./"):
         self.scorer = MiniCheck(path=path)
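For context (not part of the commit): a minimal sketch of how the new helper could be exercised, assuming handler.py and its dependencies are importable; the claim, documents, and URLs below are made-up examples.

    # Hypothetical usage of rank_documents_TFIDF; all inputs are invented.
    from handler import rank_documents_TFIDF

    scraped_results = [
        ("Bananas are rich in potassium.", "https://example.com/bananas"),
        ("The Eiffel Tower is located in Paris, France.", "https://example.com/eiffel"),
    ]

    ranked = rank_documents_TFIDF("The Eiffel Tower is in Paris.", scraped_results)
    # Documents sharing more TF-IDF-weighted terms with the claim come first,
    # so the Eiffel Tower page should outrank the bananas page.
    print([url for _, url in ranked])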
@@ -78,7 +104,7 @@ class EndpointHandler():
         return outputs
 
 
-    def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=10, allow_duplicated_urls=False):
+    def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=5, allow_duplicated_urls=False):
 
         search_results = search_google(claim, timeout=timeout)
 
@@ -91,6 +117,7 @@ class EndpointHandler():
         print(f"Finished searching in {round((end - start), 1)} seconds.\n")
         scraped_results = [(r[0][:20000], r[1]) for r in scraped_results if r[0] and '��' not in r[0]] # those can be ranked based on TF-IDF to be more efficient
 
+        scraped_results = rank_documents_TFIDF(claim, scraped_results)
         retrieved_docs, urls = zip(*scraped_results[:max_search_results_per_query])
 
         print('Scoring webpages...')
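Note on the integration: scraped pages are now ranked against the claim before truncation, and the default max_search_results_per_query drops from 10 to 5, so the downstream MiniCheck scorer only sees the five most claim-relevant pages instead of the first ten scraped. The ranking itself is plain TF-IDF cosine similarity over a shared vocabulary; a standalone sketch of that core step (same scikit-learn calls as the commit, toy strings invented here):

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    corpus = ["the claim text", "first document", "second document"]  # claim first, then documents
    tfidf = TfidfVectorizer().fit_transform(corpus)  # sparse (3, vocab_size) matrix

    # Row 0 is the claim; rows 1..n are the documents. Output shape is (1, n).
    scores = cosine_similarity(tfidf[0], tfidf[1:])
    best_first = scores[0].argsort()[::-1]  # document indices, best match first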