Liyan06 committed
Commit 113a57e • 1 Parent(s): 8aee497
add TFIDF ranking
Browse files: handler.py (+28 −1)
handler.py CHANGED

@@ -3,6 +3,9 @@ from web_retrieval import *
 from nltk.tokenize import sent_tokenize
 import evaluate
 
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+
 
 def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
     '''
@@ -21,6 +24,29 @@ def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
     return ranked_docs, scores
 
 
+def rank_documents_TFIDF(claim, scraped_results):
+
+    """
+    each element in scraped_results is a tuple of (document, URL)
+    """
+
+    documents = [result[0] for result in scraped_results]
+    corpus = [claim] + documents
+
+    vectorizer = TfidfVectorizer()
+    tfidf_matrix = vectorizer.fit_transform(corpus)
+
+    claim_vector = tfidf_matrix[0]
+    similarity_scores = cosine_similarity(claim_vector, tfidf_matrix[1:])
+
+    ranked_results = [(scraped_results[i][0], scraped_results[i][1], score)
+                      for i, score in enumerate(similarity_scores[0])]
+    ranked_results.sort(key=lambda x: x[2], reverse=True)
+    ranked_documents = [(result[0], result[1]) for result in ranked_results]
+
+    return ranked_documents
+
+
 class EndpointHandler():
     def __init__(self, path="./"):
         self.scorer = MiniCheck(path=path)
@@ -78,7 +104,7 @@ class EndpointHandler():
         return outputs
 
 
-    def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=
+    def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=5, allow_duplicated_urls=False):
 
         search_results = search_google(claim, timeout=timeout)
 
@@ -91,6 +117,7 @@ class EndpointHandler():
         print(f"Finished searching in {round((end - start), 1)} seconds.\n")
         scraped_results = [(r[0][:20000], r[1]) for r in scraped_results if r[0] and '��' not in r[0]]  # those can be ranked based on TF-IDF to be more efficient
 
+        scraped_results = rank_documents_TFIDF(claim, scraped_results)
         retrieved_docs, urls = zip(*scraped_results[:max_search_results_per_query])
 
         print('Scoring webpages...')
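For context, here is a minimal standalone sketch of the ranking step this commit adds. The claim and the (document, URL) pairs are invented sample inputs, not data from the repository; the logic mirrors the new rank_documents_TFIDF helper, fitting one TF-IDF vocabulary over the claim plus all documents and sorting by cosine similarity to the claim.

# Hypothetical demo of the TF-IDF ranking step added in this commit.
# The claim and (document, URL) tuples below are invented sample data.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

claim = "The Eiffel Tower is located in Paris."
scraped_results = [
    ("The Louvre is a museum in Paris, France.", "https://example.com/louvre"),
    ("The Eiffel Tower in Paris was completed in 1889.", "https://example.com/eiffel"),
    ("Mount Everest is the highest mountain on Earth.", "https://example.com/everest"),
]

# Same procedure as rank_documents_TFIDF: vectorize the claim together with
# the documents so they share one vocabulary, then rank by cosine similarity.
corpus = [claim] + [doc for doc, _ in scraped_results]
tfidf_matrix = TfidfVectorizer().fit_transform(corpus)
scores = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1:])[0]

ranked = sorted(zip(scraped_results, scores), key=lambda x: x[1], reverse=True)
for (doc, url), score in ranked:
    print(f"{score:.3f}  {url}")
# Expected: the Eiffel Tower page ranks first, since it shares the most
# TF-IDF-weighted terms with the claim.

Because the ranking now runs before the scraped_results[:max_search_results_per_query] cut in search_relevant_docs, the downstream scorer sees the pages most lexically similar to the claim rather than simply the first pages the search returned.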