Liyan06
committed on
Commit
•
3fbb656
1
Parent(s):
113a57e
update retrieval and doc display ranking
Browse files
- handler.py +25 -4
- web_retrieval.py +3 -2
handler.py
CHANGED
@@ -51,6 +51,7 @@ class EndpointHandler():
|
|
51 |
def __init__(self, path="./"):
|
52 |
self.scorer = MiniCheck(path=path)
|
53 |
self.rouge = evaluate.load('rouge')
|
|
|
54 |
|
55 |
|
56 |
def __call__(self, data):
|
@@ -82,7 +83,7 @@ class EndpointHandler():
|
|
82 |
else:
|
83 |
assert len(data['inputs']['claims']) == 1, "Only one claim is allowed for web retrieval for the current version."
|
84 |
|
85 |
-
ranked_docs, scores, ranked_urls = self.search_relevant_docs(claim)
|
86 |
|
87 |
span_to_highlight = []
|
88 |
for doc_chunk, score in zip(ranked_docs, scores):
|
@@ -104,7 +105,12 @@ class EndpointHandler():
|
|
104 |
return outputs
|
105 |
|
106 |
|
107 |
-
def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=5, allow_duplicated_urls=False):
|
|
|
|
|
|
|
|
|
|
|
108 |
|
109 |
search_results = search_google(claim, timeout=timeout)
|
110 |
|
@@ -133,9 +139,24 @@ class EndpointHandler():
|
|
133 |
num_chunks = len([item for items in used_chunk for item in items])
|
134 |
print(f'Finished {num_chunks} entailment checks in {round((end - start), 1)} seconds ({round(num_chunks / (end - start) * 60)} Doc./min).')
|
135 |
|
136 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
137 |
|
138 |
-
|
139 |
|
140 |
|
141 |
def chunk_and_highest_rouge_score(self, doc, claim):
|
|
|
51 |
def __init__(self, path="./"):
|
52 |
self.scorer = MiniCheck(path=path)
|
53 |
self.rouge = evaluate.load('rouge')
|
54 |
+
self.tfidf_order = True
|
55 |
|
56 |
|
57 |
def __call__(self, data):
|
|
|
83 |
else:
|
84 |
assert len(data['inputs']['claims']) == 1, "Only one claim is allowed for web retrieval for the current version."
|
85 |
|
86 |
+
ranked_docs, scores, ranked_urls = self.search_relevant_docs(claim, tfidf_order=self.tfidf_order)
|
87 |
|
88 |
span_to_highlight = []
|
89 |
for doc_chunk, score in zip(ranked_docs, scores):
|
|
|
105 |
return outputs
|
106 |
|
107 |
|
108 |
+
def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=5, allow_duplicated_urls=False, tfidf_order=False):
|
109 |
+
|
110 |
+
"""
|
111 |
+
if tfidf_order == True, then display the docs in the order of TF-IDF similarity with the claim, regardless of the entailment score
|
112 |
+
otherwise, display the docs in the order of the entailment score
|
113 |
+
"""
|
114 |
|
115 |
search_results = search_google(claim, timeout=timeout)
|
116 |
|
|
|
139 |
num_chunks = len([item for items in used_chunk for item in items])
|
140 |
print(f'Finished {num_chunks} entailment checks in {round((end - start), 1)} seconds ({round(num_chunks / (end - start) * 60)} Doc./min).')
|
141 |
|
142 |
+
if tfidf_order:
|
143 |
+
tfidf_docs, scores = [], []
|
144 |
+
for used_c, support_prob_per_c in zip(used_chunk, support_prob_per_chunk):
|
145 |
+
# If the doc can support the claim, find the chunk with the
|
146 |
+
# highest entailment score; otherwise, use the first chunk
|
147 |
+
if max(support_prob_per_c) > 0.5:
|
148 |
+
tfidf_docs.append(used_c[np.argmax(support_prob_per_c)])
|
149 |
+
scores.append(max(support_prob_per_c))
|
150 |
+
else:
|
151 |
+
tfidf_docs.append(used_c[0])
|
152 |
+
scores.append(support_prob_per_c[0])
|
153 |
+
|
154 |
+
return tfidf_docs, scores, urls
|
155 |
+
|
156 |
+
else:
|
157 |
+
ranked_docs, scores, ranked_urls = order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=allow_duplicated_urls)
|
158 |
|
159 |
+
return ranked_docs, scores, ranked_urls
|
160 |
|
161 |
|
162 |
def chunk_and_highest_rouge_score(self, doc, claim):
|
web_retrieval.py
CHANGED
@@ -82,7 +82,7 @@ def scrape_url(url: str, timeout=10) -> Tuple[str, str]:
|
|
82 |
return web_text, url
|
83 |
|
84 |
|
85 |
-
def search_google(query:str, num_web_pages:int=
|
86 |
"""Searches the query using Google.
|
87 |
Args:
|
88 |
query: Search query.
|
@@ -108,7 +108,8 @@ def search_google(query:str, num_web_pages:int=20, timeout:int=6, save_url:str='
|
|
108 |
for page in range(0, num_web_pages, 10):
|
109 |
# here page is google search's bottom page meaning, click 2 -> start=10
|
110 |
# url = "https://www.google.com/search?q={}&start={}".format(query, page)
|
111 |
-
url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(search_query, lang, lang, page)
|
|
|
112 |
r = requests.get(url, headers=headers, timeout=timeout)
|
113 |
# collect all urls by regular expression
|
114 |
# how to do if I just want to have the returned top-k pages?
|
|
|
82 |
return web_text, url
|
83 |
|
84 |
|
85 |
+
def search_google(query:str, num_web_pages:int=10, timeout:int=6, save_url:str='') -> List[str]:
|
86 |
"""Searches the query using Google.
|
87 |
Args:
|
88 |
query: Search query.
|
|
|
108 |
for page in range(0, num_web_pages, 10):
|
109 |
# here page is google search's bottom page meaning, click 2 -> start=10
|
110 |
# url = "https://www.google.com/search?q={}&start={}".format(query, page)
|
111 |
+
# url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(search_query, lang, lang, page)
|
112 |
+
url = f"https://www.google.com/search?q={search_query}&start={page}"
|
113 |
r = requests.get(url, headers=headers, timeout=timeout)
|
114 |
# collect all urls by regular expression
|
115 |
# how to do if I just want to have the returned top-k pages?
|