lytang
/

MiniCheck-Flan-T5-Large

@@ -51,6 +51,7 @@ class EndpointHandler():
     def __init__(self, path="./"):
         self.scorer = MiniCheck(path=path)
         self.rouge = evaluate.load('rouge')
     def __call__(self, data):
@@ -82,7 +83,7 @@ class EndpointHandler():
         else:
             assert len(data['inputs']['claims']) == 1, "Only one claim is allowed for web retrieval for the current version."
-            ranked_docs, scores, ranked_urls = self.search_relevant_docs(claim)
             span_to_highlight = []
             for doc_chunk, score in zip(ranked_docs, scores):
@@ -104,7 +105,12 @@ class EndpointHandler():
         return outputs
-    def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=5, allow_duplicated_urls=False):
         search_results = search_google(claim, timeout=timeout)
@@ -133,9 +139,24 @@ class EndpointHandler():
         num_chunks = len([item for items in used_chunk for item in items])
         print(f'Finished {num_chunks} entailment checks in {round((end - start), 1)} seconds ({round(num_chunks / (end - start) * 60)} Doc./min).')
-        ranked_docs, scores, ranked_urls = order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=allow_duplicated_urls)
-        return ranked_docs, scores, ranked_urls
     def chunk_and_highest_rouge_score(self, doc, claim):

     def __init__(self, path="./"):
         self.scorer = MiniCheck(path=path)
         self.rouge = evaluate.load('rouge')
+        self.tfidf_order = True
     def __call__(self, data):
         else:
             assert len(data['inputs']['claims']) == 1, "Only one claim is allowed for web retrieval for the current version."
+            ranked_docs, scores, ranked_urls = self.search_relevant_docs(claim, tfidf_order=self.tfidf_order)
             span_to_highlight = []
             for doc_chunk, score in zip(ranked_docs, scores):
         return outputs
+    def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=5, allow_duplicated_urls=False, tfidf_order=False):
+        """
+        If tfidf_order is True, return the docs in order of TF-IDF similarity to the claim, regardless of their entailment scores.
+        Otherwise, return the docs in order of entailment score.
+        """
         search_results = search_google(claim, timeout=timeout)
         num_chunks = len([item for items in used_chunk for item in items])
         print(f'Finished {num_chunks} entailment checks in {round((end - start), 1)} seconds ({round(num_chunks / (end - start) * 60)} Doc./min).')
+        if tfidf_order:
+            tfidf_docs, scores = [], []
+            for used_c, support_prob_per_c in zip(used_chunk, support_prob_per_chunk):
+                # If any chunk of the doc supports the claim (max score > 0.5), keep the
+                # chunk with the highest entailment score; otherwise, keep the first chunk
+                if max(support_prob_per_c) > 0.5:
+                    tfidf_docs.append(used_c[np.argmax(support_prob_per_c)])
+                    scores.append(max(support_prob_per_c))
+                else:
+                    tfidf_docs.append(used_c[0])
+                    scores.append(support_prob_per_c[0])
+            return tfidf_docs, scores, urls
+        else:
+            ranked_docs, scores, ranked_urls = order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=allow_duplicated_urls)
+            return ranked_docs, scores, ranked_urls
     def chunk_and_highest_rouge_score(self, doc, claim):

web_retrieval.py CHANGED Viewed

@@ -82,7 +82,7 @@ def scrape_url(url: str, timeout=10) -> Tuple[str, str]:
     return web_text, url
-def search_google(query:str, num_web_pages:int=20, timeout:int=6, save_url:str='') -> List[str]:
     """Searches the query using Google.
     Args:
         query: Search query.
@@ -108,7 +108,8 @@ def search_google(query:str, num_web_pages:int=20, timeout:int=6, save_url:str='
         for page in range(0, num_web_pages, 10):
             # here page is google search's bottom page meaning, click 2 -> start=10
             # url = "https://www.google.com/search?q={}&start={}".format(query, page)
-            url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(search_query, lang, lang, page)
             r = requests.get(url, headers=headers, timeout=timeout)
             # collect all urls by regular expression
             # how to do if I just want to have the returned top-k pages?

     return web_text, url
+def search_google(query:str, num_web_pages:int=10, timeout:int=6, save_url:str='') -> List[str]:
     """Searches the query using Google.
     Args:
         query: Search query.
         for page in range(0, num_web_pages, 10):
             # here page is google search's bottom page meaning, click 2 -> start=10
             # url = "https://www.google.com/search?q={}&start={}".format(query, page)
+            # url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(search_query, lang, lang, page)
+            url = f"https://www.google.com/search?q={search_query}&start={page}"
             r = requests.get(url, headers=headers, timeout=timeout)
             # collect all urls by regular expression
             # how to do if I just want to have the returned top-k pages?