Liyan06 committed on
Commit 4ec6f2d · 1 Parent(s): 70dbc11

update web retrieval quality

Files changed (2)
  1. handler.py +4 -19
  2. web_retrieval.py +47 -18
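The main quality lever in the web_retrieval.py changes is query expansion: search_google now issues one Google query for the claim itself plus one per named entity that spaCy extracts from it. A minimal standalone sketch of that expansion step, assuming the en_core_web_lg model is already installed; the claim string is purely illustrative:

import spacy

# assumes en_core_web_lg has been downloaded, e.g. via
# `python -m spacy download en_core_web_lg`
nlp = spacy.load("en_core_web_lg")

def extract_entities(text):
    # unique surface forms of the named entities spaCy finds
    return list({ent.text for ent in nlp(text).ents})

claim = "Titanic was directed by James Cameron and released in 1997."
# one search query for the full claim, plus one per extracted entity
queries = set([claim] + extract_entities(claim))
print(queries)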
handler.py CHANGED
@@ -3,16 +3,6 @@ from web_retrieval import *
  from nltk.tokenize import sent_tokenize
  import evaluate
 
- import spacy
- from spacy.cli import download
-
- try:
-     nlp = spacy.load("en_core_web_lg")
- except:
-     # If loading fails, download the model
-     download("en_core_web_lg")
-     nlp = spacy.load("en_core_web_lg")
-
 
  def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
      '''
@@ -31,12 +21,6 @@ def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
      return ranked_docs, scores
 
 
- def extract_entities(text):
-     text = nlp(text)
-     ents = list({ent.text for ent in text.ents})
-     return ents
-
-
  class EndpointHandler():
      def __init__(self, path="./"):
          self.scorer = MiniCheck(path=path)
@@ -94,17 +78,18 @@ class EndpointHandler():
          return outputs
 
 
-     def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=5, allow_duplicated_urls=False):
+     def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=10, allow_duplicated_urls=False):
 
          search_results = search_google(claim, timeout=timeout)
 
          print('Searching webpages...')
          start = time()
          with concurrent.futures.ThreadPoolExecutor() as e:
-             scraped_results = e.map(scrape_url, search_results, itertools.repeat(timeout))
+             scraped_results = e.map(scrape_url, search_results)
          end = time()
+
          print(f"Finished searching in {round((end - start), 1)} seconds.\n")
-         scraped_results = [(r[0][:20000], r[1]) for r in scraped_results if r[0] and '��' not in r[0] and ".pdf" not in r[1]]
+         scraped_results = [(r[0][:20000], r[1]) for r in scraped_results if r[0] and '��' not in r[0]]
 
          retrieved_docs, urls = zip(*scraped_results[:max_search_results_per_query])
 
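For reference, the revised search_relevant_docs maps scrape_url over the search results in a thread pool and no longer forwards a per-call timeout, keeping only non-empty pages truncated to 20,000 characters. Below is a minimal standalone sketch of that concurrent-scraping step; fetch is a hypothetical stand-in for scrape_url and the example URLs are illustrative:

import concurrent.futures
from time import time

def fetch(url):
    # hypothetical stand-in for scrape_url: returns (page_text, url),
    # or (None, url) when the page could not be retrieved
    return f"dummy text for {url}", url

def scrape_all(search_results, max_results=10):
    start = time()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # executor.map preserves the input order of search_results
        scraped = list(executor.map(fetch, search_results))
    print(f"Finished searching in {round(time() - start, 1)} seconds.")
    # keep successful scrapes, truncate each page, and cap the result count
    scraped = [(text[:20000], url) for text, url in scraped if text]
    return scraped[:max_results]

print(scrape_all(["https://example.com", "https://example.org"]))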
 
web_retrieval.py CHANGED
@@ -9,6 +9,25 @@ import itertools
  import numpy as np
  from time import time
 
+ from requests.adapters import HTTPAdapter
+ from urllib3.util.retry import Retry
+
+ import spacy
+ from spacy.cli import download
+
+ try:
+     nlp = spacy.load("en_core_web_lg")
+ except:
+     # If loading fails, download the model
+     download("en_core_web_lg")
+     nlp = spacy.load("en_core_web_lg")
+
+
+ def extract_entities(text):
+     text = nlp(text)
+     ents = list({ent.text for ent in text.ents})
+     return ents
+
 
  def is_tag_visible(element: bs4.element) -> bool:
      """Determines if an HTML element is visible.
@@ -30,7 +49,7 @@ def is_tag_visible(element: bs4.element) -> bool:
      return True
 
 
- def scrape_url(url: str, timeout: float = 3) -> Tuple[str, str]:
+ def scrape_url(url: str) -> Tuple[str, str]:
      """Scrapes a URL for all text information.
 
      Args:
@@ -42,9 +61,13 @@ def scrape_url(url: str, timeout: float = 3) -> Tuple[str, str]:
      """
      # Scrape the URL
      try:
-         response = requests.get(url, timeout=timeout)
-         response.raise_for_status()
-     except requests.exceptions.RequestException as _:
+         session = requests.Session()
+         retry = Retry(connect=3, backoff_factor=0.5)
+         adapter = HTTPAdapter(max_retries=retry)
+         session.mount('http://', adapter)
+         session.mount('https://', adapter)
+         response = session.get(url)
+     except Exception as _:
          return None, url
 
      # Extract out all text from the tags
@@ -84,25 +107,31 @@ def search_google(query:str, num_web_pages:int=10, timeout:int=6, save_url:str='
      lang = "en"
 
      # scrape google results
-     urls = []
-     for page in range(0, num_web_pages, 10):
-         # here page is google search's bottom page meaning, click 2 -> start=10
-         # url = "https://www.google.com/search?q={}&start={}".format(query, page)
-         url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(query, lang, lang, page)
-         r = requests.get(url, headers=headers, timeout=timeout)
-         # collect all urls by regular expression
-         # how to do if I just want to have the returned top-k pages?
-         urls += re.findall('href="(https?://.*?)"', r.text)
-
-     # set to remove repeated urls
-     urls = list(set(urls))
+     all_urls = []
+     for search_query in set([query] + list(set(extract_entities(query)))):
+         for page in range(0, num_web_pages, 10):
+             # here page is google search's bottom page meaning, click 2 -> start=10
+             # url = "https://www.google.com/search?q={}&start={}".format(query, page)
+             url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(search_query, lang, lang, page)
+             r = requests.get(url, headers=headers, timeout=timeout)
+             # collect all urls by regular expression
+             # how to do if I just want to have the returned top-k pages?
+             urls = re.findall('href="(https?://.*?)"', r.text)
+             urls = [url for url in urls if 'google.com' not in url and '.pdf' not in url]  # can be improved based on TF-IDF later
+
+             all_urls.extend(urls)
+
+     all_urls_final = []
+     for url in all_urls:
+         if url not in all_urls_final:
+             all_urls_final.append(url)
 
      # save all url into a txt file
      if not save_url == "":
          with open(save_url, 'w') as file:
-             for url in urls:
+             for url in all_urls_final:
                  file.write(url + '\n')
-     return urls
+     return all_urls_final
 
 
  def order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=False):
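The other reliability change is in scrape_url, which now issues requests through a session that retries failed connection attempts with exponential backoff rather than a single requests.get with a timeout. A minimal sketch of that retry-session pattern in isolation, using the same Retry settings as the commit (3 connect retries, backoff factor 0.5); the URL and helper name are illustrative:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_retrying_session(connect_retries=3, backoff_factor=0.5):
    # build a Session whose HTTP(S) connection attempts are retried
    # with exponential backoff before the request is given up on
    session = requests.Session()
    retry = Retry(connect=connect_retries, backoff_factor=backoff_factor)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

session = make_retrying_session()
try:
    response = session.get("https://example.com")
    print(response.status_code)
except Exception:
    # scrape_url maps any failure to (None, url); this mirrors that behavior
    print("request failed after retries")

Note that the new scrape_url no longer calls response.raise_for_status() or passes an explicit timeout; the broad except simply turns any failure into (None, url), which the caller in handler.py filters out.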