Liyan06 committed
Commit c191acc
Parent: 4ec6f2d

web retrieval update: pass a timeout through to scrape_url and replace the retrying session with a plain requests.get

Files changed (2):
  1. handler.py +2 -2
  2. web_retrieval.py +4 -8
handler.py CHANGED
@@ -85,11 +85,11 @@ class EndpointHandler():
         print('Searching webpages...')
         start = time()
         with concurrent.futures.ThreadPoolExecutor() as e:
-            scraped_results = e.map(scrape_url, search_results)
+            scraped_results = e.map(scrape_url, search_results, itertools.repeat(timeout))
         end = time()
 
         print(f"Finished searching in {round((end - start), 1)} seconds.\n")
-        scraped_results = [(r[0][:20000], r[1]) for r in scraped_results if r[0] and '��' not in r[0]]
+        scraped_results = [(r[0][:20000], r[1]) for r in scraped_results if r[0] and '��' not in r[0]]  # these could be ranked by TF-IDF to be more efficient
 
         retrieved_docs, urls = zip(*scraped_results[:max_search_results_per_query])
 
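
ThreadPoolExecutor.map zips its iterable arguments and calls the function once per tuple, so pairing search_results with itertools.repeat(timeout) passes the same timeout to every scrape_url call (repeat yields its value forever; map stops at the shortest iterable). A minimal sketch of the pattern, with an illustrative fetch function standing in for scrape_url:

    import concurrent.futures
    import itertools

    def fetch(url: str, timeout: int) -> str:
        # Stand-in for scrape_url: just echoes its arguments.
        return f"{url} (timeout={timeout})"

    urls = ['https://example.com/a', 'https://example.com/b']

    with concurrent.futures.ThreadPoolExecutor() as e:
        # Each call becomes fetch(url, 10); repeat(10) never runs out,
        # so the number of calls is driven by urls.
        results = list(e.map(fetch, urls, itertools.repeat(10)))

    print(results)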
 
web_retrieval.py CHANGED
@@ -49,7 +49,7 @@ def is_tag_visible(element: bs4.element) -> bool:
     return True
 
 
-def scrape_url(url: str) -> Tuple[str, str]:
+def scrape_url(url: str, timeout=10) -> Tuple[str, str]:
     """Scrapes a URL for all text information.
 
     Args:
@@ -61,13 +61,9 @@ def scrape_url(url: str) -> Tuple[str, str]:
     """
     # Scrape the URL
     try:
-        session = requests.Session()
-        retry = Retry(connect=3, backoff_factor=0.5)
-        adapter = HTTPAdapter(max_retries=retry)
-        session.mount('http://', adapter)
-        session.mount('https://', adapter)
-        response = session.get(url)
-    except Exception as _:
+        response = requests.get(url, timeout=timeout)
+        response.raise_for_status()
+    except requests.exceptions.RequestException as _:
         return None, url
 
     # Extract out all text from the tags
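
The retrying Session (urllib3 Retry plus HTTPAdapter) is replaced by a single requests.get with a timeout, and raise_for_status() now sends HTTP 4xx/5xx responses down the same (None, url) failure path as connection errors; the narrower requests.exceptions.RequestException catches exactly these cases. A simplified sketch of the new control flow (the real function goes on to extract visible text from the tags; returning response.text here is an illustrative shortcut, and scrape_url_sketch is a hypothetical name):

    import requests

    def scrape_url_sketch(url: str, timeout: int = 10):
        try:
            # timeout bounds both the connect and the read phase
            response = requests.get(url, timeout=timeout)
            # Promote 4xx/5xx responses to exceptions so they are
            # filtered out like connection failures.
            response.raise_for_status()
        except requests.exceptions.RequestException:
            return None, url
        return response.text, url

The tradeoff: transient connection failures are no longer retried with backoff, but a hung server can no longer stall a ThreadPoolExecutor worker indefinitely, since every request is now bounded by the timeout.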