Liyan06 committed on
Commit 4ec6f2d · 1 Parent(s): 70dbc11

update web retrieval quality

Files changed (2)
  1. handler.py +4 -19
  2. web_retrieval.py +47 -18
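The main quality lever in the web_retrieval.py changes is query expansion: search_google now issues one Google query for the claim itself plus one per named entity that spaCy extracts from it. A minimal standalone sketch of that expansion step, assuming the en_core_web_lg model is already installed; the claim string is purely illustrative:

import spacy

# assumes en_core_web_lg has been downloaded, e.g. via
# `python -m spacy download en_core_web_lg`
nlp = spacy.load("en_core_web_lg")

def extract_entities(text):
    # unique surface forms of the named entities spaCy finds
    return list({ent.text for ent in nlp(text).ents})

claim = "Titanic was directed by James Cameron and released in 1997."
# one search query for the full claim, plus one per extracted entity
queries = set([claim] + extract_entities(claim))
print(queries)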
handler.py CHANGED
@@ -3,16 +3,6 @@ from web_retrieval import *
  from nltk.tokenize import sent_tokenize
  import evaluate
 
- import spacy
- from spacy.cli import download
-
- try:
-     nlp = spacy.load("en_core_web_lg")
- except:
-     # If loading fails, download the model
-     download("en_core_web_lg")
-     nlp = spacy.load("en_core_web_lg")
-
 
  def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
      '''
@@ -31,12 +21,6 @@ def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
      return ranked_docs, scores
 
 
- def extract_entities(text):
-     text = nlp(text)
-     ents = list({ent.text for ent in text.ents})
-     return ents
-
-
  class EndpointHandler():
      def __init__(self, path="./"):
          self.scorer = MiniCheck(path=path)
@@ -94,17 +78,18 @@ class EndpointHandler():
          return outputs
 
 
-     def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=5, allow_duplicated_urls=False):
+     def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=10, allow_duplicated_urls=False):
 
          search_results = search_google(claim, timeout=timeout)
 
          print('Searching webpages...')
          start = time()
          with concurrent.futures.ThreadPoolExecutor() as e:
-             scraped_results = e.map(scrape_url, search_results, itertools.repeat(timeout))
+             scraped_results = e.map(scrape_url, search_results)
          end = time()
+
          print(f"Finished searching in {round((end - start), 1)} seconds.\n")
-         scraped_results = [(r[0][:20000], r[1]) for r in scraped_results if r[0] and '��' not in r[0] and ".pdf" not in r[1]]
+         scraped_results = [(r[0][:20000], r[1]) for r in scraped_results if r[0] and '��' not in r[0]]
 
          retrieved_docs, urls = zip(*scraped_results[:max_search_results_per_query])
 
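For reference, the revised search_relevant_docs maps scrape_url over the search results in a thread pool and no longer forwards a per-call timeout, keeping only non-empty pages truncated to 20,000 characters. Below is a minimal standalone sketch of that concurrent-scraping step; fetch is a hypothetical stand-in for scrape_url and the example URLs are illustrative:

import concurrent.futures
from time import time

def fetch(url):
    # hypothetical stand-in for scrape_url: returns (page_text, url),
    # or (None, url) when the page could not be retrieved
    return f"dummy text for {url}", url

def scrape_all(search_results, max_results=10):
    start = time()
    with concurrent.futures.ThreadPoolExecutor() as executor:
        # executor.map preserves the input order of search_results
        scraped = list(executor.map(fetch, search_results))
    print(f"Finished searching in {round(time() - start, 1)} seconds.")
    # keep successful scrapes, truncate each page, and cap the result count
    scraped = [(text[:20000], url) for text, url in scraped if text]
    return scraped[:max_results]

print(scrape_all(["https://example.com", "https://example.org"]))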
 
web_retrieval.py CHANGED
@@ -9,6 +9,25 @@ import itertools
  import numpy as np
  from time import time
 
+ from requests.adapters import HTTPAdapter
+ from urllib3.util.retry import Retry
+
+ import spacy
+ from spacy.cli import download
+
+ try:
+     nlp = spacy.load("en_core_web_lg")
+ except:
+     # If loading fails, download the model
+     download("en_core_web_lg")
+     nlp = spacy.load("en_core_web_lg")
+
+
+ def extract_entities(text):
+     text = nlp(text)
+     ents = list({ent.text for ent in text.ents})
+     return ents
+
 
  def is_tag_visible(element: bs4.element) -> bool:
      """Determines if an HTML element is visible.
@@ -30,7 +49,7 @@ def is_tag_visible(element: bs4.element) -> bool:
      return True
 
 
- def scrape_url(url: str, timeout: float = 3) -> Tuple[str, str]:
+ def scrape_url(url: str) -> Tuple[str, str]:
      """Scrapes a URL for all text information.
 
      Args:
@@ -42,9 +61,13 @@ def scrape_url(url: str, timeout: float = 3) -> Tuple[str, str]:
      """
      # Scrape the URL
      try:
-         response = requests.get(url, timeout=timeout)
-         response.raise_for_status()
-     except requests.exceptions.RequestException as _:
+         session = requests.Session()
+         retry = Retry(connect=3, backoff_factor=0.5)
+         adapter = HTTPAdapter(max_retries=retry)
+         session.mount('http://', adapter)
+         session.mount('https://', adapter)
+         response = session.get(url)
+     except Exception as _:
          return None, url
 
      # Extract out all text from the tags
@@ -84,25 +107,31 @@ def search_google(query:str, num_web_pages:int=10, timeout:int=6, save_url:str='
      lang = "en"
 
      # scrape google results
-     urls = []
-     for page in range(0, num_web_pages, 10):
-         # here page is google search's bottom page meaning, click 2 -> start=10
-         # url = "https://www.google.com/search?q={}&start={}".format(query, page)
-         url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(query, lang, lang, page)
-         r = requests.get(url, headers=headers, timeout=timeout)
-         # collect all urls by regular expression
-         # how to do if I just want to have the returned top-k pages?
-         urls += re.findall('href="(https?://.*?)"', r.text)
-
-     # set to remove repeated urls
-     urls = list(set(urls))
+     all_urls = []
+     for search_query in set([query] + list(set(extract_entities(query)))):
+         for page in range(0, num_web_pages, 10):
+             # here page is google search's bottom page meaning, click 2 -> start=10
+             # url = "https://www.google.com/search?q={}&start={}".format(query, page)
+             url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(search_query, lang, lang, page)
+             r = requests.get(url, headers=headers, timeout=timeout)
+             # collect all urls by regular expression
+             # how to do if I just want to have the returned top-k pages?
+             urls = re.findall('href="(https?://.*?)"', r.text)
+             urls = [url for url in urls if 'google.com' not in url and '.pdf' not in url]  # can be improved based on TF-IDF later
+
+             all_urls.extend(urls)
+
+     all_urls_final = []
+     for url in all_urls:
+         if url not in all_urls_final:
+             all_urls_final.append(url)
 
      # save all url into a txt file
      if not save_url == "":
          with open(save_url, 'w') as file:
-             for url in urls:
+             for url in all_urls_final:
                  file.write(url + '\n')
-     return urls
+     return all_urls_final
 
 
  def order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=False):
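The other reliability change is in scrape_url, which now issues requests through a session that retries failed connection attempts with exponential backoff rather than a single requests.get with a timeout. A minimal sketch of that retry-session pattern in isolation, using the same Retry settings as the commit (3 connect retries, backoff factor 0.5); the URL and helper name are illustrative:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_retrying_session(connect_retries=3, backoff_factor=0.5):
    # build a Session whose HTTP(S) connection attempts are retried
    # with exponential backoff before the request is given up on
    session = requests.Session()
    retry = Retry(connect=connect_retries, backoff_factor=backoff_factor)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

session = make_retrying_session()
try:
    response = session.get("https://example.com")
    print(response.status_code)
except Exception:
    # scrape_url maps any failure to (None, url); this mirrors that behavior
    print("request failed after retries")

Note that the new scrape_url no longer calls response.raise_for_status() or passes an explicit timeout; the broad except simply turns any failure into (None, url), which the caller in handler.py filters out.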