Liyan06
committed on
Commit · 4ec6f2d
1 Parent(s): 70dbc11
update web retrieval quality
Browse files
- handler.py +4 -19
- web_retrieval.py +47 -18
handler.py
CHANGED

@@ -3,16 +3,6 @@ from web_retrieval import *
 from nltk.tokenize import sent_tokenize
 import evaluate
 
-import spacy
-from spacy.cli import download
-
-try:
-    nlp = spacy.load("en_core_web_lg")
-except:
-    # If loading fails, download the model
-    download("en_core_web_lg")
-    nlp = spacy.load("en_core_web_lg")
-
 
 def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
     '''
@@ -31,12 +21,6 @@ def sort_chunks_single_doc_claim(used_chunk, support_prob_per_chunk):
     return ranked_docs, scores
 
 
-def extract_entities(text):
-    text = nlp(text)
-    ents = list({ent.text for ent in text.ents})
-    return ents
-
-
 class EndpointHandler():
     def __init__(self, path="./"):
         self.scorer = MiniCheck(path=path)
@@ -94,17 +78,18 @@ class EndpointHandler():
         return outputs
 
 
-    def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=
+    def search_relevant_docs(self, claim, timeout=10, max_search_results_per_query=10, allow_duplicated_urls=False):
 
         search_results = search_google(claim, timeout=timeout)
 
         print('Searching webpages...')
         start = time()
         with concurrent.futures.ThreadPoolExecutor() as e:
-            scraped_results = e.map(scrape_url, search_results
+            scraped_results = e.map(scrape_url, search_results)
         end = time()
+
         print(f"Finished searching in {round((end - start), 1)} seconds.\n")
-        scraped_results = [(r[0][:20000], r[1]) for r in scraped_results if r[0] and '��' not in r[0]
+        scraped_results = [(r[0][:20000], r[1]) for r in scraped_results if r[0] and '��' not in r[0]]
 
         retrieved_docs, urls = zip(*scraped_results[:max_search_results_per_query])
 
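For reference, a minimal, self-contained sketch of the fan-out pattern the revised search_relevant_docs relies on: map a scraper over the search results with a thread pool, keep only successful fetches that contain no replacement characters, and truncate each page before ranking. Here toy_scrape is a hypothetical stand-in for the real scrape_url in web_retrieval.py; the rest mirrors the diff above.

import concurrent.futures
from time import time

def toy_scrape(url):
    # Hypothetical stand-in for web_retrieval.scrape_url:
    # returns (text, url), with text=None marking a failed fetch.
    return (None, url) if "bad" in url else (f"contents of {url}", url)

search_results = ["https://example.com/a", "https://example.com/bad", "https://example.com/b"]

start = time()
with concurrent.futures.ThreadPoolExecutor() as e:
    scraped_results = e.map(toy_scrape, search_results)
end = time()
print(f"Finished searching in {round((end - start), 1)} seconds.\n")

# Keep successful fetches, drop pages containing the Unicode replacement character,
# and cap each document at 20,000 characters, as in the handler.
scraped_results = [(r[0][:20000], r[1]) for r in scraped_results if r[0] and '\ufffd' not in r[0]]
retrieved_docs, urls = zip(*scraped_results[:10])
print(urls)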
web_retrieval.py
CHANGED

@@ -9,6 +9,25 @@ import itertools
 import numpy as np
 from time import time
 
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+
+import spacy
+from spacy.cli import download
+
+try:
+    nlp = spacy.load("en_core_web_lg")
+except:
+    # If loading fails, download the model
+    download("en_core_web_lg")
+    nlp = spacy.load("en_core_web_lg")
+
+
+def extract_entities(text):
+    text = nlp(text)
+    ents = list({ent.text for ent in text.ents})
+    return ents
+
 
 def is_tag_visible(element: bs4.element) -> bool:
     """Determines if an HTML element is visible.
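Aside: the spaCy block added above feeds the query expansion in search_google further down in this diff, where each named entity found in the claim becomes an additional Google query. A small sketch of that expansion step, reusing the same model-loading fallback and extract_entities logic; the example claim and the printed set are illustrative, not taken from the repository.

import spacy
from spacy.cli import download

try:
    nlp = spacy.load("en_core_web_lg")
except OSError:
    # Download the model on first use (the diff uses a bare except for the same purpose).
    download("en_core_web_lg")
    nlp = spacy.load("en_core_web_lg")

def extract_entities(text):
    # Unique entity strings found in the text.
    return list({ent.text for ent in nlp(text).ents})

claim = "Angela Merkel was born in Hamburg in 1954."
# One search query per unique string: the original claim plus each extracted entity,
# mirroring set([query] + list(set(extract_entities(query)))) in search_google.
queries = set([claim] + extract_entities(claim))
print(queries)  # e.g. {'Angela Merkel was born in Hamburg in 1954.', 'Angela Merkel', 'Hamburg', '1954'}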
@@ -30,7 +49,7 @@ def is_tag_visible(element: bs4.element) -> bool:
     return True
 
 
-def scrape_url(url: str, timeout: float = 3) -> Tuple[str, str]:
+def scrape_url(url: str) -> Tuple[str, str]:
     """Scrapes a URL for all text information.
 
     Args:
@@ -42,9 +61,13 @@ def scrape_url(url: str, timeout: float = 3) -> Tuple[str, str]:
     """
     # Scrape the URL
     try:
-
-
-
+        session = requests.Session()
+        retry = Retry(connect=3, backoff_factor=0.5)
+        adapter = HTTPAdapter(max_retries=retry)
+        session.mount('http://', adapter)
+        session.mount('https://', adapter)
+        response = session.get(url)
+    except Exception as _:
         return None, url
 
     # Extract out all text from the tags
@@ -84,25 +107,31 @@ def search_google(query:str, num_web_pages:int=10, timeout:int=6, save_url:str='
     lang = "en"
 
     # scrape google results
-
-    for
-
-
-
-
-
-
-
-
-
 
+    all_urls = []
+    for search_query in set([query] + list(set(extract_entities(query)))):
+        for page in range(0, num_web_pages, 10):
+            # here page is google search's bottom page meaning, click 2 -> start=10
+            # url = "https://www.google.com/search?q={}&start={}".format(query, page)
+            url = "https://www.google.com/search?q={}&lr=lang_{}&hl={}&start={}".format(search_query, lang, lang, page)
+            r = requests.get(url, headers=headers, timeout=timeout)
+            # collect all urls by regular expression
+            # how to do if I just want to have the returned top-k pages?
+            urls = re.findall('href="(https?://.*?)"', r.text)
+            urls = [url for url in urls if 'google.com' not in url and '.pdf' not in url] # can be inproved based on TF-IDF later
+
+            all_urls.extend(urls)
+
+    all_urls_final = []
+    for url in all_urls:
+        if url not in all_urls_final:
+            all_urls_final.append(url)
 
     # save all url into a txt file
     if not save_url == "":
        with open(save_url, 'w') as file:
-            for url in
+            for url in all_urls_final:
                file.write(url + '\n')
-    return
+    return all_urls_final
 
 
 def order_doc_score_url(used_chunk, support_prob_per_chunk, urls, allow_duplicated_urls=False):
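The retry-enabled session introduced in scrape_url is the standard requests/urllib3 pattern: mount an HTTPAdapter configured with a Retry policy so transient connection failures are retried with backoff instead of immediately returning None. A minimal sketch under that assumption, with fetch_with_retries as a hypothetical helper name:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def fetch_with_retries(url):
    # Hypothetical helper wrapping the session setup scrape_url now performs.
    session = requests.Session()
    # Retry failed connection attempts up to 3 times, backing off between tries.
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    try:
        return session.get(url).text
    except Exception:
        # Mirror scrape_url: signal failure to the caller instead of raising.
        return None

print(fetch_with_retries("https://example.com") is not None)

Note that scrape_url's previous timeout parameter is gone from the new signature, so fetches now rely on the retry policy rather than a per-request timeout.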