Commit d1ac8cf (parent: bcae708)
replaced chromedriver/selenium with zenrows

web_search.py CHANGED (+54 -53)
@@ -35,19 +35,19 @@ def search(msg, query_phrase):
# query_phrase, keywords = ut.get_search_phrase_and_keywords(msg, [])
google_text = ""
try:
-
+ print(f"asking google {msg}; rephrased: {query_phrase}")
google_text, urls_all, urls_used, tried_index, urls_tried = search_google(msg, query_phrase)
except:
traceback.print_exc()

-
+ print("\n\nFinal response: ")

for item in google_text:
-
+ print(
f"\n##############################################################################################\nSource: {item['source']}"
)
-
-
+ print(f"{item['text']}")
+ print(f"URL: {item['url']}")
return google_text
except KeyboardInterrupt:
traceback.print_exc()
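
Note: the prints added in this hunk assume each entry in google_text is a dict with 'source', 'text', and 'url' keys, which is also what search() returns to its caller. A minimal usage sketch (the query strings and the import are illustrative, not taken from this commit):

    # Usage sketch: search() returns the same list of {'source', 'text', 'url'}
    # dicts that the new prints above walk over.
    from web_search import search

    results = search(
        "what's new with the zenrows python sdk?",   # original user message (made up)
        "zenrows python sdk news",                   # rephrased query phrase (made up)
    )
    for item in results:
        print(item["source"], item["url"], len(item["text"]))
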
@@ -88,6 +88,7 @@ def process_url(url, timeout):
# response = driver.page_source
client = ZenRowsClient(os.getenv('zenrows_api_key'))
response = client.get(url)
+ print(f'got response, status: {response.status_code}')
# result = response_text_extract(url=url, response=response)
result = response.text
except Exception:
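
For reference, the ZenRows calls used in this hunk in isolation: a minimal sketch that mirrors process_url()'s new fetch path, assuming only the zenrows package and the zenrows_api_key environment variable shown in the diff (fetch_page_text is a placeholder name):

    # Minimal sketch of the new fetch path: one ZenRows GET replaces the old
    # chromedriver/selenium page_source round trip.
    import os
    from zenrows import ZenRowsClient

    def fetch_page_text(url: str) -> str:
        client = ZenRowsClient(os.getenv("zenrows_api_key"))
        try:
            response = client.get(url)  # proxied HTTP fetch via ZenRows
            print(f"got response, status: {response.status_code}")
            return response.text        # raw page text, as assigned to `result` above
        except Exception:
            return ""
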
@@ -101,16 +102,16 @@ def process_url(url, timeout):
# return "", url
except Exception:
traceback.print_exc()
-
+ print(f"{site} err")
pass
-
+ print(f"Processed {site}: {len(response.text)} / {len(result)} {int((time.time() - start_time) * 1000)} ms")
return result, url


def process_urls(urls):
# Create a ThreadPoolExecutor with 5 worker threads
response = []
-
+ print("entering process urls")
full_text = ""
used_index = 0
urls_used = ["" for i in range(30)]
@@ -139,7 +140,7 @@ def process_urls(urls):
urls_tried[tried_index] = url
tried_index += 1
urls.remove(url)
-
+ print(f"queued {ut.extract_site(url)}, {timeout}")
# Process the responses as they arrive
# for future in in_process:
# if future.done():
@@ -149,7 +150,7 @@ def process_urls(urls):
if len(result) > 0:
urls_used[used_index] = url
used_index += 1
-
+ print(
f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
)
if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
@@ -163,7 +164,7 @@ def process_urls(urls):

if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
# executor.shutdown(wait=False)
-
+ print(
f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
)
return response, used_index, urls_used, tried_index, urls_tried
@@ -171,7 +172,7 @@ def process_urls(urls):
except:
traceback.print_exc()
# executor.shutdown(wait=False)
-
+ print(
f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
)
return response, urls_used, tried_index, urls_tried
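
The prints above all sit inside process_urls(), which, per the surrounding context, fans fetches out over a 5-thread ThreadPoolExecutor and bails out after roughly 28 seconds. A self-contained sketch of that pattern, with the fetch callable passed in as a placeholder rather than taken from this file:

    # Sketch of the process_urls() concurrency pattern: a small thread pool,
    # a hard wall-clock budget, and filtering of blocked/error pages.
    import time
    from concurrent.futures import ThreadPoolExecutor, TimeoutError, as_completed

    BLOCK_MARKERS = ("an error has occurred", "permission to view this page",
                     "403 error", "have been blocked", "too many requests")

    def gather_pages(urls, fetch, budget_secs=28, max_workers=5):
        start = time.time()
        texts = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {executor.submit(fetch, u): u for u in urls}
            try:
                for future in as_completed(futures, timeout=budget_secs):
                    result = future.result()  # assumes fetch returns "" on failure
                    if result and not any(m in result.lower() for m in BLOCK_MARKERS):
                        texts.append(result)
                    if time.time() - start > budget_secs:
                        break
            except TimeoutError:
                print(f"****** exiting early, {len(texts)} pages, {int(time.time() - start)} secs")
        return texts
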
@@ -182,11 +183,11 @@ def extract_subtext(text):


def request_google(query_phrase):
-
+ print(f"***** search {query_phrase}")
sort = "&sort=date-sdate:d:w"
if "today" in query_phrase or "latest" in query_phrase:
sort = "&sort=date-sdate:d:s"
-
+ print(f"search for: {query_phrase}")
google_query = en.quote(query_phrase)
response = []
try:
@@ -203,14 +204,14 @@ def request_google(query_phrase):
)
response = requests.get(url)
response_json = json.loads(response.text)
-
+ print(f"***** google search {int((time.time() - start_wall_time) * 10) / 10} sec")
except:
traceback.print_exc()
return []

# see if we got anything useful from Google
if "items" not in response_json.keys():
-
+ print("no return from google ...", response, response_json.keys())
return []

urls = []
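
The unchanged parts of request_google() around this hunk build a Google Custom Search JSON API request; the en.quote call and the sort strings above come from it. A sketch of that request, treating everything outside the hunk as an assumption, including the endpoint parameters and env-var names:

    # Sketch of the Custom Search request that request_google() wraps. The
    # key/cx env-var names are assumptions; the sort strings are from the diff.
    import json
    import os
    import urllib.parse as en   # the diff's en.quote(...) suggests this alias
    import requests

    def google_result_urls(query_phrase, num=10):
        sort = "&sort=date-sdate:d:w"
        if "today" in query_phrase or "latest" in query_phrase:
            sort = "&sort=date-sdate:d:s"
        url = (
            "https://www.googleapis.com/customsearch/v1"
            f"?key={os.getenv('google_api_key')}&cx={os.getenv('google_cse_id')}"
            f"&q={en.quote(query_phrase)}&num={num}{sort}"
        )
        response_json = json.loads(requests.get(url).text)
        if "items" not in response_json:
            print("no return from google ...", response_json.keys())
            return []
        return [item["link"] for item in response_json["items"]]
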
@@ -224,43 +225,43 @@ def request_google(query_phrase):
return urls


- def response_text_extract(url, response):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+ # def response_text_extract(url, response):
+ # extract_text = ""
+ # if url.endswith("pdf"):
+ # pass
+ # else:
+ # if response is not None:
+ # elements = partition_html(text=response)
+ # str_elements = []
+ # logger.info('\n***** elements')
+ # for e in elements:
+ # stre = str(e).replace(" ", " ")
+ # str_elements.append(stre)
+ # extract_text = ''.join(extract_subtext(str_elements))
+ # logger.info(
+ # f"***** unstructured found {len(elements)} elements, {sum([len(str(e)) for e in elements])} raw chars, {len(extract_text)} extract"
+ # )
+ #
+ # if len(extract_text.strip()) < 8:
+ # return ""
+ # else:
+ # return extract_text


- def extract_items_from_numbered_list(text):
-
-
-
-
-
-
-
-
-
-
-
-
-
+ # def extract_items_from_numbered_list(text):
+ # items = ""
+ # elements = text.split("\n")
+ # for candidate in elements:
+ # candidate = candidate.lstrip(". \t")
+ # if len(candidate) > 4 and candidate[0].isdigit():
+ # candidate = candidate[1:].lstrip(". ")
+ # if (
+ # len(candidate) > 4 and candidate[0].isdigit()
+ # ): # strip second digit if more than 10 items
+ # candidate = candidate[1:].lstrip(". ")
+ # logger.info("E {}".format(candidate))
+ # items += candidate + " "
+ # return items


def search_google(original_query, query_phrase):
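
Both helpers above are commented out rather than deleted. If the unstructured-based extraction is ever revived, the core of response_text_extract() reduces to roughly the following: a sketch restating the commented lines, substituting a plain join for the repo's extract_subtext helper:

    # Sketch of the HTML-to-text step the commented-out code performed with
    # the `unstructured` library; html_to_text is a placeholder name.
    from unstructured.partition.html import partition_html

    def html_to_text(html: str) -> str:
        if not html:
            return ""
        elements = partition_html(text=html)   # split the page into text elements
        extract_text = " ".join(str(e) for e in elements)
        return extract_text if len(extract_text.strip()) >= 8 else ""
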
@@ -295,8 +296,8 @@ def search_google(original_query, query_phrase):
# initialize scan of Google urls
start_wall_time = time.time()
full_text, urls_used, tried_index, urls_tried = process_urls(all_urls)
-
-
+ print(f"***** urls_processed {int((time.time() - start_wall_time) * 10) / 10} sec")
+ print("return from url processsing")
except:
traceback.print_exc()
return full_text, all_urls, urls_used, tried_index, urls_tried