arabellastrange committed
Commit d1ac8cf · 1 Parent(s): bcae708

replaced chromedriver/selenium with zenrows

Files changed (1)
  1. web_search.py +54 -53
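The substance of the commit is the fetch path in `process_url`: the Selenium/ChromeDriver `driver.page_source` call is replaced by a ZenRows API request, and the `logger.info` calls throughout are switched to `print`. A minimal sketch of the new fetch path, assuming the `zenrows` SDK and the `zenrows_api_key` environment variable used in the diff (the `fetch_page_text` helper name is illustrative, not part of the file):

```python
# Minimal sketch of the ZenRows-based fetch that replaces driver.get()/driver.page_source.
# Assumes the zenrows SDK (pip install zenrows) and a zenrows_api_key env var, as in the diff.
import os
from zenrows import ZenRowsClient

def fetch_page_text(url: str) -> str:
    client = ZenRowsClient(os.getenv("zenrows_api_key"))
    response = client.get(url)                            # proxied HTTP GET; no local browser needed
    print(f"got response, status: {response.status_code}")
    return response.text                                   # raw HTML, used in place of driver.page_source
```

ZenRows returns a `requests`-style response object, which is why the downstream code can keep reading `response.text` and `response.status_code` exactly as the diff shows.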
web_search.py CHANGED
@@ -35,19 +35,19 @@ def search(msg, query_phrase):
         # query_phrase, keywords = ut.get_search_phrase_and_keywords(msg, [])
         google_text = ""
         try:
-            logger.info(f"asking google {msg}; rephrased: {query_phrase}")
+            print(f"asking google {msg}; rephrased: {query_phrase}")
             google_text, urls_all, urls_used, tried_index, urls_tried = search_google(msg, query_phrase)
         except:
             traceback.print_exc()
 
-        logger.info("\n\nFinal response: ")
+        print("\n\nFinal response: ")
 
         for item in google_text:
-            logger.info(
+            print(
                 f"\n##############################################################################################\nSource: {item['source']}"
             )
-            logger.info(f"{item['text']}")
-            logger.info(f"URL: {item['url']}")
+            print(f"{item['text']}")
+            print(f"URL: {item['url']}")
         return google_text
     except KeyboardInterrupt:
         traceback.print_exc()
@@ -88,6 +88,7 @@ def process_url(url, timeout):
         # response = driver.page_source
         client = ZenRowsClient(os.getenv('zenrows_api_key'))
         response = client.get(url)
+        print(f'got response, status: {response.status_code}')
         # result = response_text_extract(url=url, response=response)
         result = response.text
     except Exception:
@@ -101,16 +102,16 @@ def process_url(url, timeout):
         # return "", url
     except Exception:
         traceback.print_exc()
-        logger.info(f"{site} err")
+        print(f"{site} err")
         pass
-    logger.info(f"Processed {site}: {len(response.text)} / {len(result)} {int((time.time() - start_time) * 1000)} ms")
+    print(f"Processed {site}: {len(response.text)} / {len(result)} {int((time.time() - start_time) * 1000)} ms")
     return result, url
 
 
 def process_urls(urls):
     # Create a ThreadPoolExecutor with 5 worker threads
     response = []
-    logger.info("entering process urls")
+    print("entering process urls")
     full_text = ""
     used_index = 0
     urls_used = ["" for i in range(30)]
@@ -139,7 +140,7 @@ def process_urls(urls):
                 urls_tried[tried_index] = url
                 tried_index += 1
                 urls.remove(url)
-                logger.info(f"queued {ut.extract_site(url)}, {timeout}")
+                print(f"queued {ut.extract_site(url)}, {timeout}")
             # Process the responses as they arrive
             # for future in in_process:
             #     if future.done():
@@ -149,7 +150,7 @@ def process_urls(urls):
                     if len(result) > 0:
                         urls_used[used_index] = url
                         used_index += 1
-                        logger.info(
+                        print(
                             f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
                         )
                         if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
@@ -163,7 +164,7 @@ def process_urls(urls):
 
                 if (len(urls) == 0 and len(in_process) == 0) or (time.time() - start_time > 28):
                     # executor.shutdown(wait=False)
-                    logger.info(
+                    print(
                         f"n****** exiting process urls early {len(response)} {int(time.time() - start_time)} secs\n"
                     )
                     return response, used_index, urls_used, tried_index, urls_tried
@@ -171,7 +172,7 @@ def process_urls(urls):
     except:
         traceback.print_exc()
     # executor.shutdown(wait=False)
-    logger.info(
+    print(
         f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
     )
     return response, urls_used, tried_index, urls_tried
@@ -182,11 +183,11 @@ def extract_subtext(text):
 
 
 def request_google(query_phrase):
-    logger.info(f"***** search {query_phrase}")
+    print(f"***** search {query_phrase}")
     sort = "&sort=date-sdate:d:w"
     if "today" in query_phrase or "latest" in query_phrase:
         sort = "&sort=date-sdate:d:s"
-    # logger.info(f"search for: {query_phrase}")
+    print(f"search for: {query_phrase}")
     google_query = en.quote(query_phrase)
     response = []
     try:
@@ -203,14 +204,14 @@ def request_google(query_phrase):
         )
         response = requests.get(url)
         response_json = json.loads(response.text)
-        logger.info(f"***** google search {int((time.time() - start_wall_time) * 10) / 10} sec")
+        print(f"***** google search {int((time.time() - start_wall_time) * 10) / 10} sec")
     except:
         traceback.print_exc()
         return []
 
     # see if we got anything useful from Google
     if "items" not in response_json.keys():
-        logger.info("no return from google ...", response, response_json.keys())
+        print("no return from google ...", response, response_json.keys())
         return []
 
     urls = []
@@ -224,43 +225,43 @@ def request_google(query_phrase):
     return urls
 
 
-def response_text_extract(url, response):
-    extract_text = ""
-    if url.endswith("pdf"):
-        pass
-    else:
-        if response is not None:
-            elements = partition_html(text=response)
-            str_elements = []
-            logger.info('\n***** elements')
-            for e in elements:
-                stre = str(e).replace(" ", " ")
-                str_elements.append(stre)
-            extract_text = ''.join(extract_subtext(str_elements))
-            logger.info(
-                f"***** unstructured found {len(elements)} elements, {sum([len(str(e)) for e in elements])} raw chars, {len(extract_text)} extract"
-            )
-
-    if len(extract_text.strip()) < 8:
-        return ""
-    else:
-        return extract_text
+# def response_text_extract(url, response):
+#     extract_text = ""
+#     if url.endswith("pdf"):
+#         pass
+#     else:
+#         if response is not None:
+#             elements = partition_html(text=response)
+#             str_elements = []
+#             logger.info('\n***** elements')
+#             for e in elements:
+#                 stre = str(e).replace(" ", " ")
+#                 str_elements.append(stre)
+#             extract_text = ''.join(extract_subtext(str_elements))
+#             logger.info(
+#                 f"***** unstructured found {len(elements)} elements, {sum([len(str(e)) for e in elements])} raw chars, {len(extract_text)} extract"
+#             )
+#
+#     if len(extract_text.strip()) < 8:
+#         return ""
+#     else:
+#         return extract_text
 
 
-def extract_items_from_numbered_list(text):
-    items = ""
-    elements = text.split("\n")
-    for candidate in elements:
-        candidate = candidate.lstrip(". \t")
-        if len(candidate) > 4 and candidate[0].isdigit():
-            candidate = candidate[1:].lstrip(". ")
-            if (
-                len(candidate) > 4 and candidate[0].isdigit()
-            ):  # strip second digit if more than 10 items
-                candidate = candidate[1:].lstrip(". ")
-            logger.info("E {}".format(candidate))
-            items += candidate + " "
-    return items
+# def extract_items_from_numbered_list(text):
+#     items = ""
+#     elements = text.split("\n")
+#     for candidate in elements:
+#         candidate = candidate.lstrip(". \t")
+#         if len(candidate) > 4 and candidate[0].isdigit():
+#             candidate = candidate[1:].lstrip(". ")
+#             if (
+#                 len(candidate) > 4 and candidate[0].isdigit()
+#             ):  # strip second digit if more than 10 items
+#                 candidate = candidate[1:].lstrip(". ")
+#             logger.info("E {}".format(candidate))
+#             items += candidate + " "
+#     return items
 
 
 def search_google(original_query, query_phrase):
@@ -295,8 +296,8 @@ def search_google(original_query, query_phrase):
         # initialize scan of Google urls
         start_wall_time = time.time()
         full_text, urls_used, tried_index, urls_tried = process_urls(all_urls)
-        logger.info(f"***** urls_processed {int((time.time() - start_wall_time) * 10) / 10} sec")
-        logger.info("return from url processsing")
+        print(f"***** urls_processed {int((time.time() - start_wall_time) * 10) / 10} sec")
+        print("return from url processsing")
     except:
         traceback.print_exc()
     return full_text, all_urls, urls_used, tried_index, urls_tried
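For reference, the calling convention is unchanged by this commit: `search(msg, query_phrase)` still returns the list of result items iterated in the printing loop above, each carrying `source`, `text`, and `url`. A hypothetical caller, assuming the module is importable and the Google and ZenRows credentials are configured:

```python
# Hypothetical usage sketch; assumes web_search.py is importable and API keys are set in the environment.
import web_search

results = web_search.search("what is new in fusion energy today", "latest fusion energy news")
for item in results:
    print(item["source"], item["url"])   # keys used by the printing loop in search()
    print(item["text"][:200])            # extracted page text, truncated for display
```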