arabellastrange committed
Commit 6ce8a36 · 1 Parent(s): 200cc44

concurrent pool for process_url

Files changed (1): web_search.py +18 -26
web_search.py CHANGED
@@ -6,20 +6,14 @@ import time
 import traceback
 import urllib.parse as en
 import warnings
+from concurrent.futures import ThreadPoolExecutor
 from itertools import zip_longest
 
 import requests
 from zenrows import ZenRowsClient
 
-# this import style works in pycharm
 from llmsearch import utilityV2 as ut
 
-# this import style works on sever + vs code
-# import utils
-# from llmsearch import google_search_concurrent as gs
-# from llmsearch import meta as mt
-# from llmsearch import utilityV2 as ut
-
 logger = logging.getLogger("agent_logger")
 
 
@@ -56,6 +50,7 @@ def search(msg, query_phrase):
 
 # Define a function to make a single URL request and process the response
 def process_url(url):
+    processed_page = []
     start_time = time.time()
     site = ut.extract_site(url)
     result = ""
@@ -68,44 +63,41 @@ def process_url(url):
             response = client.get(url)
             print(f'got response, status: {response.status_code}')
             result = response.text
+            if len(result) > 0:
+                if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
+                    processed_page.append(
+                        {
+                            "source": ut.extract_domain(url),
+                            "url": url,
+                            "text": result,
+                        }
+                    )
         except Exception:
             traceback.print_exc()
-            return "", url
+            return processed_page
     except Exception:
         traceback.print_exc()
         print(f"{site} err")
         pass
     print(f"Processed {site}: {len(result)} {int((time.time() - start_time) * 1000)} ms")
-    return result, url
+    return processed_page
 
 
 def process_urls(urls):
-    response = []
     print(f"entering process urls: {len(urls)} found. {urls}")
     start_time = time.time()
+    results = []
 
     try:
-        for url in urls:
-            result, url = process_url(url)
-            if len(result) > 0:
-                if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
-                    print(
-                        f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
-                    )
-                    response.append(
-                        {
-                            "source": ut.extract_domain(url),
-                            "url": url,
-                            "text": result,
-                        }
-                    )
+        with ThreadPoolExecutor(max_workers=len(urls)) as pool:
+            results = pool.map(process_url, urls)
     except:
         traceback.print_exc()
 
     print(
-        f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
+        f"\n*****processed all urls {len(results)} {int(time.time() - start_time)} secs"
     )
-    return response
+    return results
 
 
 def extract_subtext(text):
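For context on the pattern this commit adopts: ThreadPoolExecutor.map fans process_url out across worker threads and yields each call's return value lazily, in input order. Below is a minimal, self-contained sketch of the same fan-out-and-flatten pattern; fetch_page, fetch_all, and the example URLs are hypothetical stand-ins for illustration, not code from this repository.

# Minimal sketch (hypothetical names) of the fan-out pattern used above:
# fetch_page stands in for process_url, which wraps the ZenRows client.
from concurrent.futures import ThreadPoolExecutor
import time


def fetch_page(url):
    # Like the new process_url: return [] on failure, else a one-record list.
    time.sleep(0.1)  # simulate network latency
    return [{"source": url.split("/")[2], "url": url, "text": "<html>stub</html>"}]


def fetch_all(urls):
    # Cap the pool size; max_workers=len(urls), as in the commit, is fine for
    # short URL lists but spawns one thread per URL on long ones.
    with ThreadPoolExecutor(max_workers=min(16, max(1, len(urls)))) as pool:
        per_url = pool.map(fetch_page, urls)  # lazy iterator of lists
        # Flatten and materialize while the pool is still open; the map
        # iterator supports no len(), so collect results before counting them.
        return [record for records in per_url for record in records]


if __name__ == "__main__":
    pages = fetch_all(["https://example.com/a", "https://example.com/b"])
    print(f"collected {len(pages)} pages")

One caveat the sketch works around: Executor.map returns an iterator, not a list, so the diff's final print of len(results) would raise a TypeError on the raw map result; materializing the results into a list first, as above, avoids that and also flattens the per-URL lists into one sequence of page records.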