arabellastrange committed
Commit · 6ce8a36 · Parent: 200cc44

concurrent pool for process_url

Files changed (1): web_search.py (+18 −26)
web_search.py CHANGED
@@ -6,20 +6,14 @@ import time
 import traceback
 import urllib.parse as en
 import warnings
+from concurrent.futures import ThreadPoolExecutor
 from itertools import zip_longest
 
 import requests
 from zenrows import ZenRowsClient
 
-# this import style works in pycharm
 from llmsearch import utilityV2 as ut
 
-# this import style works on sever + vs code
-# import utils
-# from llmsearch import google_search_concurrent as gs
-# from llmsearch import meta as mt
-# from llmsearch import utilityV2 as ut
-
 logger = logging.getLogger("agent_logger")
 
 
@@ -56,6 +50,7 @@ def search(msg, query_phrase):
 
 # Define a function to make a single URL request and process the response
 def process_url(url):
+    processed_page = []
     start_time = time.time()
     site = ut.extract_site(url)
     result = ""
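This hunk adds the processed_page accumulator; the next hunk makes every exit path of process_url return it, so callers always receive a list (empty on failure). A runnable sketch of the resulting pattern follows the final hunk.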
@@ -68,44 +63,41 @@ def process_url(url):
             response = client.get(url)
             print(f'got response, status: {response.status_code}')
             result = response.text
+            if len(result) > 0:
+                if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
+                    processed_page.append(
+                        {
+                            "source": ut.extract_domain(url),
+                            "url": url,
+                            "text": result,
+                        }
+                    )
         except Exception:
             traceback.print_exc()
-            return
+            return processed_page
     except Exception:
         traceback.print_exc()
         print(f"{site} err")
         pass
     print(f"Processed {site}: {len(result)} {int((time.time() - start_time) * 1000)} ms")
-    return
+    return processed_page
 
 
 def process_urls(urls):
-    response = []
     print(f"entering process urls: {len(urls)} found. {urls}")
     start_time = time.time()
+    results = []
 
     try:
-
-
-        if len(result) > 0:
-            if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
-                print(
-                    f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
-                )
-                response.append(
-                    {
-                        "source": ut.extract_domain(url),
-                        "url": url,
-                        "text": result,
-                    }
-                )
+        with ThreadPoolExecutor(max_workers=len(urls)) as pool:
+            results = pool.map(process_url, urls)
     except:
         traceback.print_exc()
 
     print(
-        f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
+        f"\n*****processed all urls {len(results)} {int(time.time() - start_time)} secs"
     )
-    return
+    return results
 
 
 def extract_subtext(text):
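Net effect of the commit: process_url now builds and returns processed_page, and process_urls fans the requests out over a thread pool instead of handling the pages inline. Three caveats in the committed version are worth flagging: ThreadPoolExecutor(max_workers=len(urls)) raises ValueError when urls is empty; pool.map returns a lazy iterator, so the len(results) call in the final print raises TypeError unless the iterator is first wrapped in list(); and the carried-over filter '"403 ERROR" not in result.lower()' can never reject a page, because a lowercased string cannot contain uppercase characters. The sketch below is an illustration of the same pattern with those issues addressed, not the repo's code: fetch_text and extract_domain are hypothetical stand-ins for the ZenRows client call and ut.extract_domain.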
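# Minimal, self-contained sketch of the pattern this commit adopts (not the
# repo's code): fan process_url out over a thread pool, then flatten the
# per-URL result lists. fetch_text() and extract_domain() are hypothetical
# stand-ins for the ZenRows client call and ut.extract_domain.
import time
import traceback
import urllib.parse
from concurrent.futures import ThreadPoolExecutor

# Lowercased markers, compared against result.lower() below so the
# "403 error" check can actually match.
BLOCK_MARKERS = (
    "an error has occurred",
    "permission to view this page",
    "403 error",
    "have been blocked",
    "too many requests",
)


def extract_domain(url):
    # Stand-in for ut.extract_domain.
    return urllib.parse.urlparse(url).netloc


def fetch_text(url):
    # Stand-in for client.get(url).text (ZenRows in web_search.py).
    import requests
    return requests.get(url, timeout=10).text


def process_url(url):
    # Return a one-element list on success and an empty list on failure,
    # so the caller can aggregate without None checks.
    processed_page = []
    start_time = time.time()
    try:
        result = fetch_text(url)
        lowered = result.lower()
        if result and not any(marker in lowered for marker in BLOCK_MARKERS):
            processed_page.append(
                {"source": extract_domain(url), "url": url, "text": result}
            )
    except Exception:
        traceback.print_exc()
    print(f"Processed {url}: {len(processed_page)} page(s) "
          f"{int((time.time() - start_time) * 1000)} ms")
    return processed_page


def process_urls(urls):
    if not urls:
        return []  # max_workers=0 would raise ValueError
    start_time = time.time()
    # pool.map returns a lazy iterator; wrap it in list() so len() works
    # and the results are concrete before the pool shuts down.
    with ThreadPoolExecutor(max_workers=min(len(urls), 16)) as pool:
        per_url = list(pool.map(process_url, urls))
    print(f"processed all urls {len(per_url)} {int(time.time() - start_time)} secs")
    # Flatten the per-URL lists into a single list of page dicts.
    return [page for pages in per_url for page in pages]

The cap of 16 workers is an arbitrary illustration; the committed code sizes the pool to len(urls), which spawns one thread per URL however long the list is.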