arabellastrange committed
Commit · 6ce8a36 · Parent: 200cc44

concurrent pool for process_url

Files changed (1): web_search.py (+18 −26)
web_search.py CHANGED
@@ -6,20 +6,14 @@ import time
 import traceback
 import urllib.parse as en
 import warnings
+from concurrent.futures import ThreadPoolExecutor
 from itertools import zip_longest
 
 import requests
 from zenrows import ZenRowsClient
 
-# this import style works in pycharm
 from llmsearch import utilityV2 as ut
 
-# this import style works on sever + vs code
-# import utils
-# from llmsearch import google_search_concurrent as gs
-# from llmsearch import meta as mt
-# from llmsearch import utilityV2 as ut
-
 logger = logging.getLogger("agent_logger")
 
 
@@ -56,6 +50,7 @@ def search(msg, query_phrase):
 
 # Define a function to make a single URL request and process the response
 def process_url(url):
+    processed_page = []
     start_time = time.time()
     site = ut.extract_site(url)
     result = ""
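This hunk adds the processed_page accumulator; the next hunk makes every exit path of process_url return it, so callers always receive a list (empty on failure). A runnable sketch of the resulting pattern follows the final hunk.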
@@ -68,44 +63,41 @@ def process_url(url):
             response = client.get(url)
             print(f'got response, status: {response.status_code}')
             result = response.text
+            if len(result) > 0:
+                if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
+                    processed_page.append(
+                        {
+                            "source": ut.extract_domain(url),
+                            "url": url,
+                            "text": result,
+                        }
+                    )
         except Exception:
             traceback.print_exc()
-            return
+            return processed_page
     except Exception:
         traceback.print_exc()
         print(f"{site} err")
         pass
     print(f"Processed {site}: {len(result)} {int((time.time() - start_time) * 1000)} ms")
-    return
+    return processed_page
 
 
 def process_urls(urls):
-    response = []
     print(f"entering process urls: {len(urls)} found. {urls}")
     start_time = time.time()
+    results = []
 
     try:
-
-
-        if len(result) > 0:
-            if "an error has occurred" not in result.lower() and "permission to view this page" not in result.lower() and "403 ERROR" not in result.lower() and "have been blocked" not in result.lower() and "too many requests" not in result.lower():
-                print(
-                    f"adding {len(result)} chars from {ut.extract_site(url)} to {len(response)} prior responses"
-                )
-                response.append(
-                    {
-                        "source": ut.extract_domain(url),
-                        "url": url,
-                        "text": result,
-                    }
-                )
+        with ThreadPoolExecutor(max_workers=len(urls)) as pool:
+            results = pool.map(process_url, urls)
     except:
         traceback.print_exc()
 
     print(
-        f"\n*****processed all urls {len(response)} {int(time.time() - start_time)} secs"
+        f"\n*****processed all urls {len(results)} {int(time.time() - start_time)} secs"
     )
-    return
+    return results
 
 
 def extract_subtext(text):
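Net effect of the commit: process_url now builds and returns processed_page, and process_urls fans the requests out over a thread pool instead of handling the pages inline. Three caveats in the committed version are worth flagging: ThreadPoolExecutor(max_workers=len(urls)) raises ValueError when urls is empty; pool.map returns a lazy iterator, so the len(results) call in the final print raises TypeError unless the iterator is first wrapped in list(); and the carried-over filter '"403 ERROR" not in result.lower()' can never reject a page, because a lowercased string cannot contain uppercase characters. The sketch below is an illustration of the same pattern with those issues addressed, not the repo's code: fetch_text and extract_domain are hypothetical stand-ins for the ZenRows client call and ut.extract_domain.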
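# Minimal, self-contained sketch of the pattern this commit adopts (not the
# repo's code): fan process_url out over a thread pool, then flatten the
# per-URL result lists. fetch_text() and extract_domain() are hypothetical
# stand-ins for the ZenRows client call and ut.extract_domain.
import time
import traceback
import urllib.parse
from concurrent.futures import ThreadPoolExecutor

# Lowercased markers, compared against result.lower() below so the
# "403 error" check can actually match.
BLOCK_MARKERS = (
    "an error has occurred",
    "permission to view this page",
    "403 error",
    "have been blocked",
    "too many requests",
)


def extract_domain(url):
    # Stand-in for ut.extract_domain.
    return urllib.parse.urlparse(url).netloc


def fetch_text(url):
    # Stand-in for client.get(url).text (ZenRows in web_search.py).
    import requests
    return requests.get(url, timeout=10).text


def process_url(url):
    # Return a one-element list on success and an empty list on failure,
    # so the caller can aggregate without None checks.
    processed_page = []
    start_time = time.time()
    try:
        result = fetch_text(url)
        lowered = result.lower()
        if result and not any(marker in lowered for marker in BLOCK_MARKERS):
            processed_page.append(
                {"source": extract_domain(url), "url": url, "text": result}
            )
    except Exception:
        traceback.print_exc()
    print(f"Processed {url}: {len(processed_page)} page(s) "
          f"{int((time.time() - start_time) * 1000)} ms")
    return processed_page


def process_urls(urls):
    if not urls:
        return []  # max_workers=0 would raise ValueError
    start_time = time.time()
    # pool.map returns a lazy iterator; wrap it in list() so len() works
    # and the results are concrete before the pool shuts down.
    with ThreadPoolExecutor(max_workers=min(len(urls), 16)) as pool:
        per_url = list(pool.map(process_url, urls))
    print(f"processed all urls {len(per_url)} {int(time.time() - start_time)} secs")
    # Flatten the per-URL lists into a single list of page dicts.
    return [page for pages in per_url for page in pages]

The cap of 16 workers is an arbitrary illustration; the committed code sizes the pool to len(urls), which spawns one thread per URL however long the list is.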