Spaces: OEvortex/Webscout-API

Running

App Files Community

Abhaykoul committed on Jul 26

Commit

e344f2d

•

1 Parent(s): 1ee12e5

Update app.py

Browse files

Files changed (1) hide show

app.py +21 -13

app.py CHANGED Viewed

@@ -177,10 +177,13 @@ async def chat(
 def extract_text_from_webpage(html_content):
     """Extracts visible text from HTML content using BeautifulSoup."""
-    soup = BeautifulSoup(html_content)
-    for tag in soup(["script", "style", "header", "footer"]):
         tag.extract()
-    return soup.get_text(strip=True)
 async def fetch_and_extract(url, max_chars, proxy: Optional[str] = None):
     """Fetches a URL and extracts text asynchronously."""
@@ -245,15 +248,19 @@ async def web_search_and_extract(
 def extract_text_from_webpage2(html_content):
     """Extracts visible text from HTML content using BeautifulSoup."""
-    soup = BeautifulSoup(html_content)
-    for tag in soup(["script", "style", "header", "footer"]):
         tag.extract()
-    return soup.get_text(strip=True)
-def fetch_and_extract2(url, max_chars):
     """Fetches a URL and extracts text using threading."""
     try:
-        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
         response.raise_for_status()
         html_content = response.text
         visible_text = extract_text_from_webpage2(html_content)
@@ -267,19 +274,20 @@ def fetch_and_extract2(url, max_chars):
 @app.get("/api/websearch-and-extract-threading")
 def web_search_and_extract_threading(
     q: str,
-    max_results: int = 10,
     timelimit: Optional[str] = None,
     safesearch: str = "moderate",
     region: str = "wt-wt",
     backend: str = "html",
-    max_chars: int = 10000,
-    extract_only: bool = True
 ):
     """
     Searches using WEBS, extracts text from the top results using threading, and returns both.
     """
     try:
-        with WEBS() as webs:
             # Perform WEBS search
             search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
                                      timelimit=timelimit, backend=backend, max_results=max_results)
@@ -289,7 +297,7 @@ def web_search_and_extract_threading(
             threads = []
             for result in search_results:
                 if 'href' in result:
-                    thread = threading.Thread(target=lambda: extracted_results.append(fetch_and_extract2(result['href'], max_chars)))
                     threads.append(thread)
                     thread.start()

 def extract_text_from_webpage(html_content):
     """Extracts visible text from HTML content using BeautifulSoup."""
+    soup = BeautifulSoup(html_content, "html.parser")
+    # Remove unwanted tags
+    for tag in soup(["script", "style", "header", "footer", "nav"]):
         tag.extract()
+    # Get the remaining visible text
+    visible_text = soup.get_text(strip=True)
+    return visible_text
 async def fetch_and_extract(url, max_chars, proxy: Optional[str] = None):
     """Fetches a URL and extracts text asynchronously."""
 def extract_text_from_webpage2(html_content):
     """Extracts visible text from HTML content using BeautifulSoup."""
+    soup = BeautifulSoup(html_content, "html.parser")
+    # Remove unwanted tags
+    for tag in soup(["script", "style", "header", "footer", "nav"]):
         tag.extract()
+    # Get the remaining visible text
+    visible_text = soup.get_text(strip=True)
+    return visible_text
+def fetch_and_extract2(url, max_chars, proxy: Optional[str] = None):
     """Fetches a URL and extracts text using threading."""
+    proxies = {'http': proxy, 'https': proxy} if proxy else None
     try:
+        response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, proxies=proxies)
         response.raise_for_status()
         html_content = response.text
         visible_text = extract_text_from_webpage2(html_content)
 @app.get("/api/websearch-and-extract-threading")
 def web_search_and_extract_threading(
     q: str,
+    max_results: int = 3,
     timelimit: Optional[str] = None,
     safesearch: str = "moderate",
     region: str = "wt-wt",
     backend: str = "html",
+    max_chars: int = 6000,
+    extract_only: bool = True,
+    proxy: Optional[str] = None
 ):
     """
     Searches using WEBS, extracts text from the top results using threading, and returns both.
     """
     try:
+        with WEBS(proxy=proxy) as webs:
             # Perform WEBS search
             search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
                                      timelimit=timelimit, backend=backend, max_results=max_results)
             threads = []
             for result in search_results:
                 if 'href' in result:
+                    thread = threading.Thread(target=lambda: extracted_results.append(fetch_and_extract2(result['href'], max_chars, proxy)))
                     threads.append(thread)
                     thread.start()