client

Sleeping

App Files Files Community

Ashhar committed on Sep 27, 2024

Commit

9dfbf64

1 Parent(s): 19598a5

fixed bug with scraper

Browse files

Files changed (1) hide show

tools/webScraper.py +7 -4

tools/webScraper.py CHANGED Viewed

@@ -3,7 +3,7 @@ from urllib.parse import parse_qs, urlparse
 from bs4 import BeautifulSoup
 import requests
 from typing import TypedDict
-import utils as U
 SIMULATE_BROWSER = os.environ.get("SIMULATE_BROWSER_SEARCH") == "true"
@@ -36,7 +36,7 @@ else:
 def scrapeGoogleSearch(query):
-    U.pprint(f"{SIMULATE_BROWSER=}")
     finalResponse = []
     headers = {
@@ -61,7 +61,6 @@ def scrapeGoogleSearch(query):
             with open("soup_dump.html", "w", encoding="utf-8") as file:
                 file.write(soup.prettify())
-            results = soup.find("body")
             mainDiv = soup.find("div", attrs={"id": "main"})
             answerText = ""
             if SELECTORS.get("answer"):
@@ -80,6 +79,8 @@ def scrapeGoogleSearch(query):
             results = mainDiv.select(SELECTORS["search_results"])
             resultsDesc = mainDiv.select(SELECTORS["search_results_desc"])
             if results:
                 finalResponse.append("Search Results:\n")
@@ -91,7 +92,7 @@ def scrapeGoogleSearch(query):
                     parsedUrl = urlparse(link)
                     urlParams = parse_qs(parsedUrl.query)
                     link = urlParams.get("q", [None])[0]
-                desc = resultsDesc[i].text
                 finalResponse.append(f"Title: {title}")
                 finalResponse.append(f"Description: {desc}")
                 finalResponse.append(f"URL: {link}\n")
@@ -99,3 +100,5 @@ def scrapeGoogleSearch(query):
             print("Failed to retrieve search results.")
     return "\n".join(finalResponse)

 from bs4 import BeautifulSoup
 import requests
 from typing import TypedDict
+# import utils as U
 SIMULATE_BROWSER = os.environ.get("SIMULATE_BROWSER_SEARCH") == "true"
 def scrapeGoogleSearch(query):
+    # U.pprint(f"{SIMULATE_BROWSER=}")
     finalResponse = []
     headers = {
             with open("soup_dump.html", "w", encoding="utf-8") as file:
                 file.write(soup.prettify())
             mainDiv = soup.find("div", attrs={"id": "main"})
             answerText = ""
             if SELECTORS.get("answer"):
             results = mainDiv.select(SELECTORS["search_results"])
             resultsDesc = mainDiv.select(SELECTORS["search_results_desc"])
+            # Ensure resultsDesc has the same length as results
+            resultsDesc += [None] * (len(results) - len(resultsDesc))
             if results:
                 finalResponse.append("Search Results:\n")
                     parsedUrl = urlparse(link)
                     urlParams = parse_qs(parsedUrl.query)
                     link = urlParams.get("q", [None])[0]
+                desc = resultsDesc[i].text if resultsDesc[i] else ""
                 finalResponse.append(f"Title: {title}")
                 finalResponse.append(f"Description: {desc}")
                 finalResponse.append(f"URL: {link}\n")
             print("Failed to retrieve search results.")
     return "\n".join(finalResponse)
+print(scrapeGoogleSearch("kommuneity"))