import concurrent.futures
import re

import requests
from bs4 import BeautifulSoup

# Upper bound, in seconds, for any single HTTP request made by this module.
# Without a timeout `requests` waits forever on a stalled server.
_REQUEST_TIMEOUT = 10


class SearchClient:
    """Run a web search through Google or Bing and scrape the result pages.

    The client queries the vendor's search API for links, downloads each
    linked page concurrently, and returns the visible text of every page
    that could be fetched.
    """

    def __init__(self, vendor, engine_id=None, api_key=None):
        """Configure the endpoint and headers for *vendor*.

        Args:
            vendor: ``"google"`` or ``"bing"``. Any other value leaves the
                client unconfigured and makes ``search`` return
                ``"Invalid vendor"``.
            engine_id: Google custom search engine id (``cx``); only used
                for the ``"google"`` vendor.
            api_key: API key for the selected vendor.
        """
        self.vendor = vendor
        # Always define both attributes so that no code path can hit an
        # AttributeError, whichever vendor was requested.
        self.endpoint = None
        self.headers = {}
        if vendor == "google":
            self.endpoint = (
                "https://www.googleapis.com/customsearch/v1"
                f"?key={api_key}&cx={engine_id}"
            )
        elif vendor == "bing":
            self.endpoint = "https://api.bing.microsoft.com/v7.0/search"
            self.headers = {
                "Ocp-Apim-Subscription-Key": api_key,
            }

    @staticmethod
    def _extract_text_from_link(link):
        """Download *link* and return its text with whitespace collapsed.

        Returns:
            The page text with every whitespace run replaced by a single
            space, or ``None`` when the response status is not 200.

        Raises:
            requests.RequestException: on connection errors or timeouts.
        """
        page = requests.get(link, timeout=_REQUEST_TIMEOUT)
        if page.status_code != 200:
            return None
        soup = BeautifulSoup(page.content, "html.parser")
        return re.sub(r"\s+", " ", soup.get_text())

    def _fetch_text_from_links(self, links):
        """Fetch all *links* concurrently and return their extracted text.

        Pages that fail to download, return a non-200 status, or yield
        empty text are skipped; failures are reported on stdout.

        Returns:
            A list of ``{"text": ..., "link": ...}`` dicts, in the same
            order as *links* (i.e. search-ranking order).
        """
        if not links:
            return []

        results = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # Submit everything first so the downloads overlap, then collect
            # in submission order so the ranking of the links is preserved.
            pending = [
                (link, executor.submit(self._extract_text_from_link, link))
                for link in links
            ]
            for link, future in pending:
                try:
                    cleaned_text = future.result()
                except Exception as e:
                    # Best effort: one bad page must not abort the others.
                    print(f"Error fetching data from {link}: {e}")
                    continue
                if cleaned_text:
                    results.append({"text": cleaned_text, "link": link})
        return results

    def _google_search(self, query, n_crawl):
        """Search Google for *query* and scrape at most *n_crawl* results.

        A response without an ``items`` key yields an empty list.
        """
        response = requests.get(
            self.endpoint, params={"q": query}, timeout=_REQUEST_TIMEOUT
        )
        search_results = response.json()
        items = search_results.get("items", [])
        # Skip malformed items lacking a link; clamp the limit so that a
        # negative n_crawl yields no links instead of slicing from the end.
        links = [item["link"] for item in items if item.get("link")]
        return self._fetch_text_from_links(links[: max(n_crawl, 0)])

    def _bing_search(self, query, n_crawl):
        """Search Bing for *query*, requesting *n_crawl* results, and scrape them.

        A response without ``webPages.value`` yields an empty list.
        """
        params = {
            "q": query,
            "count": n_crawl,  # You might need to adjust this based on Bing API requirements
            "mkt": "en-US",
        }
        response = requests.get(
            self.endpoint,
            headers=self.headers,
            params=params,
            timeout=_REQUEST_TIMEOUT,
        )
        search_results = response.json()
        pages = search_results.get("webPages", {}).get("value", [])
        links = [item["url"] for item in pages if item.get("url")]
        return self._fetch_text_from_links(links)

    def search(self, query, n_crawl):
        """Search for *query* with the configured vendor.

        Args:
            query: The search string.
            n_crawl: Maximum number of result pages to download.

        Returns:
            A list of ``{"text": ..., "link": ...}`` dicts, or the string
            ``"Invalid vendor"`` when the vendor is neither ``"google"``
            nor ``"bing"``.
        """
        if self.vendor == "google":
            return self._google_search(query, n_crawl)
        if self.vendor == "bing":
            return self._bing_search(query, n_crawl)
        return "Invalid vendor"