Spaces:
Sleeping
Sleeping
File size: 2,926 Bytes
831e906 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import requests
from bs4 import BeautifulSoup
import re
import concurrent.futures
class SearchClient:
    """Query a web-search API (Google Custom Search or Bing Web Search) and
    return the visible text of the top result pages.

    Parameters
    ----------
    vendor : str
        Either ``"google"`` or ``"bing"``.  Any other value leaves the
        client unconfigured and makes ``search`` return ``"Invalid vendor"``.
    engine_id : str, optional
        Google Custom Search engine id (``cx``); unused for Bing.
    api_key : str, optional
        API key for the chosen vendor.
    """

    # Seconds before any single HTTP request (search API call or page crawl)
    # is abandoned.  Fixes a hang: requests.get() without a timeout can block
    # a worker thread forever on an unresponsive server.
    REQUEST_TIMEOUT = 10

    def __init__(self, vendor, engine_id=None, api_key=None):
        self.vendor = vendor
        if vendor == "google":
            # Google bakes the credentials into the endpoint URL.
            self.endpoint = f"https://www.googleapis.com/customsearch/v1?key={api_key}&cx={engine_id}"
        elif vendor == "bing":
            # Bing takes the key as a request header instead.
            self.endpoint = "https://api.bing.microsoft.com/v7.0/search"
            self.headers = {
                "Ocp-Apim-Subscription-Key": api_key,
            }

    @staticmethod
    def _extract_text_from_link(link):
        """Download *link* and return its whitespace-normalized visible text,
        or ``None`` if the server does not answer with HTTP 200."""
        page = requests.get(link, timeout=SearchClient.REQUEST_TIMEOUT)
        if page.status_code == 200:
            soup = BeautifulSoup(page.content, "html.parser")
            # Collapse all whitespace runs so the text is a single clean line.
            return re.sub(r"\s+", " ", soup.get_text())
        return None

    def _fetch_text_from_links(self, links):
        """Crawl every URL in *links* concurrently.

        Returns a list of ``{"text": ..., "link": ...}`` dicts, one per page
        that was fetched successfully and yielded non-empty text.  Result
        order follows completion order, not the input order.
        """
        results = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_link = {
                executor.submit(self._extract_text_from_link, link): link
                for link in links
            }
            for future in concurrent.futures.as_completed(future_to_link):
                link = future_to_link[future]
                try:
                    cleaned_text = future.result()
                    if cleaned_text:
                        results.append({"text": cleaned_text, "link": link})
                except Exception as e:
                    # Best-effort crawl: one bad page must not abort the batch.
                    print(f"Error fetching data from {link}: {e}")
        return results

    def _google_search(self, query, n_crawl):
        """Run *query* against Google Custom Search and crawl up to
        *n_crawl* of the returned links."""
        response = requests.get(
            self.endpoint,
            params={"q": query},
            timeout=self.REQUEST_TIMEOUT,
        )
        search_results = response.json()
        # The API returns at most 10 items per request; keep only n_crawl.
        links = [item["link"] for item in search_results.get("items", [])[:n_crawl]]
        return self._fetch_text_from_links(links)

    def _bing_search(self, query, n_crawl):
        """Run *query* against Bing Web Search and crawl the returned links."""
        params = {
            "q": query,
            "count": n_crawl,  # passed through as-is; Bing may cap it server-side
            "mkt": "en-US",
        }
        response = requests.get(
            self.endpoint,
            headers=self.headers,
            params=params,
            timeout=self.REQUEST_TIMEOUT,
        )
        search_results = response.json()
        links = [item["url"] for item in search_results.get("webPages", {}).get("value", [])]
        return self._fetch_text_from_links(links)

    def search(self, query, n_crawl):
        """Public entry point: dispatch to the vendor chosen at construction.

        Returns a list of ``{"text", "link"}`` dicts, or the string
        ``"Invalid vendor"`` when the vendor is unrecognized.
        """
        if self.vendor == "google":
            return self._google_search(query, n_crawl)
        elif self.vendor == "bing":
            return self._bing_search(query, n_crawl)
        else:
            return "Invalid vendor"
|