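"""Minimal search client for Google Custom Search and Bing Web Search that
fetches the returned result links and extracts their plain text."""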
import concurrent.futures
import re

import requests
from bs4 import BeautifulSoup


class SearchClient:
    def __init__(self, vendor, engine_id=None, api_key=None):
        self.vendor = vendor
        if vendor == "google":
            # Google Custom Search needs both an API key and a search engine ID (cx).
            self.endpoint = f"https://www.googleapis.com/customsearch/v1?key={api_key}&cx={engine_id}"
        elif vendor == "bing":
            self.endpoint = "https://api.bing.microsoft.com/v7.0/search"
            self.headers = {
                "Ocp-Apim-Subscription-Key": api_key,
            }
        else:
            raise ValueError(f"Unsupported vendor: {vendor!r} (expected 'google' or 'bing')")

    @staticmethod
    def _extract_text_from_link(link):
        # Use a timeout so a single slow host cannot stall a worker thread.
        page = requests.get(link, timeout=10)
        if page.status_code == 200:
            soup = BeautifulSoup(page.content, "html.parser")
            # Collapse the whitespace runs left behind by stripped markup.
            text = soup.get_text()
            cleaned_text = re.sub(r"\s+", " ", text)
            return cleaned_text
        return None

    def _fetch_text_from_links(self, links):
        # Scrape the result pages concurrently; results are appended in
        # completion order, not in the order the links were submitted.
        results = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_link = {
                executor.submit(self._extract_text_from_link, link): link
                for link in links
            }
            for future in concurrent.futures.as_completed(future_to_link):
                link = future_to_link[future]
                try:
                    cleaned_text = future.result()
                    if cleaned_text:
                        results.append({"text": cleaned_text, "link": link})
                except Exception as e:
                    print(f"Error fetching data from {link}: {e}")
        return results

    def _google_search(self, query, n_crawl):
        # The Custom Search JSON API returns at most 10 results per request,
        # so cap `num` accordingly and truncate to n_crawl on our side too.
        params = {"q": query, "num": min(n_crawl, 10)}
        response = requests.get(self.endpoint, params=params, timeout=10)
        search_results = response.json()

        links = [item["link"] for item in search_results.get("items", [])[:n_crawl]]
        return self._fetch_text_from_links(links)

    def _bing_search(self, query, n_crawl):
        params = {
            "q": query,
            "count": n_crawl,  # Bing caps `count` at 50 per request
            "mkt": "en-US",
        }
        response = requests.get(self.endpoint, headers=self.headers, params=params, timeout=10)
        search_results = response.json()

        links = [item["url"] for item in search_results.get("webPages", {}).get("value", [])]
        return self._fetch_text_from_links(links)

    def search(self, query, n_crawl):
        if self.vendor == "google":
            return self._google_search(query, n_crawl)
        elif self.vendor == "bing":
            return self._bing_search(query, n_crawl)
        # Unreachable when __init__ validates the vendor, but fail loudly just in case.
        raise ValueError(f"Unsupported vendor: {self.vendor!r}")