Update webscout.py
webscout.py  CHANGED  (+171 -0)
@@ -1811,3 +1811,174 @@ def fastai(user, model="llama3-70b", system="Answer as concisely as possible."):
    return output


from bs4 import BeautifulSoup
import requests
from typing import Dict, List, Optional, Union
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import quote
from termcolor import colored
import time
import random

class GoogleS:
    """
    Class to perform Google searches and retrieve results.
    """

    def __init__(
        self,
        headers: Optional[Dict[str, str]] = None,
        proxy: Optional[str] = None,
        timeout: Optional[int] = 10,
        max_workers: int = 20  # Increased max workers for the thread pool
    ):
        """Initializes the GoogleS object."""
        self.proxy = proxy
        self.headers = headers if headers else {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62"
        }
        self.headers["Referer"] = "https://www.google.com/"
        self.client = requests.Session()
        self.client.headers.update(self.headers)
        if self.proxy:
            # Only configure proxies when one is actually supplied.
            self.client.proxies.update({"http": self.proxy, "https": self.proxy})
        self.timeout = timeout
        self._executor = ThreadPoolExecutor(max_workers=max_workers)

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.client.close()

    def _get_url(self, method: str, url: str, params: Optional[Dict[str, str]] = None,
                 data: Optional[Union[Dict[str, str], bytes]] = None) -> bytes:
        """
        Makes an HTTP request and returns the response content.
        """
        try:
            resp = self.client.request(method, url, params=params, data=data, timeout=self.timeout)
        except Exception as ex:
            raise Exception(f"{url} {type(ex).__name__}: {ex}") from ex
        if resp.status_code == 200:
            return resp.content
        raise Exception(f"{resp.url} returned status code {resp.status_code}. {params=} {data=}")

    def _extract_text_from_webpage(self, html_content: bytes, max_characters: Optional[int] = None) -> str:
        """
        Extracts visible text from HTML content using the lxml parser.
        """
        soup = BeautifulSoup(html_content, 'lxml')  # Use lxml parser
        for tag in soup(["script", "style", "header", "footer", "nav"]):
            tag.extract()
        visible_text = soup.get_text(strip=True)
        if max_characters:
            visible_text = visible_text[:max_characters]
        return visible_text

    def search(
        self,
        query: str,
        region: str = "us-en",
        language: str = "en",
        safe: str = "off",
        time_period: Optional[str] = None,
        max_results: int = 10,
        extract_text: bool = False,
        max_text_length: Optional[int] = 100,
    ) -> List[Dict[str, Union[str, int]]]:
        """
        Performs a Google search and returns the results.

        Args:
            query (str): The search query.
            region (str, optional): The region to search in (e.g., "us-en"). Defaults to "us-en".
            language (str, optional): The language of the search results (e.g., "en"). Defaults to "en".
            safe (str, optional): Safe search setting ("off", "active"). Defaults to "off".
            time_period (Optional[str], optional): Time period filter (e.g., "h" for past hour, "d" for past day).
                Defaults to None.
            max_results (int, optional): The maximum number of results to retrieve. Defaults to 10.
            extract_text (bool, optional): Whether to extract text from the linked web pages. Defaults to False.
            max_text_length (Optional[int], optional): The maximum length of the extracted text (in characters).
                Defaults to 100.

        Returns:
            List[Dict[str, Union[str, int]]]: A list of dictionaries, each representing a search result, containing:
                - 'title': The title of the result.
                - 'href': The URL of the result.
                - 'abstract': The description snippet of the result.
                - 'index': The index of the result in the list.
                - 'type': The type of result (currently always "web").
                - 'visible_text': The extracted text from the web page (if `extract_text` is True).
        """
        assert query, "Query cannot be empty."

        results = []
        futures = []
        start = 0

        while len(results) < max_results:
            params = {
                "q": query,
                "num": 10,
                "hl": language,
                "start": start,
                "safe": safe,
                "gl": region,
            }
            if time_period:
                params["tbs"] = f"qdr:{time_period}"

            futures.append(self._executor.submit(self._get_url, "GET", "https://www.google.com/search", params=params))
            start += 10

            found_new_results = False
            for future in as_completed(futures):
                try:
                    resp_content = future.result()
                    soup = BeautifulSoup(resp_content, 'lxml')  # Use lxml parser
                    result_blocks = soup.find_all("div", class_="g")

                    if not result_blocks:
                        break

                    # Extract links and titles first
                    for result_block in result_blocks:
                        link = result_block.find("a", href=True)
                        title = result_block.find("h3")
                        description_box = result_block.find(
                            "div", {"style": "-webkit-line-clamp:2"}
                        )

                        if link and title and description_box:
                            url = link["href"]
                            results.append({
                                "title": title.text,
                                "href": url,
                                "abstract": description_box.text,
                                "index": len(results),
                                "type": "web",
                                "visible_text": ""  # Initialize visible_text as empty string
                            })
                            found_new_results = True

                            if len(results) >= max_results:
                                break  # Stop if we have enough results

                    # Parallelize text extraction if needed
                    if extract_text:
                        with ThreadPoolExecutor(max_workers=self._executor._max_workers) as text_extractor:
                            # Map each extraction future to its result index so completion
                            # order cannot attach text to the wrong result.
                            extraction_futures = {
                                text_extractor.submit(self._extract_text_from_webpage,
                                                      self._get_url("GET", result['href']),
                                                      max_characters=max_text_length): index
                                for index, result in enumerate(results)
                                if 'href' in result
                            }
                            for extraction_future in as_completed(extraction_futures):
                                index = extraction_futures[extraction_future]
                                try:
                                    results[index]['visible_text'] = extraction_future.result()
                                except Exception as e:
                                    print(f"Error extracting text: {e}")

                except Exception as e:
                    print(f"Error: {e}")

            # Futures iterated above have already been consumed; drop them so the next
            # page request does not append the same result blocks again.
            futures.clear()
            if not found_new_results:
                break  # No further results were returned; stop paginating.

        return results
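
A minimal usage sketch for the GoogleS class added above. The import line assumes webscout.py is on the Python path and exposes GoogleS directly; adjust it to however the package is installed in your environment. The query string and parameter values are illustrative placeholders, not part of the commit.

# Hypothetical usage example for the GoogleS class added in this commit.
from webscout import GoogleS

with GoogleS(timeout=10) as searcher:
    results = searcher.search(
        "python web scraping",   # placeholder query
        max_results=5,
        extract_text=True,       # also fetch each result page and keep its visible text
        max_text_length=200,     # cap extracted text at 200 characters
    )

for result in results:
    print(result["title"])
    print(result["href"])
    print(result["abstract"])
    print(result["visible_text"])
    print("-" * 40)

Because search() requests results in pages of 10 (num=10, with start incremented by 10), asking for more than 10 results issues additional requests through the shared thread pool.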