"""googlesearch is a Python library for searching Google, easily."""
from time import sleep
from bs4 import BeautifulSoup
from requests import get
from urllib.parse import unquote # to decode the url
from chipsearch.useragentka import get_useragent
from curl_cffi import requests as curlreq
from chipsearch.gettyimages import get_images
def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
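    """Fetch one Google results page for the given term and return the HTTP response."""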
resp = get(
url="https://www.google.com/search",
headers={
"User-Agent": get_useragent(),
"Accept": "*/*"
},
params={
"q": term,
"num": results + 2, # Prevents multiple requests
"hl": lang,
"start": start,
"safe": safe,
"gl": region,
},
proxies=proxies,
timeout=timeout,
verify=ssl_verify,
        cookies={
'CONSENT': 'PENDING+987', # Bypasses the consent page
'SOCS': 'CAESHAgBEhIaAB',
}
)
resp.raise_for_status()
return resp
class SearchResult:
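    """Simple container for a single search result (url, title, description)."""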
def __init__(self, url, title, description):
self.url = url
self.title = title
self.description = description
def __repr__(self):
return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
"""Search the Google search engine"""
# Proxy setup
proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None
start = start_num
fetched_results = 0
fetched_links = set()
results_list = []
    image_results = []  # image dicts scraped from the results page
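    # Page through results: each iteration fetches one results page, then advances start by 10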
while fetched_results < num_results:
# Send request
resp = _req(term, num_results - start,
lang, start, proxies, timeout, safe, ssl_verify, region)
# Parse
soup = BeautifulSoup(resp.text, "html.parser")
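        # "ezO2md" is the container class this scraper expects for organic results; it may change if Google updates its markup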
result_block = soup.find_all("div", class_="ezO2md")
new_results = 0
# Find all images on the page
try:
            all_images = soup.find_all("img")  # every <img> tag on the results page
for img in all_images:
img_src = img.get("src") or img.get("data-src")
if img_src:
# Handle base64 images
if img_src.startswith("data:image"):
image_results.append({
"src": img_src, # Already base64 encoded
"alt": img.get("alt", ""),
"class": img.get("class", []),
})
# Handle regular image URLs
elif img_src.startswith("http"):
image_results.append({
"src": img_src,
"alt": img.get("alt", ""),
"class": img.get("class", []),
})
except Exception as e:
print(f"Error parsing images: {str(e)}")
for result in result_block:
link_tag = result.find("a", href=True)
title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
description_tag = result.find("span", class_="FrIlee")
if link_tag and title_tag and description_tag:
link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", ""))
if link in fetched_links and unique:
continue
fetched_links.add(link)
                title = title_tag.text
                description = description_tag.text
# Only get page_text if advanced mode and we haven't gotten any yet
                if advanced and not any('page_text' in r for r in results_list):
try:
page_scrape = curlreq.get(link, impersonate='chrome110')
page_scrape.encoding = 'utf-8'
page_soup = BeautifulSoup(page_scrape.text, "html.parser")
# Try multiple strategies to find main content
main_content = (
page_soup.find(['article', 'main']) or
page_soup.find('div', {'id': ['content', 'main-content', 'body-content']}) or
page_soup.find('div', {'class': ['content', 'main', 'article', 'post']}) or
page_soup.find('div', {'role': 'main'}) or
page_soup.body
)
if main_content:
# Remove unwanted elements
for element in main_content(['script', 'style', 'noscript', 'svg', 'header', 'footer', 'nav']):
element.decompose()
# Extract text with better cleaning
text = main_content.get_text(separator=' ', strip=True)
text = ' '.join(line.strip() for line in text.splitlines() if line.strip())
page_text = ' '.join(word for word in text.split() if len(word) > 1)[:3000]
else:
page_text = ""
except Exception as e:
print(f"Error scraping {link}: {str(e)}")
page_text = ""
else:
page_text = ""
fetched_results += 1
new_results += 1
if advanced:
results_list.append({
"link": link,
"title": title,
"description": description,
"page_text": page_text,
})
else:
results_list.append(link)
if fetched_results >= num_results:
break
if new_results == 0:
break
start += 10
sleep(sleep_interval)
    if not image_results:
        # No images were scraped from the results page; fall back to the image helper
        image_results = get_images(term)
    return {"results": results_list, "images": image_results}