Research-Assistant

Sleeping

App Files Files Community

Research-Assistant / actions /google_search.py

GianJSX

Duplicate from zej97/AI-Research-Assistant

cc93c47 about 1 year ago

raw

history blame contribute delete

2.14 kB

	import requests
	from bs4 import BeautifulSoup


	def get_urls(query, proxies=None):
	    """Run a Google web search and collect the links found in result blocks.

	    Args:
	        query (str): The raw, unencoded search terms.
	        proxies (dict | None): Optional proxy mapping handed to
	            ``requests.get``.

	    Returns:
	        list[dict]: One ``{'title': ..., 'link': ...}`` dict, in page order,
	        for every ``<div class="g">`` block whose first anchor resolves to a
	        link starting with ``http`` and which contains an ``<h3>`` heading.

	    Raises:
	        requests.RequestException: If the search request fails or times out.
	    """
	    # Function-scope import: keeps this block self-contained without
	    # touching the file-level import list.
	    from urllib.parse import parse_qs, urlparse

	    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'}
	    # Passing the query through ``params`` lets requests percent-encode it,
	    # so spaces, '?', '&' and '#' in the search terms cannot corrupt the URL.
	    # The timeout keeps a stalled connection from hanging the caller forever.
	    response = requests.get(
	        "https://www.google.com/search",
	        params={'q': query},
	        headers=headers,
	        proxies=proxies,
	        timeout=8,
	    )
	    soup = BeautifulSoup(response.content, 'html.parser')

	    results = []
	    for g in soup.find_all('div', class_='g'):
	        anchor = g.find('a')
	        if anchor is None:
	            continue
	        link = anchor.get('href')
	        if not link:
	            # An anchor without an href carries no destination to report.
	            continue
	        if link.startswith('/url?q='):
	            # Redirect-style link: the real destination is the 'q' query
	            # parameter. Parsing it drops the trailing tracking parameters
	            # and undoes the percent-encoding in one step.
	            targets = parse_qs(urlparse(link).query).get('q')
	            link = targets[0] if targets else ''
	        if not link.startswith('http'):
	            continue
	        heading = g.find('h3')
	        if heading is None:
	            # No heading means no title; skip instead of raising.
	            continue
	        results.append({'title': heading.text, 'link': link})

	    return results

	def scrape_text(url, proxies=None) -> str:
	    """Scrape the text content of a webpage.

	    Args:
	        url (str): The URL to scrape text from
	        proxies (dict | None): Optional proxy mapping handed to
	            ``requests.get``

	    Returns:
	        str: The page text with ``<script>`` and ``<style>`` content removed
	        and blank lines dropped, or the literal message
	        ``"Unable to connect to the server"`` if fetching the page failed.
	    """
	    headers = {
	        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
	        'Content-Type': 'text/plain',
	    }
	    try:
	        response = requests.get(url, headers=headers, proxies=proxies, timeout=8)
	        # ISO-8859-1 is what requests reports when a text response declares
	        # no charset, so treat it as "unknown" and use the detected encoding.
	        if response.encoding == "ISO-8859-1":
	            response.encoding = response.apparent_encoding
	    except Exception:
	        # Best-effort fetch: any failure yields the fallback message, but
	        # KeyboardInterrupt / SystemExit are no longer swallowed as they
	        # were by the previous bare ``except``.
	        return "Unable to connect to the server"

	    soup = BeautifulSoup(response.text, "html.parser")
	    # Script and style bodies are code, not readable page text.
	    for tag in soup(["script", "style"]):
	        tag.extract()

	    # Two consecutive spaces mark a phrase boundary. Splitting on a single
	    # space would put every individual word on its own output line.
	    phrase_break = " " * 2
	    lines = (line.strip() for line in soup.get_text().splitlines())
	    chunks = (phrase.strip() for line in lines for phrase in line.split(phrase_break))
	    # Empty chunks (blank lines, runs of spaces) are dropped here.
	    return "\n".join(chunk for chunk in chunks if chunk)


	if __name__ == '__main__':
	    # Manual smoke test: search for one question, then print each of the
	    # first few results followed by the text scraped from its link.
	    question = "What is LSTM?"
	    proxy_settings = None
	    result_limit = 10
	    search_hits = get_urls(question, proxy_settings)

	    for hit in search_hits[:result_limit]:
	        print(hit)
	        page_text = scrape_text(hit['link'], proxy_settings)
	        print(page_text)
	        print("\n\n")