from urllib.parse import parse_qs, urlsplit

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every request made by this module.
_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'

# Seconds to wait on a request before giving up.
_REQUEST_TIMEOUT = 8


def _resolve_link(href):
    """Return the absolute http(s) URL an anchor points to, or None."""
    if href.startswith('/url?'):
        # Redirect-style link: the real target is the percent-encoded ``q``
        # parameter; the remaining parameters are not part of the target.
        targets = parse_qs(urlsplit(href).query).get('q')
        href = targets[0] if targets else ''
    return href if href.startswith('http') else None


def get_urls(query, proxies=None):
    """Search Google for *query* and return the result links.

    Args:
        query (str): The search phrase. It is URL-encoded before sending.
        proxies (dict | None): Proxy mapping passed through to ``requests``.

    Returns:
        list[dict]: One ``{'title': ..., 'link': ...}`` dict per parsed
        result, in page order. Results lacking a heading or an absolute
        http(s) link are skipped.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {'User-Agent': _USER_AGENT}
    response = requests.get(
        "https://www.google.com/search",
        params={'q': query},  # let requests encode '&', '#', spaces, ...
        headers=headers,
        proxies=proxies,
        timeout=_REQUEST_TIMEOUT,
    )
    soup = BeautifulSoup(response.content, 'html.parser')

    results = []
    for block in soup.find_all('div', class_='g'):
        anchor = block.find('a')
        heading = block.find('h3')
        if anchor is None or heading is None:
            continue
        link = _resolve_link(anchor.get('href', ''))
        if link is None:
            continue
        results.append({'title': heading.text, 'link': link})
    return results


def scrape_text(url, proxies=None) -> str:
    """Scrape the visible text from a webpage.

    Args:
        url (str): The URL to scrape text from.
        proxies (dict | None): Proxy mapping passed through to ``requests``.

    Returns:
        str: The page text with ``<script>``/``<style>`` content removed and
        blank fragments dropped, or the message
        ``"Unable to connect to the server"`` if the request fails.
    """
    headers = {
        'User-Agent': _USER_AGENT,
        'Content-Type': 'text/plain',
    }
    try:
        response = requests.get(
            url, headers=headers, proxies=proxies, timeout=_REQUEST_TIMEOUT
        )
    except requests.RequestException:
        return "Unable to connect to the server"

    # requests falls back to ISO-8859-1 for text responses that declare no
    # charset; in that case re-detect the encoding from the body instead.
    if response.encoding == "ISO-8859-1":
        response.encoding = response.apparent_encoding

    soup = BeautifulSoup(response.text, "html.parser")
    for tag in soup(["script", "style"]):
        tag.extract()

    stripped_lines = (line.strip() for line in soup.get_text().splitlines())
    # NOTE(review): splitting on a single space puts every word on its own
    # output line; confirm this (rather than a two-space split) is intended.
    fragments = (
        piece.strip() for line in stripped_lines for piece in line.split(" ")
    )
    return "\n".join(piece for piece in fragments if piece)


if __name__ == '__main__':
    txt = "What is LSTM?"
    proxies = None
    urls = get_urls(txt, proxies)
    max_search_result = 10
    for url in urls[:max_search_result]:
        print(url)
        print(scrape_text(url['link'], proxies))
        print("\n\n")