import threading
from os import getenv
from string import Template

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

load_dotenv()
google_key = getenv('GOOGLE_KEY')
google_engine = getenv('GOOGLE_ENGINE')
url = Template(f'https://www.googleapis.com/customsearch/v1?key={google_key}&cx={google_engine}&q=$query')


def download_page(url, responses, index):
    # Store the response at its slot so results keep the search-result order.
    responses[index] = requests.get(url)


def process_page(texts, responses, index):
    # Parse a downloaded page and collect the text of its <p> tags.
    resp = responses[index]
    soup = BeautifulSoup(resp.text, 'html.parser')
    text = []
    # Remove lists, which are usually navigation rather than article content.
    for tag in soup.find_all('li'):
        tag.extract()
    for tag in soup.find_all('p'):
        text.append(tag.text)
    texts.append('\n'.join(text))


def search_web(query: str) -> list:
    query = '+'.join(query.split())
    results = requests.get(url.substitute(query=query)).json()['items']
    links = [item['link'] for item in results]
    texts = []
    responses = [None] * len(links)
    download_page(links[0], responses, 0)  # download first page synchronously
    for i in range(1, len(links)):
        # process the previous page while downloading the next one in parallel
        processing_thread = threading.Thread(target=process_page,
                                             args=(texts, responses, i - 1),
                                             name='processing' + str(i - 1))
        download_thread = threading.Thread(target=download_page,
                                           args=(links[i], responses, i),
                                           name='download' + str(i))
        # start threads
        download_thread.start()
        processing_thread.start()
        # wait for threads
        download_thread.join()
        processing_thread.join()
    # The last page is downloaded in the final iteration but never paired with
    # a processing thread, so process it here before returning.
    process_page(texts, responses, len(links) - 1)
    return texts
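
# Example usage: a minimal sketch, assuming GOOGLE_KEY and GOOGLE_ENGINE are
# set in a .env file and the Custom Search API returns at least one item.
# The query string here is hypothetical, chosen only for illustration.
if __name__ == '__main__':
    pages = search_web('python threading tutorial')
    print(f'Fetched {len(pages)} pages')
    print(pages[0][:500])  # first 500 characters of the first page's text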