autosumm/corpora/sourcer.py
import threading
from os import getenv
from string import Template

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv

load_dotenv()

# Google Custom Search credentials, read from the environment (or a local .env file).
google_key = getenv('GOOGLE_KEY')
google_engine = getenv('GOOGLE_ENGINE')

# The f-string bakes the credentials in once; Template leaves $query to be
# substituted for each search.
url = Template(f'https://www.googleapis.com/customsearch/v1?key={google_key}&cx={google_engine}&q=$query')
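# For example (with placeholder credentials), url.substitute(query='transfer+learning')
# expands to:
#   https://www.googleapis.com/customsearch/v1?key=<GOOGLE_KEY>&cx=<GOOGLE_ENGINE>&q=transfer+learning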


def download_page(url, responses, index):
    """Fetch a single URL and store the response in the shared slot."""
    responses[index] = requests.get(url)


def process_page(texts, responses, index):
    """Extract paragraph text from a downloaded page and append it to texts."""
    resp = responses[index]
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Remove list items so navigation menus and bulleted lists are excluded.
    for tag in soup.find_all('li'):
        tag.extract()
    # Keep only paragraph text.
    text = [tag.text for tag in soup.find_all('p')]
    texts.append('\n'.join(text))


def search_web(query: str) -> list:
    """Search Google Custom Search for query and return the text of each result page.

    Downloading page i is overlapped with processing page i-1, so network
    waits and HTML parsing run in parallel.
    """
    query = '+'.join(query.split())
    results = requests.get(url.substitute(query=query)).json().get('items', [])
    links = [item['link'] for item in results]
    if not links:
        return []
    texts = []
    responses = [None] * len(links)
    download_page(links[0], responses, 0)  # download the first page synchronously
    for i in range(1, len(links)):
        # process the previous page
        processing_thread = threading.Thread(
            target=process_page, args=(texts, responses, i - 1),
            name='processing' + str(i - 1))
        # download the next page
        download_thread = threading.Thread(
            target=download_page, args=(links[i], responses, i),
            name='download' + str(i))
        # run both threads, then wait for both before the next iteration
        download_thread.start()
        processing_thread.start()
        download_thread.join()
        processing_thread.join()
    # The loop only processes pages up to the second-to-last one, so the
    # final downloaded page still needs processing here.
    process_page(texts, responses, len(links) - 1)
    return texts
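

# A minimal usage sketch (an assumption for illustration, not part of the
# original module): it expects GOOGLE_KEY and GOOGLE_ENGINE to be set in the
# environment or a local .env file, and prints a preview of each result page.
if __name__ == '__main__':
    pages = search_web('automatic text summarization')
    for page in pages:
        print(page[:200])
        print('-' * 40)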