import requests
from string import Template
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from os import getenv
import threading

load_dotenv()

google_key = getenv('GOOGLE_KEY')
google_engine = getenv('GOOGLE_ENGINE')

# The f-string fills in the API credentials immediately; Template leaves
# $query as a placeholder to substitute per search.
url = Template(f'https://www.googleapis.com/customsearch/v1?key={google_key}&cx={google_engine}&q=$query')
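
# For illustration, url.substitute(query='hello+world') yields the request URL
# with q=hello+world appended (the key and cx values come from the environment).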

def download_page(url, responses, index):
    # Fetch the page and store the response in its slot, so results stay in
    # the same order as the search links. A timeout keeps one slow host from
    # stalling the whole pipeline.
    responses[index] = requests.get(url, timeout=10)

def process_page(texts, responses, index):
    resp = responses[index]
    soup = BeautifulSoup(resp.text, 'html.parser')
    # Drop list items, which tend to be navigation and link clutter.
    for tag in soup.find_all('li'):
        tag.extract()
    # Keep only paragraph text.
    paragraphs = [tag.text for tag in soup.find_all('p')]
    texts.append('\n'.join(paragraphs))

def search_web(query: str) -> list:
    query = '+'.join(query.split())
    results = requests.get(url.substitute(query=query)).json().get('items', [])

    links = [item['link'] for item in results]
    if not links:
        return []

    texts = []
    responses = [None] * len(links)
    download_page(links[0], responses, 0)  # download the first page up front
    for i in range(1, len(links)):
        # process the previous page while the next one downloads
        processing_thread = threading.Thread(target=process_page, args=(texts, responses, i - 1), name='processing' + str(i - 1))
        download_thread = threading.Thread(target=download_page, args=(links[i], responses, i), name='download' + str(i))
        # start both threads, then wait for both to finish
        download_thread.start()
        processing_thread.start()
        download_thread.join()
        processing_thread.join()

    # The loop processes page i-1 while downloading page i, so the final page
    # is downloaded but never processed; handle it here (this also covers the
    # single-result case, where the loop body never runs).
    process_page(texts, responses, len(links) - 1)

    return texts
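
# Example usage: a minimal sketch, assuming a .env file provides valid
# GOOGLE_KEY and GOOGLE_ENGINE values; the query string is purely illustrative.
if __name__ == '__main__':
    pages = search_web('python threading tutorial')
    print(f'fetched {len(pages)} pages')
    if pages:
        print(pages[0][:300])  # preview of the first page's paragraph text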