mhsvieira committed
Commit 53ffdce
Parent: 78a71e8

Add multithreading to web search

Files changed (1):
  1. corpora/sourcer.py +33 -12
corpora/sourcer.py CHANGED
@@ -3,6 +3,7 @@ from string import Template
 from bs4 import BeautifulSoup
 from dotenv import load_dotenv
 from os import getenv
+import threading
 
 load_dotenv()
 
@@ -11,6 +12,22 @@ google_engine = getenv('GOOGLE_ENGINE')
 
 url = Template(f'https://www.googleapis.com/customsearch/v1?key={google_key}&cx={google_engine}&q=$query')
 
+def download_page(url, responses, index):
+    responses[index] = requests.get(url)
+
+def process_page(texts, responses, index):
+    resp = responses[index]
+    soup = BeautifulSoup(resp.text, 'html.parser')
+    text = []
+    # remove lists
+    for tag in soup.find_all('li'):
+        tag.extract()
+
+    tags = soup.find_all('p')
+    for tag in tags:
+        text.append(tag.text)
+    texts.append('\n'.join(text))
+
 def search_web(query: str) -> list:
     query = '+'.join(query.split())
     results = requests.get(url.substitute(query=query)).json()['items']
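Note: threading.Thread discards its target's return value, so download_page and process_page write their results into caller-owned lists (responses, texts) by index rather than returning them. A minimal standalone sketch of that slot-writing pattern, with hypothetical URLs, looks like this:

import threading
import requests

def download_page(url, responses, index):
    # each worker fills its own slot; no two threads write the same index
    responses[index] = requests.get(url)

urls = ['https://example.com', 'https://example.org']  # hypothetical
responses = [None] * len(urls)
threads = [threading.Thread(target=download_page, args=(u, responses, i))
           for i, u in enumerate(urls)]
for t in threads:
    t.start()
for t in threads:
    t.join()
# responses now holds one requests.Response per URL, in input order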
 
@@ -18,17 +35,21 @@ def search_web(query: str) -> list:
     links = [item['link'] for item in results]
 
     texts = []
-    for link in links:
-        resp = requests.get(link)
-        soup = BeautifulSoup(resp.text, 'html.parser')
-        text = []
-        # remove lists
-        for tag in soup.find_all('li'):
-            tag.extract()
-
-        tags = soup.find_all('p')
-        for tag in tags:
-            text.append(tag.text)
-        texts.append('\n'.join(text))
+    responses = [None] * len(links)
+    download_threads = [None] * len(links)
+    processing_threads = [None] * len(links)
+    # dowload_threads[0] = threading.Thread(target=download_page, args=(links[0], responses, 0))
+    download_page(links[0], responses, 0)
+    for i in range(1, len(links), 2):
+        # new page processing thread
+        processing_thread = threading.Thread(target=process_page, args=(texts, responses, i-1))
+        # new download thread
+        download_thread = threading.Thread(target=download_page, args=(links[i], responses, i))
+        # start threads
+        download_thread.start()
+        processing_thread.start()
+        download_thread.join()
+        processing_thread.join()
+
 
     return texts
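Note: the committed loop overlaps one download thread with one processing thread per iteration, but stepping by two means odd-indexed links are downloaded and never parsed, and with four or more links process_page is handed responses[2], which no thread ever filled, so resp.text raises AttributeError inside that worker. Joining both threads inside the loop also caps concurrency at two. A hedged alternative sketch (not part of this commit; fetch_and_parse and search_web_pooled are hypothetical names) that keeps the download/parse overlap without the index bookkeeping, using concurrent.futures:

from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

def fetch_and_parse(link: str) -> str:
    soup = BeautifulSoup(requests.get(link).text, 'html.parser')
    # remove lists, as in the original process_page
    for tag in soup.find_all('li'):
        tag.extract()
    return '\n'.join(p.text for p in soup.find_all('p'))

def search_web_pooled(links: list) -> list:
    # map preserves input order; workers overlap network I/O and parsing
    with ThreadPoolExecutor(max_workers=8) as pool:
        return list(pool.map(fetch_and_parse, links))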