PeteBleackley committed
Commit 1f5be30
1 Parent(s): ca642d2

Multithreaded crawler with priority queue based on reliability of link sources
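The scheme works because heapdict pops the entry with the lowest score first, and each page's outgoing links have that page's reliability subtracted from their scores, so links found on reliable pages surface earlier. A minimal standalone sketch of that behaviour (the URLs and reliability values are invented for illustration, not taken from the commit):

    import heapdict

    frontier = heapdict.heapdict()

    # A page with reliability 0.8 links to /a; a page with reliability -0.5 links to /b.
    for url, reliability in [('https://example.org/a', 0.8),
                             ('https://example.org/b', -0.5)]:
        frontier.setdefault(url, 0.0)
        frontier[url] -= reliability

    # popitem() returns the entry with the smallest score, so /a is crawled first.
    print(frontier.popitem())   # ('https://example.org/a', -0.8)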

Files changed (1):
  1. Crawler.py +84 -32
Crawler.py CHANGED
@@ -8,6 +8,9 @@ Created on Thu Nov 9 14:41:00 2023

  import urllib.parse
  import urllib.robotparser
+ import re
+ import threading
+ import time
  import heapdict
  import requests
  import bs4
@@ -18,6 +21,9 @@ import torch
  from allennlp.predictors.predictor import Predictor
  import Statement
  from vectordb import HNSWVectorDB
+ from docarray import DocList
+
+

  class Crawler(object):

@@ -32,6 +38,7 @@ class Crawler(object):
          model_url = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
          self.predictor = Predictor.from_path(model_url)
          self.nlp = spacy.load('en_core_web_trf')
+         self.visited = set()

      def candidates(self):
          while len(self.frontier) > 0:
@@ -40,39 +47,84 @@ class Crawler(object):
              yield candidate

      def __call__(self):
-         visited = set()
-         for candidate in self.candidates():
-             visited.add(candidate)
-             components = urrlib.parse.urlparse(candidate)
-             domain = '{0}://{1}'.format(components.scheme,components.netloc)
-             if domain not in self.policies:
-                 self.policies[domain] = urrlib.robotparser.RobotFileParser(domain+'/robots.txt')
-                 self.policies[domain].read
-             if self.policies[domain].can_fetch(candidate):
-                 response = requests.get(candidate)
-                 if response.status_code == 200 and response.headers['content-type'] == 'text/html':
-                     soup = bs4.BeautifulSoup(response.text)
-                     if soup.html.attrs['lang'] == 'en':
-                         text = soup.get_text()
-                         resolved = self.predictor.coref_resolved(text)
-                         sentences = [self.tokenizer.encode(sentence.text)
-                                      for sentence in self.nlp(resolved).sents]
-                         maxlen = max((len(sentence) for sentence in sentences))
-                         for sentence in sentences:
-                             sentence.pad(maxlen,pad_id=self.pad_token)
-                         tokens = torch.tensor([sentence.ids
-                                                for sentence in sentences],
-                                               device='cuda')
-                         vectors = self.encoder(tokens).numpy()
-                         N = vectors.shape[0]
-                         reliability = 0.0
-                         statements = [Statement.Statement(url=candidate,
-                                                           title=soup.title.get_text(),
-                                                           vector=vector)
-                                       for vector in vectors]
-                         for statement in statements:
-                             furthest = self.db.search
+         threads = [threading.Thread(target=self.crawler_thread) for _ in range(16)]
+         for thread in threads:
+             thread.start()
+         time.sleep(60)
+         for thread in threads:
+             thread.join()
+
+     def crawler_thread(self):
+         running = True
+         while running:
+             if len(self.frontier)==0:
+                 running=False
+             else:
+                 (candidate,score) = self.frontier.popitem()
+                 self.visited.add(candidate)
+                 if score <0:
+                     components = urllib.parse.urlparse(candidate)
+                     domain = '{0}://{1}'.format(components.scheme,components.netloc)
+                     if domain not in self.policies:
+                         self.policies[domain] = urllib.robotparser.RobotFileParser(domain+'/robots.txt')
+                         self.policies[domain].read()
+                     if self.policies[domain].can_fetch('*', candidate):
+                         response = requests.get(candidate)
+                         if response.status_code == 200 and response.headers['content-type'] == 'text/html':
+                             soup = bs4.BeautifulSoup(response.text)
+                             if soup.html.attrs['lang'] == 'en':
+                                 text = soup.get_text()
+                                 resolved = self.predictor.coref_resolved(text)
+                                 sentences = [self.tokenizer.encode(sentence.text)
+                                              for sentence in self.nlp(resolved).sents]
+                                 maxlen = max((len(sentence) for sentence in sentences))
+                                 for sentence in sentences:
+                                     sentence.pad(maxlen,pad_id=self.pad_token)
+                                 tokens = torch.tensor([sentence.ids
+                                                        for sentence in sentences],
+                                                       device='cuda')
+                                 vectors = self.encoder(tokens).numpy()
+                                 N = vectors.shape[0]
+                                 reliability = 0.0
+                                 statements = [Statement.Statement(url=candidate,
+                                                                   title=soup.title.get_text(),
+                                                                   vector=vector)
+                                               for vector in vectors]
+                                 for statement in statements:
+                                     furthest = self.db.search(query=-statement,
+                                                               limit=1)
+                                     if len(furthest[0].matches) == 0 or furthest[0].scores[0]<0:
+                                         reliability +=1.0
+                                         self.db.index(DocList([statement]))
+                                     else:
+                                         reliability -=1.0
+                                 reliability /= N
+                                 for url in self.get_urls(soup):
+                                     self.frontier.setdefault(url,0.0)
+                                     self.frontier[url]-=reliability
+
+     def get_urls(self,soup):
+         seen = set()
+         for link in soup.find_all('a'):
+             dest = None
+             if 'href' in link:
+                 dest = link['href']
+             elif 'href' in link.attrs:
+                 dest = link.attrs['href']
+             if dest is not None:
+                 parsed = urllib.parse.urlparse(dest)
+                 cleaned = urllib.parse.urlunparse((parsed.scheme,
+                                                    parsed.netloc,
+                                                    parsed.path,
+                                                    '',
+                                                    '',
+                                                    ''))
+                 if cleaned not in seen|self.visited:
+                     yield cleaned
+                     seen.add(cleaned)
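Two standard-library details used above are worth noting: urllib.robotparser's can_fetch() takes the user agent as its first argument, and urllib.parse.urlunparse() rebuilds a link with the query and fragment stripped, which is what get_urls relies on to avoid queueing the same page twice. A minimal sketch of both steps on a made-up URL (the URL and the '*' user agent are illustrative assumptions, not from the commit):

    import urllib.parse
    import urllib.robotparser

    url = 'https://example.org/wiki/Page?utm_source=feed#History'   # hypothetical link
    components = urllib.parse.urlparse(url)
    domain = '{0}://{1}'.format(components.scheme, components.netloc)

    # robots.txt policy check; read() fetches the file over the network.
    policy = urllib.robotparser.RobotFileParser(domain + '/robots.txt')
    policy.read()
    allowed = policy.can_fetch('*', url)

    # Canonicalise: keep scheme, host and path; drop params, query and fragment.
    cleaned = urllib.parse.urlunparse((components.scheme, components.netloc,
                                       components.path, '', '', ''))
    print(allowed, cleaned)   # e.g. True https://example.org/wiki/Page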