Hansimov committed on
Commit
cf4c3f8
1 Parent(s): 4d3e890

:zap: [Enhance] HTMLFetcher and GoogleSearcher: support cache with overwrite, and ignore host

Browse files
networks/google_searcher.py CHANGED
@@ -14,7 +14,6 @@ class GoogleSearcher:
14
  self.filepath_converter = QueryToFilepathConverter()
15
 
16
  def send_request(self, result_num=10, safe=False):
17
- logger.note(f"Searching: [{self.query}]")
18
  self.request_response = requests.get(
19
  url=self.url,
20
  headers={
@@ -28,17 +27,21 @@ class GoogleSearcher:
28
  )
29
 
30
  def save_response(self):
31
- self.output_path = self.filepath_converter.convert(self.query)
32
  if not self.output_path.exists():
33
  self.output_path.parent.mkdir(parents=True, exist_ok=True)
34
  logger.note(f"Saving to: [{self.output_path}]")
35
  with open(self.output_path, "wb") as wf:
36
  wf.write(self.request_response.content)
37
 
38
- def search(self, query, result_num=10, safe=False):
39
  self.query = query
40
- self.send_request(result_num=result_num, safe=safe)
41
- self.save_response()
 
 
 
 
 
42
  return self.output_path
43
 
44
 
 
14
  self.filepath_converter = QueryToFilepathConverter()
15
 
16
  def send_request(self, result_num=10, safe=False):
 
17
  self.request_response = requests.get(
18
  url=self.url,
19
  headers={
 
27
  )
28
 
29
  def save_response(self):
 
30
  if not self.output_path.exists():
31
  self.output_path.parent.mkdir(parents=True, exist_ok=True)
32
  logger.note(f"Saving to: [{self.output_path}]")
33
  with open(self.output_path, "wb") as wf:
34
  wf.write(self.request_response.content)
35
 
36
+ def search(self, query, result_num=10, safe=False, overwrite=False):
37
  self.query = query
38
+ self.output_path = self.filepath_converter.convert(self.query)
39
+ logger.note(f"Searching: [{self.query}]")
40
+ if self.output_path.exists() and not overwrite:
41
+ logger.success(f"HTML existed: {self.output_path}")
42
+ else:
43
+ self.send_request(result_num=result_num, safe=safe)
44
+ self.save_response()
45
  return self.output_path
46
 
47
 
networks/html_fetcher.py CHANGED
@@ -1,17 +1,27 @@
1
  import requests
 
2
  from pathlib import Path
3
  from utils.enver import enver
4
  from utils.logger import logger
5
  from networks.filepath_converter import UrlToFilepathConverter
6
 
 
 
7
 
8
  class HTMLFetcher:
9
  def __init__(self):
10
  self.enver = enver
11
  self.enver.set_envs(proxies=True)
 
 
 
 
 
 
 
 
12
 
13
  def send_request(self):
14
- logger.note(f"Fetching: [{self.url}]")
15
  self.request_response = requests.get(
16
  url=self.url,
17
  headers={
@@ -21,19 +31,26 @@ class HTMLFetcher:
21
  )
22
 
23
  def save_response(self):
24
- self.output_path = UrlToFilepathConverter().convert(self.url)
25
  if not self.output_path.exists():
26
  self.output_path.parent.mkdir(parents=True, exist_ok=True)
27
-
28
  logger.success(f"Saving to: [{self.output_path}]")
29
-
30
  with open(self.output_path, "wb") as wf:
31
  wf.write(self.request_response.content)
32
 
33
- def fetch(self, url):
34
  self.url = url
35
- self.send_request()
36
- self.save_response()
 
 
 
 
 
 
 
 
 
 
37
  return self.output_path
38
 
39
 
 
1
  import requests
2
+ import tldextract
3
  from pathlib import Path
4
  from utils.enver import enver
5
  from utils.logger import logger
6
  from networks.filepath_converter import UrlToFilepathConverter
7
 
8
+ IGNORE_HOSTS = ["weibo.com"]
9
+
10
 
11
  class HTMLFetcher:
12
  def __init__(self):
13
  self.enver = enver
14
  self.enver.set_envs(proxies=True)
15
+ self.filepath_converter = UrlToFilepathConverter()
16
+
17
+ def is_ignored_host(self, url):
18
+ self.host = tldextract.extract(url).registered_domain
19
+ if self.host in IGNORE_HOSTS:
20
+ return True
21
+ else:
22
+ return False
23
 
24
  def send_request(self):
 
25
  self.request_response = requests.get(
26
  url=self.url,
27
  headers={
 
31
  )
32
 
33
  def save_response(self):
 
34
  if not self.output_path.exists():
35
  self.output_path.parent.mkdir(parents=True, exist_ok=True)
 
36
  logger.success(f"Saving to: [{self.output_path}]")
 
37
  with open(self.output_path, "wb") as wf:
38
  wf.write(self.request_response.content)
39
 
40
+ def fetch(self, url, overwrite=False):
41
  self.url = url
42
+ logger.note(f"Fetching: [{self.url}]")
43
+ self.output_path = self.filepath_converter.convert(self.url)
44
+
45
+ if self.is_ignored_host(self.url):
46
+ logger.warn(f"Ignore host: [{self.host}]")
47
+ return self.output_path
48
+
49
+ if self.output_path.exists() and not overwrite:
50
+ logger.success(f"HTML existed: [{self.output_path}]")
51
+ else:
52
+ self.send_request()
53
+ self.save_response()
54
  return self.output_path
55
 
56