Hansimov committed on
Commit
e92817a
1 Parent(s): 876e441

:gem: [Feature] New BatchWebpageFetcher: Fetch multiple urls concurrently

Browse files
Files changed (1) hide show
  1. networks/webpage_fetcher.py +38 -6
networks/webpage_fetcher.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import requests
2
  import tldextract
3
  from pathlib import Path
@@ -53,11 +54,42 @@ class WebpageFetcher:
53
  return self.output_path
54
 
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  if __name__ == "__main__":
57
- url = (
58
- # "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename"
59
- # "https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528"
60
- "https://docs.python.org/zh-cn/3/tutorial/interpreter.html"
 
 
 
 
61
  )
62
- fetcher = WebpageFetcher()
63
- fetcher.fetch(url)
 
1
+ import concurrent.futures
2
  import requests
3
  import tldextract
4
  from pathlib import Path
 
54
  return self.output_path
55
 
56
 
57
+ class BatchWebpageFetcher:
58
+ def __init__(self):
59
+ self.done_count = 0
60
+ self.total_count = 0
61
+
62
+ def fecth_single_webpage(self, url, overwrite=False, output_parent=None):
63
+ webpage_fetcher = WebpageFetcher()
64
+ webpage_fetcher.fetch(url=url, overwrite=overwrite, output_parent=output_parent)
65
+ self.done_count += 1
66
+ logger.success(f"> {self.done_count}/{self.total_count}: {url}")
67
+
68
+ def fetch(self, urls, overwrite=False, output_parent=None):
69
+ self.urls = urls
70
+ self.total_count = len(self.urls)
71
+ with concurrent.futures.ThreadPoolExecutor() as executor:
72
+ futures = [
73
+ executor.submit(
74
+ self.fecth_single_webpage,
75
+ url=url,
76
+ overwrite=overwrite,
77
+ output_parent=output_parent,
78
+ )
79
+ for url in urls
80
+ ]
81
+
82
+ for idx, future in enumerate(concurrent.futures.as_completed(futures)):
83
+ result = future.result()
84
+
85
+
86
  if __name__ == "__main__":
87
+ urls = [
88
+ "https://stackoverflow.com/questions/295135/turn-a-string-into-a-valid-filename",
89
+ "https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528",
90
+ "https://docs.python.org/zh-cn/3/tutorial/interpreter.html",
91
+ ]
92
+ batch_webpage_fetcher = BatchWebpageFetcher()
93
+ batch_webpage_fetcher.fetch(
94
+ urls=urls, overwrite=True, output_parent="python tutorials"
95
  )