OpenTransformer committed
Commit 389595f · verified · 1 Parent(s): 64ffeb3

Upload crawler.py with huggingface_hub

Files changed (1):
  crawler.py  +96 -0
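
The commit message says the file was uploaded with huggingface_hub. A minimal sketch of what such an upload call looks like (the repo_id below is a placeholder, not taken from this page):

    # Sketch only: upload crawler.py to a Hugging Face repo via huggingface_hub.
    # "OpenTransformer/<repo>" is a placeholder; substitute the real repo_id.
    from huggingface_hub import HfApi

    api = HfApi()  # picks up the token stored by `huggingface-cli login`
    api.upload_file(
        path_or_fileobj="crawler.py",
        path_in_repo="crawler.py",
        repo_id="OpenTransformer/<repo>",
        commit_message="Upload crawler.py with huggingface_hub",
    )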
crawler.py ADDED
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+"""
+GODDESS CRAWLER - High-performance async web scraper
+Sellable tool - £20 one-time
+
+Usage: python crawler.py seeds.txt output_dir/ --workers 100
+"""
+import asyncio
+import aiohttp
+import argparse
+import os
+import re
+from pathlib import Path
+from typing import Set
+import time
+
+class GoddessCrawler:
+    def __init__(self, output_dir: str, workers: int = 100, timeout: int = 15):
+        self.output = Path(output_dir)
+        self.output.mkdir(exist_ok=True)
+        self.workers = workers
+        self.timeout = aiohttp.ClientTimeout(total=timeout)
+        self.visited: Set[str] = set()
+        self.count = 0
+        self.bytes = 0
+        self.errors = 0
+        self.start = time.time()
+
+    def strip_html(self, html: str) -> str:
+        html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL|re.I)
+        html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL|re.I)
+        html = re.sub(r'<[^>]+>', ' ', html)  # drop any remaining tags
+        html = re.sub(r'\s+', ' ', html)
+        return html.strip()
+
+    async def fetch(self, session: aiohttp.ClientSession, url: str) -> str | None:
+        if url in self.visited:
+            return None
+        self.visited.add(url)  # mark before fetching so duplicate seeds are skipped
+        try:
+            async with session.get(url) as r:
+                if r.status == 200 and 'text' in r.content_type:
+                    html = await r.text()
+                    return self.strip_html(html)
+        except Exception:
+            self.errors += 1
+        return None
+
+    async def process(self, session: aiohttp.ClientSession, url: str):
+        text = await self.fetch(session, url)
+        if text and len(text) > 200:  # skip near-empty pages
+            self.count += 1
+            path = self.output / f"p_{self.count:08d}.txt"
+            content = f"URL: {url}\n\n{text}"
+            path.write_text(content)
+            self.bytes += len(content)
+
+    async def crawl(self, seeds: list[str]):
+        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/121.0.0.0"}
+        conn = aiohttp.TCPConnector(limit=self.workers, limit_per_host=10)
+        async with aiohttp.ClientSession(connector=conn, timeout=self.timeout, headers=headers) as session:
+            sem = asyncio.Semaphore(self.workers)  # cap concurrent requests
+            async def bounded(url):
+                async with sem:
+                    await self.process(session, url)
+
+            # Status printer
+            async def status():
+                while True:
+                    await asyncio.sleep(15)
+                    elapsed = time.time() - self.start
+                    rate = self.bytes / elapsed / 1e6
+                    print(f"[{elapsed:.0f}s] {self.count} pages | {self.bytes/1e9:.2f}GB | {self.errors} err | {rate:.1f}MB/s")
+
+            status_task = asyncio.create_task(status())
+            await asyncio.gather(*[bounded(url) for url in seeds])
+            status_task.cancel()
+
+        print(f"\nDone! {self.count} pages, {self.bytes/1e9:.2f}GB")
+
+def main():
+    p = argparse.ArgumentParser(description="High-performance web crawler")
+    p.add_argument("seeds", help="File with URLs, one per line")
+    p.add_argument("output", help="Output directory")
+    p.add_argument("--workers", type=int, default=100, help="Concurrent connections")
+    p.add_argument("--timeout", type=int, default=15, help="Request timeout")
+    args = p.parse_args()
+
+    seeds = Path(args.seeds).read_text().strip().split('\n')
+    print(f"Loaded {len(seeds)} seeds, {args.workers} workers")
+
+    crawler = GoddessCrawler(args.output, args.workers, args.timeout)
+    asyncio.run(crawler.crawl(seeds))
+
+if __name__ == "__main__":
+    main()
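
A minimal usage sketch, assuming the script above is saved as crawler.py and imported from the working directory (the seed URLs and output directory name are placeholders, not taken from the commit):

    # CLI invocation from the docstring:
    #   python crawler.py seeds.txt output_dir/ --workers 100
    #
    # Programmatic use of the class defined in the diff above:
    import asyncio
    from crawler import GoddessCrawler

    seeds = ["https://example.com", "https://example.org"]  # placeholder URLs
    crawler = GoddessCrawler("output_dir/", workers=20, timeout=15)
    asyncio.run(crawler.crawl(seeds))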