Upload crawler.py with huggingface_hub
crawler.py +96 -0
crawler.py
ADDED
@@ -0,0 +1,96 @@
+#!/usr/bin/env python3
+"""
+GODDESS CRAWLER - High-performance async web scraper
+Sellable tool - £20 one-time
+
+Usage: python crawler.py seeds.txt output_dir/ --workers 100
+"""
+import asyncio
+import aiohttp
+import argparse
+import os
+import re
+from pathlib import Path
+from typing import Set
+import time
+
+class GoddessCrawler:
+    def __init__(self, output_dir: str, workers: int = 100, timeout: int = 15):
+        self.output = Path(output_dir)
+        self.output.mkdir(parents=True, exist_ok=True)
+        self.workers = workers
+        self.timeout = aiohttp.ClientTimeout(total=timeout)
+        self.visited: Set[str] = set()  # URLs already requested (dedupe)
+        self.count = 0   # pages saved
+        self.bytes = 0   # total bytes written
+        self.errors = 0  # failed requests
+        self.start = time.time()
+
+    def strip_html(self, html: str) -> str:  # crude regex-based tag stripper
+        html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.I)
+        html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.I)
+        html = re.sub(r'<[^>]+>', ' ', html)
+        html = re.sub(r'\s+', ' ', html)
+        return html.strip()
+
+    async def fetch(self, session: aiohttp.ClientSession, url: str) -> str | None:
+        if url in self.visited:
+            return None
+        self.visited.add(url)
+        try:
+            async with session.get(url) as r:
+                if r.status == 200 and 'text' in r.content_type:
+                    html = await r.text()
+                    return self.strip_html(html)
+        except Exception:
+            self.errors += 1
+        return None
+
+    async def process(self, session: aiohttp.ClientSession, url: str):
+        text = await self.fetch(session, url)
+        if text and len(text) > 200:  # skip near-empty pages
+            self.count += 1
+            path = self.output / f"p_{self.count:08d}.txt"
+            content = f"URL: {url}\n\n{text}"
+            path.write_text(content, encoding="utf-8")
+            self.bytes += len(content)
+
+    async def crawl(self, seeds: list[str]):
+        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/121.0.0.0"}
+        conn = aiohttp.TCPConnector(limit=self.workers, limit_per_host=10)
+        async with aiohttp.ClientSession(connector=conn, timeout=self.timeout, headers=headers) as session:
+            sem = asyncio.Semaphore(self.workers)
+            async def bounded(url):
+                async with sem:
+                    await self.process(session, url)
+
+            # Status printer
+            async def status():
+                while True:
+                    await asyncio.sleep(15)
+                    elapsed = time.time() - self.start
+                    rate = self.bytes / elapsed / 1e6
+                    print(f"[{elapsed:.0f}s] {self.count} pages | {self.bytes/1e9:.2f}GB | {self.errors} err | {rate:.1f}MB/s")
+
+            status_task = asyncio.create_task(status())
+            await asyncio.gather(*[bounded(url) for url in seeds])
+            status_task.cancel()
+
+        print(f"\nDone! {self.count} pages, {self.bytes/1e9:.2f}GB")
+
+def main():
+    p = argparse.ArgumentParser(description="High-performance web crawler")
+    p.add_argument("seeds", help="File with URLs, one per line")
+    p.add_argument("output", help="Output directory")
+    p.add_argument("--workers", type=int, default=100, help="Concurrent connections")
+    p.add_argument("--timeout", type=int, default=15, help="Request timeout (seconds)")
+    args = p.parse_args()
+
+    seeds = Path(args.seeds).read_text(encoding="utf-8").splitlines()
+    print(f"Loaded {len(seeds)} seeds, {args.workers} workers")
+
+    crawler = GoddessCrawler(args.output, args.workers, args.timeout)
+    asyncio.run(crawler.crawl(seeds))
+
+if __name__ == "__main__":
+    main()
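
For a quick smoke test, GoddessCrawler can also be driven directly from Python rather than through the CLI entry point. The snippet below is a minimal sketch, assuming crawler.py is importable from the working directory; the seed URLs, the seeds.txt path, and the output/ directory are placeholders to swap for your own.

# Minimal usage sketch (assumption: crawler.py sits in the working directory).
import asyncio
from pathlib import Path

from crawler import GoddessCrawler

# Placeholder seed URLs; replace with your own list or load them from a file.
seeds = [
    "https://example.com/",
    "https://example.org/",
]
# Optional: persist the seeds in the same one-URL-per-line format the CLI expects.
Path("seeds.txt").write_text("\n".join(seeds), encoding="utf-8")

# Small worker pool and short timeout for a quick local test.
crawler = GoddessCrawler("output/", workers=10, timeout=15)
asyncio.run(crawler.crawl(seeds))

Running this writes one p_XXXXXXXX.txt file per page longer than 200 characters into output/, the same behaviour as invoking the script from the command line.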