"""Patch n.py to support scraper streaming.

The patch makes two edits to the file at ``SRC``:

1. inserts ``from stream_loader import ScraperStreamDataset`` right after the
   first ``from datasets import ...`` line, and
2. prepends a scraper branch to ``_open_stream_one``.

A timestamped backup of ``SRC`` is written immediately before the file is
modified.  If either anchor is missing the file is left untouched and the
script exits with a non-zero status instead of reporting success.
"""
import re
import shutil
import sys
from datetime import datetime
from typing import Optional

SRC = "/workspace/n.py"

# Presence of this name anywhere in SRC means the patch was already applied.
PATCH_MARKER = "ScraperStreamDataset"

IMPORT_LINE = "from stream_loader import ScraperStreamDataset\n"

# NOTE(review): the function-body indentation inside these two literals is
# assumed to be 4 spaces; OLD_FUNC must match the text in n.py exactly or the
# patch is refused -- confirm against the target file.
OLD_FUNC = '''def _open_stream_one(ds_name: str, seed: int, streaming: bool = True):
    dc = DownloadConfig(max_retries=5, use_etag=True, resume_download=True)'''

NEW_FUNC = '''def _open_stream_one(ds_name: str, seed: int, streaming: bool = True):
    # Custom scraper streaming support
    if ds_name == "scraper" or ds_name.startswith("http://"):
        url = ds_name if ds_name.startswith("http://") else "http://localhost:8888"
        print(f"[stream] Using scraper: {url}")
        return iter(ScraperStreamDataset(server_url=url, batch_size=100))
    dc = DownloadConfig(max_retries=5, use_etag=True, resume_download=True)'''


class PatchError(Exception):
    """Raised when the source text lacks an anchor the patch depends on."""


def patch_source(code: str) -> Optional[str]:
    """Return *code* with the scraper-streaming patch applied.

    Args:
        code: Full text of the module to patch.

    Returns:
        The patched text, or ``None`` when *code* already contains
        ``PATCH_MARKER`` (nothing to do).

    Raises:
        PatchError: If no ``from datasets import ...`` line is found, or if
            the expected ``_open_stream_one`` header is not present.
    """
    if PATCH_MARKER in code:
        return None

    # Check the function anchor first so a failure never leaves a half-applied
    # result (import added but function untouched).
    if OLD_FUNC not in code:
        raise PatchError("expected _open_stream_one definition not found")

    # A callable replacement avoids backslash-escape processing of the inserted
    # text; subn reports whether the anchor line was actually found.
    # Only single-line ``from datasets import ...`` statements are handled.
    with_import, n_inserted = re.subn(
        r'(from datasets import[^\n]+\n)',
        lambda m: m.group(1) + IMPORT_LINE,
        code,
        count=1,
    )
    if n_inserted == 0:
        raise PatchError("no 'from datasets import ...' line to anchor the import")

    return with_import.replace(OLD_FUNC, NEW_FUNC)


def main() -> int:
    """Patch the file at ``SRC`` in place and return a process exit status.

    Returns:
        0 when the file was patched or was already patched, 1 when the patch
        could not be applied (the file is not modified in that case).
    """
    with open(SRC, 'r', encoding='utf-8') as f:
        code = f.read()

    try:
        patched = patch_source(code)
    except PatchError as err:
        print(f"Patch failed: {err}", file=sys.stderr)
        return 1

    if patched is None:
        print("Already patched!")
        return 0

    # Back up only when the file is really about to change, so repeated no-op
    # runs do not pile up backup copies.
    shutil.copy(SRC, f"{SRC}.backup.{datetime.now().strftime('%Y%m%d_%H%M%S')}")

    with open(SRC, 'w', encoding='utf-8') as f:
        f.write(patched)

    print("Patched successfully!")
    print("Use --source scraper or --source http://localhost:8888 to use scraped data")
    return 0


if __name__ == "__main__":
    sys.exit(main())