"""
Quick verification for reprocess_tokenized_from_clean.py.
"""
| from __future__ import annotations | |
| import sys | |
| from pathlib import Path | |
def _ensure_on_sys_path(directory: Path) -> None:
    """Put *directory* at the front of ``sys.path`` unless it is already listed."""
    entry = str(directory)
    if entry not in sys.path:
        sys.path.insert(0, entry)


# The directory one level above this file's own directory. It is made
# importable so the `scripts.*` import below resolves regardless of where the
# interpreter was started.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
_ensure_on_sys_path(PROJECT_ROOT)
| from scripts.reprocess_tokenized_from_clean import main as reprocess_main # noqa: E402 | |
if __name__ == "__main__":
    # Smoke-test the reprocess entry point on a small sample (500 records).
    # The CLI is driven by overwriting sys.argv before calling
    # reprocess_main(); presumably that function parses sys.argv itself
    # (its implementation is not visible from this file).
    #
    # The config path is anchored at PROJECT_ROOT instead of being left
    # relative to the current working directory, so the check finds its
    # config even when launched from outside the repository root -- in line
    # with the sys.path bootstrap, which already makes the import
    # location-independent.
    # NOTE(review): relative paths *inside* the YAML are still resolved by
    # reprocess_main(); confirm they do not depend on the working directory.
    config_path = PROJECT_ROOT / "configs" / "component3_reprocess_from_clean.yaml"
    try:
        sys.argv = [
            "verify_reprocess_tokenized_from_clean.py",
            "--config",
            str(config_path),
            "--max_records",
            "500",
        ]
        reprocess_main()
    except Exception as exc:
        # Top-level boundary: report a short, readable diagnosis and turn any
        # failure into a non-zero exit status. SystemExit raised inside
        # reprocess_main() is not an Exception subclass and passes through.
        print("Reprocess verification failed.")
        print(f"What went wrong: {exc}")
        print("Fix suggestion: verify input clean file and tokenizer path.")
        raise SystemExit(1) from exc
    else:
        # Reached only when reprocess_main() returned without raising.
        print("")
        print("Reprocess verification passed.")