"""Quick validation of with_anomalies.jsonl before Kaggle upload."""
import json
import os


# Load the dataset: the file is read as JSON Lines, one document per line.
filepath = "data/training/with_anomalies.jsonl"
docs = []
with open(filepath, "r", encoding="utf-8") as f:
    for line in f:
        record = line.strip()
        # Blank lines (typically a trailing newline at the end of the file)
        # hold no document; json.loads("") would raise JSONDecodeError.
        if not record:
            continue
        docs.append(json.loads(record))


# Headline numbers: how many documents were parsed and the on-disk size.
print(f"Total documents: {len(docs)}")
print(f"File size: {os.path.getsize(filepath)/1024:.1f} KB")
print()


# --- Preview the first document to sanity-check the record layout ---
# An empty dataset has no first document; fall back to an empty record so the
# preview prints placeholders instead of raising IndexError on docs[0].
d = docs[0] if docs else {}
print("=== FIRST DOC STRUCTURE ===")
print(f"Top-level keys: {list(d.keys())}")
print(f"doc_type: {d.get('doc_type', 'MISSING')}")
print(f"raw_text length: {len(d.get('raw_text', ''))} chars")
gt = d.get("ground_truth", {})
print(f"ground_truth keys: {list(gt.keys())}")
print(f" common keys: {list(gt.get('common', {}).keys())}")
print(f" line_items count: {len(gt.get('line_items', []))}")
print(f" type_specific keys: {list(gt.get('type_specific', {}).keys())}")
print(f" flags count: {len(gt.get('flags', []))}")
print(f" confidence_score: {gt.get('confidence_score', 'MISSING')}")
print()


# Show the beginning of the raw input text; the slice caps it at 300 chars.
print("=== SAMPLE RAW TEXT (first 300 chars) ===")
print(d.get("raw_text", "")[:300])
print("...")
print()


# Show the beginning of the pretty-printed ground truth. The 600-character cut
# is applied to the serialized JSON string, so it may stop mid-value.
print("=== SAMPLE GROUND TRUTH OUTPUT ===")
print(json.dumps(gt, indent=2)[:600])
print("...")
print()


# --- Tally document types and anomaly flags across the whole dataset ---
types = {}       # doc_type -> number of documents of that type
with_flags = 0   # documents carrying at least one flag
total_flags = 0  # flags summed over all documents
flag_cats = {}   # flag category -> number of flags in that category


for doc in docs:
    dt = doc.get("doc_type", "?")
    types[dt] = types.get(dt, 0) + 1
    gt = doc.get("ground_truth", {})
    flags = gt.get("flags", [])
    if flags:
        with_flags += 1
        total_flags += len(flags)
    # Named "flag" rather than "f" so the loop does not reuse the name of the
    # file handle from the loading step.
    for flag in flags:
        cat = flag.get("category", "?")
        flag_cats[cat] = flag_cats.get(cat, 0) + 1


doc_count = len(docs)
# With no documents there is nothing to divide by; report 0% instead of
# raising ZeroDivisionError.
flagged_pct = 100 * with_flags / doc_count if doc_count else 0

print("=== DISTRIBUTION ===")
# types is only non-empty when at least one document exists, so the division
# inside this loop cannot hit a zero doc_count.
for t, c in sorted(types.items()):
    print(f" {t:<20}: {c:>4} ({100*c/doc_count:.0f}%)")
print()
print(f"Docs with anomaly flags: {with_flags}/{doc_count} ({flagged_pct:.0f}%)")
print(f"Total flags: {total_flags}")
print("Flag categories:")
for cat, cnt in sorted(flag_cats.items()):
    print(f" {cat:<25}: {cnt}")
print()


# --- Validate that every document carries the required fields ---
# Keys every ground_truth object must contain, and keys required inside its
# "common" sub-object, listed in the order the errors are reported.
REQUIRED_GT_KEYS = ("common", "flags", "confidence_score")
REQUIRED_COMMON_KEYS = ("document_type", "total_amount")

errors = 0
for i, doc in enumerate(docs):
    if "raw_text" not in doc:
        print(f" ERROR doc[{i}]: missing raw_text")
        errors += 1
    if "ground_truth" not in doc:
        print(f" ERROR doc[{i}]: missing ground_truth")
        errors += 1
        # Without ground_truth there is nothing further to inspect.
        continue
    gt = doc["ground_truth"]
    # A missing "common" is reported once by the loop below; the empty
    # fallback then also reports each of its required keys as missing.
    common = gt.get("common", {})
    missing = [key for key in REQUIRED_GT_KEYS if key not in gt]
    missing += [key for key in REQUIRED_COMMON_KEYS if key not in common]
    for key in missing:
        print(f" ERROR doc[{i}]: missing {key}")
    errors += len(missing)


if not docs:
    # Zero documents means zero errors, but an empty file must not be
    # reported as ready for upload.
    print("=== VALIDATION: ❌ NO DOCUMENTS FOUND ===")
elif errors == 0:
    # Report the real document count rather than a hard-coded number.
    print(f"=== VALIDATION: ✅ ALL {len(docs)} DOCS PASS ===")
    print("File is READY for Kaggle upload!")
else:
    print(f"=== VALIDATION: ❌ {errors} ERRORS FOUND ===")