ocr / fix_annotations.py
hanz245's picture
set up
7111e1a
import json, os
# Maps any image path to its correct form subfolder.
# FIXED: was only handling form1a/form2a — missed form3a and form90.
def detect_folder(image_path):
    """Return the form subfolder name mentioned in *image_path*.

    The known form names are tried in a fixed order and the first one that
    occurs anywhere in the path (plain substring match) is returned. A path
    that mentions none of them maps to 'form1a'.
    """
    known_forms = ('form1a', 'form2a', 'form3a', 'form90')
    matches = (name for name in known_forms if name in image_path)
    return next(matches, 'form1a')  # safe fallback
# Rewrite each split's annotation file in place so that every kept entry has
# the shape {'image_path': '<form folder>/<file name>', 'text': <label>}.
for split in ['train', 'val']:
    ann_file = f'data/{split}_annotations.json'
    if not os.path.exists(ann_file):
        print(f'SKIP: {ann_file} not found')
        continue

    # Read as UTF-8 explicitly: label text may be non-ASCII, and the platform
    # default encoding (e.g. a legacy code page) could fail or mis-decode it.
    with open(ann_file, encoding='utf-8') as f:
        data = json.load(f)

    fixed = []
    skipped = 0
    for d in data:
        # Support both old key names ('image'/'label') and new ('image_path'/'text')
        image_val = d.get('image') or d.get('image_path', '')
        text_val = d.get('label') or d.get('text', '')
        # Entries missing either the image reference or the label are dropped.
        if not image_val or not text_val:
            skipped += 1
            continue
        # Keep only the file name and re-root it under the detected form folder.
        filename = os.path.basename(image_val)
        folder = detect_folder(image_val)
        fixed.append({'image_path': f'{folder}/{filename}', 'text': text_val})

    # Dump to a sibling temp file and swap it in afterwards, so an interruption
    # while writing cannot leave the only copy of the annotations truncated.
    tmp_file = f'{ann_file}.tmp'
    with open(tmp_file, 'w', encoding='utf-8') as f:
        json.dump(fixed, f, indent=2)
    os.replace(tmp_file, ann_file)

    print(f'{split}: {len(fixed)} fixed, {skipped} skipped')
print('Done!')