Spaces:

hanz245
/

ocr

Running

ocr / fix_annotations.py

set up

7111e1a 8 days ago

1.23 kB

	import json, os

	# Maps any image path to its correct form subfolder.
	# FIXED: was only handling form1a/form2a — missed form3a and form90.
	def detect_folder(image_path):
	    """Return the form subfolder name that an image path belongs to.

	    The known folder names are tried in a fixed order and the first one
	    that occurs anywhere in ``image_path`` (plain substring match) wins.

	    Args:
	        image_path: Path string of an annotated image.

	    Returns:
	        One of ``'form1a'``, ``'form2a'``, ``'form3a'`` or ``'form90'``;
	        ``'form1a'`` when none of the known names occurs in the path.
	    """
	    for form in ('form1a', 'form2a', 'form3a', 'form90'):
	        if form in image_path:
	            return form
	    return 'form1a'  # safe fallback

	# Rewrite each split's annotation file in place so that every entry has the
	# shape {'image_path': '<form folder>/<file name>', 'text': <text>}.
	for split in ['train', 'val']:
	    ann_file = f'data/{split}_annotations.json'
	    if not os.path.exists(ann_file):
	        print(f'SKIP: {ann_file} not found')
	        continue

	    # Be explicit about UTF-8: the default text encoding is platform
	    # dependent and would otherwise break on non-ASCII text.
	    with open(ann_file, encoding='utf-8') as f:
	        data = json.load(f)

	    fixed = []
	    skipped = 0
	    for d in data:
	        # Support both old key names ('image'/'label') and new ('image_path'/'text')
	        image_val = d.get('image') or d.get('image_path', '')
	        text_val = d.get('label') or d.get('text', '')

	        # Entries lacking an image path or a text are dropped from the output
	        # and only counted.
	        if not image_val or not text_val:
	            skipped += 1
	            continue

	        # os.path.basename only splits on '/' on POSIX; normalise backslashes
	        # first so Windows-style paths also reduce to the bare file name.
	        filename = os.path.basename(image_val.replace('\\', '/'))
	        folder = detect_folder(image_val)
	        fixed.append({'image_path': f'{folder}/{filename}', 'text': text_val})

	    # Overwrites the original annotation file with the normalised entries.
	    with open(ann_file, 'w', encoding='utf-8') as f:
	        json.dump(fixed, f, indent=2)

	    print(f'{split}: {len(fixed)} fixed, {skipped} skipped')

	print('Done!')