import json
import os


def detect_folder(image_path: str) -> str:
    """Return the form folder name that appears in *image_path*.

    The known folder names are tried in a fixed order (``form1a``,
    ``form2a``, ``form3a``, ``form90``) with a plain substring test against
    the whole path, so the first name in that order that occurs anywhere in
    the path wins. When none of them occurs, ``'form1a'`` is returned as the
    fallback.
    """
    for form in ('form1a', 'form2a', 'form3a', 'form90'):
        if form in image_path:
            return form
    # No known folder name found in the path: fall back to the first form.
    return 'form1a'


# Rewrite each split's annotation file so that every entry has the shape
# {'image_path': '<folder>/<filename>', 'text': <label>}.
for split in ['train', 'val']:
    ann_file = f'data/{split}_annotations.json'
    if not os.path.exists(ann_file):
        print(f'SKIP: {ann_file} not found')
        continue

    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(ann_file, encoding='utf-8') as f:
        data = json.load(f)

    if not isinstance(data, list):
        # Iterating anything but a list of entries would fail on `.get`;
        # leave the file untouched instead of crashing.
        print(f'SKIP: {ann_file} is not a list of annotations')
        continue

    fixed = []
    skipped = 0
    for d in data:
        if not isinstance(d, dict):
            # A non-dict entry carries neither an image nor a text field.
            skipped += 1
            continue

        # Accept both key spellings: 'image'/'label' and 'image_path'/'text'.
        image_val = d.get('image') or d.get('image_path', '')
        text_val = d.get('label') or d.get('text', '')

        # Entries missing either field are dropped from the output.
        if not image_val or not text_val:
            skipped += 1
            continue

        # Keep only the file name and prefix it with the detected folder.
        filename = os.path.basename(image_val)
        folder = detect_folder(image_val)
        fixed.append({'image_path': f'{folder}/{filename}', 'text': text_val})

    # Serialize to a sibling temp file first and then swap it in, so a
    # failure during the dump cannot leave a truncated annotation file.
    tmp_file = f'{ann_file}.tmp'
    with open(tmp_file, 'w', encoding='utf-8') as f:
        json.dump(fixed, f, indent=2)
    os.replace(tmp_file, ann_file)

    print(f'{split}: {len(fixed)} fixed, {skipped} skipped')

print('Done!')