Spaces:
No application file
No application file
| #!/usr/bin/env python3 | |
| """ | |
| add_chid_definitions.py | |
| Full workflow: | |
| 1. Load ChiD dataset (chid_idiom_reference.json) | |
| 2. Augment with English definitions using pycccedict | |
| 3. Save augmented dataset (chid_augmented_strings.json) | |
| """ | |
| import json | |
| import os | |
| from pycccedict.cccedict import CcCedict | |
# --- CONFIG ---
# Both paths below are built from DIRECTORY, which is relative to the
# current working directory.
DIRECTORY = "./idiom_dataset"
INPUT_FILE = f"{DIRECTORY}/chid_idiom_reference.json"  # input ChiD dataset
AUGMENTED_FILE = f"{DIRECTORY}/chid_augmented_strings.json"  # augmented dataset (JSON list of strings)
def main() -> None:
    """Augment the ChiD idiom data with English definitions and save it.

    Reads the JSON document at ``INPUT_FILE``, looks up every idiom with
    ``CcCedict.get_definitions``, and writes a JSON list of
    ``"<idiom>: <def1>; <def2>; ..."`` strings to ``AUGMENTED_FILE``.
    Idioms whose lookup returns nothing are left out of the output.

    Raises:
        FileNotFoundError: If ``INPUT_FILE`` does not exist.
    """
    # Fail early with a message that names the missing path.
    if not os.path.exists(INPUT_FILE):
        raise FileNotFoundError(f"Input file '{INPUT_FILE}' not found.")

    # Load ChiD dataset.
    # NOTE(review): the data is iterated directly below, so it is presumably
    # a list of idiom strings (or a dict keyed by idiom) -- confirm against
    # the dataset.
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        chid_data = json.load(f)

    # Initialize pycccedict
    cc_dict = CcCedict()

    # --- Step 1: Augment ChiD with English definitions ---
    chid_list = []
    for idiom in chid_data:
        definitions = cc_dict.get_definitions(idiom)
        # Skip idioms for which the dictionary has no definitions.
        if definitions:
            chid_list.append(f"{idiom}: {'; '.join(definitions)}")

    # Save augmented dataset. ensure_ascii=False writes non-ASCII characters
    # as-is instead of \uXXXX escapes.
    with open(AUGMENTED_FILE, "w", encoding="utf-8") as f:
        json.dump(chid_list, f, ensure_ascii=False, indent=2)

    # --- Summary ---
    # Report the loaded total and the augmented count separately: only idioms
    # with at least one definition make it into chid_list.
    print(f"Total idioms loaded: {len(chid_data)}")
    print(f"Idioms with definitions: {len(chid_list)}")
    print(f"Augmented dataset saved: {AUGMENTED_FILE}")
# Run the workflow only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()