#!/usr/bin/env python3
"""
add_chid_definitions.py

Full workflow:
1. Load ChiD dataset (chid_idiom_reference.json)
2. Augment with English definitions using pycccedict
3. Save augmented dataset (chid_augmented_strings.json)
"""

import json
import os

from pycccedict.cccedict import CcCedict

# --- CONFIG ---
DIRECTORY = "./idiom_dataset"
INPUT_FILE = f"{DIRECTORY}/chid_idiom_reference.json"  # input ChiD dataset
AUGMENTED_FILE = f"{DIRECTORY}/chid_augmented_strings.json"  # augmented dataset


def _build_entries(idioms, cc_dict):
    """Return one "idiom: def1; def2" string per idiom that has definitions.

    Args:
        idioms: Iterable of idioms to look up.
        cc_dict: Object exposing ``get_definitions(idiom)``; here a CcCedict.

    Returns:
        A list of formatted strings, in the iteration order of ``idioms``.
        Idioms whose lookup yields a falsy result (e.g. ``None`` or an empty
        list) are skipped.
    """
    entries = []
    for idiom in idioms:
        definitions = cc_dict.get_definitions(idiom)
        if not definitions:
            # Nothing to attach to this idiom, so leave it out of the output.
            continue
        entries.append(f"{idiom}: {'; '.join(definitions)}")
    return entries


def main():
    """Load the ChiD idioms, attach English definitions, and save the result.

    Reads INPUT_FILE, looks every idiom up through pycccedict, and writes the
    formatted "idiom: definitions" strings to AUGMENTED_FILE as a JSON array.

    Raises:
        FileNotFoundError: If INPUT_FILE does not exist.
    """
    # Check input file exists
    if not os.path.exists(INPUT_FILE):
        raise FileNotFoundError(f"Input file '{INPUT_FILE}' not found.")

    # Load ChiD dataset
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        chid_data = json.load(f)

    # Initialize pycccedict
    cc_dict = CcCedict()

    # --- Step 1: Augment ChiD with English definitions ---
    chid_list = _build_entries(chid_data, cc_dict)

    # Save augmented dataset; ensure_ascii=False keeps the Chinese characters
    # readable in the output file instead of \uXXXX escapes.
    with open(AUGMENTED_FILE, "w", encoding="utf-8") as f:
        json.dump(chid_list, f, ensure_ascii=False, indent=2)

    # --- Summary ---
    # Report the loaded and augmented counts separately: idioms without a
    # dictionary entry are dropped, so the two numbers can differ.
    print(f"Total idioms loaded: {len(chid_data)}")
    print(f"Idioms with definitions: {len(chid_list)}")
    print(f"Augmented dataset saved: {AUGMENTED_FILE}")


if __name__ == "__main__":
    main()