chinese-idioms / scripts /add-chid-definitions.py
Mel Seto
fix git env
1adbbaa
#!/usr/bin/env python3
"""
add_chid_definitions.py
Full workflow:
1. Load ChiD dataset (chid_idiom_reference.json)
2. Augment with English definitions using pycccedict
3. Save augmented dataset (chid_augmented_strings.json)
"""
import json
import os
from pycccedict.cccedict import CcCedict
# --- CONFIG ---
# All paths are relative to the current working directory.
DIRECTORY = "./idiom_dataset"
# Source ChiD idiom reference that the script reads.
INPUT_FILE = DIRECTORY + "/chid_idiom_reference.json"
# Destination file for the augmented idiom strings.
AUGMENTED_FILE = DIRECTORY + "/chid_augmented_strings.json"
def main():
    """Augment the ChiD idiom collection with English definitions.

    Loads the JSON at ``INPUT_FILE``, looks up every item it iterates over
    with pycccedict, and writes the items that have definitions to
    ``AUGMENTED_FILE`` as a JSON list of ``"<idiom>: <def1>; <def2>"``
    strings. Items whose lookup yields nothing are left out of the output.

    Raises:
        FileNotFoundError: If ``INPUT_FILE`` does not exist.
    """
    # Fail early with a clear message before doing any other work.
    if not os.path.exists(INPUT_FILE):
        raise FileNotFoundError(f"Input file '{INPUT_FILE}' not found.")

    # Load ChiD dataset
    with open(INPUT_FILE, "r", encoding="utf-8") as f:
        chid_data = json.load(f)

    # Initialize pycccedict
    cc_dict = CcCedict()

    # --- Step 1: Augment ChiD with English definitions ---
    chid_list = []
    for idiom in chid_data:
        definitions = cc_dict.get_definitions(idiom)
        # Skip idioms for which the lookup returned a falsy result.
        if definitions:
            chid_list.append(f"{idiom}: {'; '.join(definitions)}")

    # Save augmented dataset (ensure_ascii=False keeps Chinese text readable)
    with open(AUGMENTED_FILE, "w", encoding="utf-8") as f:
        json.dump(chid_list, f, ensure_ascii=False, indent=2)

    # --- Summary ---
    # Report the loaded count and the augmented count separately; the
    # augmented list only holds idioms that had at least one definition.
    print(f"Total idioms loaded: {len(chid_data)}")
    print(f"Idioms with definitions: {len(chid_list)}")
    print(f"Augmented dataset saved: {AUGMENTED_FILE}")
# Run the workflow only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()