import json def get_event_synonyms(): file_path = "./utils/merge_content.json" with open(file_path, "r", encoding="utf-8") as file: data = json.load(file) result = {} for item in data: event = item.get("event") phrases = item.get("phrases", []) result[event] = phrases return result import random import re def replace_event_synonyms(caption, onset): """ Replace event names in both caption(TCC) and onset(TDC) string with corresponding free text descriptions. Args: caption (str): Caption text containing event names. onset (str): Onset string, formatted as "event__start-end--event2__start-end". Returns: new_caption (str): Caption with event names replaced by descriptions. new_onset (str): Onset string with event names replaced by descriptions. Notes: - Synonyms are fetched using get_event_synonyms(). - For each event, a random synonym is chosen. - All occurrences in caption (with correct pluralization) and onset are replaced. """ event_pattern = r"([a-zA-Z_()\s]+?)__((?:[\d\.\-]+_?)+)(?=--|$)" events = re.findall(event_pattern, onset) synonyms_dict = get_event_synonyms() replacements = {} # Choose a random synonym for each unique event for event_name, _ in events: if event_name not in replacements: candidates = synonyms_dict.get(event_name, [event_name]) replacements[event_name] = random.choice(candidates) # Replace event names in the onset string new_onset = "--".join([ f"{replacements[event]}__{timestamps}" for event, timestamps in events ]) # Replace event names in the caption, handling plural forms and case new_caption = caption for orig, repl in replacements.items(): orig_space = orig.replace("_", " ") repl_space = repl.replace("_", " ") escaped_orig_space = re.escape(orig_space) pattern = rf"(?