|
|
import json |
|
|
|
|
|
def get_event_synonyms(file_path="./utils/merge_content.json"):
    """Load the event -> phrases mapping from a JSON file.

    The file is expected to contain a JSON array of objects. For every
    object, the value under ``"event"`` becomes a key of the result and the
    value under ``"phrases"`` becomes its value.

    Args:
        file_path (str): Location of the JSON file. Defaults to the path the
            function has always used, which is resolved relative to the
            current working directory.

    Returns:
        dict: Mapping of event name to its phrases. An object without a
            ``"phrases"`` key maps to an empty list; an object without an
            ``"event"`` key is stored under ``None``. If the same event
            appears more than once, the last entry wins.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    return {item.get("event"): item.get("phrases", []) for item in data}
|
|
|
|
|
|
|
|
import random |
|
|
import re |
|
|
|
|
|
def replace_event_synonyms(caption, onset, synonyms=None):
    """Replace event names in a caption (TCC) and an onset string (TDC).

    Event names are parsed from ``onset``; one replacement phrase is drawn
    at random per distinct event and applied to both strings.

    Args:
        caption (str): Caption text containing event names, written with
            spaces where the onset string uses underscores.
        onset (str): Onset string, formatted as
            "event__start-end--event2__start-end".
        synonyms (dict | None): Optional preloaded mapping of event name to
            a list of candidate phrases. When ``None`` (the default) the
            mapping is loaded with ``get_event_synonyms()``; passing it in
            avoids re-reading the JSON file on every call.

    Returns:
        new_caption (str): Caption with event names replaced. The result
            goes through ``str.capitalize()``, so its first character is
            upper-cased and every other character is lower-cased.
        new_onset (str): Onset string with event names replaced. It is
            rebuilt solely from the segments the onset pattern recognised.

    Notes:
        - An event that is missing from the mapping, or whose phrase list
          is empty, keeps its own name.
        - The caption is rewritten in a single pass, so text inserted for
          one event is never rewritten again because it happens to contain
          the name of another event. Longer event names are tried first so
          that "dog bark" is not broken up by a shorter event "dog".
        - Caption matching is case-insensitive and accepts an optional
          "s"/"es" suffix, which ``match_plural`` copies onto the
          replacement.
    """
    event_pattern = r"([a-zA-Z_()\s]+?)__((?:[\d\.\-]+_?)+)(?=--|$)"
    events = re.findall(event_pattern, onset)

    if synonyms is None:
        synonyms = get_event_synonyms()

    # Pick one phrase per distinct event so every occurrence agrees.
    replacements = {}
    for event_name, _ in events:
        if event_name not in replacements:
            # `or` also covers an event listed with an empty phrase list,
            # which would otherwise make random.choice raise IndexError.
            candidates = synonyms.get(event_name) or [event_name]
            replacements[event_name] = random.choice(candidates)

    new_onset = "--".join(
        f"{replacements[event]}__{timestamps}" for event, timestamps in events
    )

    # Captions spell events with spaces instead of underscores. Keys are
    # case-folded because matching below is case-insensitive; on a
    # collision the first event seen wins.
    caption_replacements = {}
    for orig, repl in replacements.items():
        caption_replacements.setdefault(
            orig.replace("_", " ").casefold(), repl.replace("_", " ")
        )

    new_caption = caption
    if caption_replacements:
        # Longest names first: regex alternation is ordered, so a shorter
        # name must not pre-empt a longer one starting at the same place.
        alternation = "|".join(
            re.escape(name)
            for name in sorted(caption_replacements, key=len, reverse=True)
        )
        # The name alternation is non-capturing so that group(1) stays the
        # optional plural suffix, which is what match_plural reads.
        pattern = rf"(?<!\w)(?:{alternation})(es|s)?(?!\w)"

        def _substitute(match):
            suffix = match.group(1) or ""
            matched_name = match.group(0)
            if suffix:
                matched_name = matched_name[: -len(suffix)]
            repl_space = caption_replacements.get(matched_name.casefold())
            if repl_space is None:
                # Case-insensitive match whose case-folded text is not a
                # known key; leave the caption text untouched.
                return match.group(0)
            return match_plural(match, repl_space)

        new_caption = re.sub(
            pattern, _substitute, new_caption, flags=re.IGNORECASE
        )

    return new_caption.capitalize(), new_onset
|
|
|
|
|
def match_plural(match_obj, replacement):
    """Return the replacement with the plural suffix of the matched word.

    The suffix is copied verbatim from the match; the replacement is not
    re-inflected (a matched "es" is appended as "es" whatever the
    replacement ends with).

    Args:
        match_obj (re.Match): Match whose group 1 is the optional plural
            suffix; group 1 is ``None`` when no suffix was matched.
        replacement (str): Replacement string for the event name.

    Returns:
        str: ``replacement`` followed by the matched suffix, or
            ``replacement`` unchanged when there was no suffix.
    """
    # An optional group that did not participate yields None, not "".
    return replacement + (match_obj.group(1) or "")
|
|
|
|
|
if __name__ == "__main__":
    # Manual demo: print one onset/caption pair before and after the
    # event names are swapped for a randomly chosen phrase.
    sample_onset = "wind_chime__0.78-2.78"
    sample_caption = "wind chime one times"

    print("Original onset:", sample_onset)
    print("Original caption:", sample_caption)

    new_caption, new_onset = replace_event_synonyms(sample_caption, sample_onset)

    print("Modified onset:", new_onset)
    print("Modified caption:", new_caption)