# PicoAudio2/utils/audiotime_event_merge.py
# rookie9's picture
# Upload 77 files
# f582ec6 verified
import json
def get_event_synonyms(file_path="./utils/merge_content.json"):
    """
    Load the event -> phrases mapping from a JSON file.

    Args:
        file_path (str): Path of the JSON file to read. Defaults to
            "./utils/merge_content.json", which is resolved against the
            current working directory.

    Returns:
        dict: Maps each item's "event" value to its "phrases" value
            (an empty list when the item has no "phrases" key).

    Notes:
        - The top level of the JSON document is iterated as a sequence of
          objects, each read with .get("event") / .get("phrases").
        - If the same event appears more than once, the last entry wins.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    return {item.get("event"): item.get("phrases", []) for item in data}
import random
import re
def replace_event_synonyms(caption, onset):
    """
    Replace event names in both caption (TCC) and onset (TDC) strings with
    corresponding free-text descriptions.

    Args:
        caption (str): Caption text containing event names (written with
            spaces where the onset string uses underscores).
        onset (str): Onset string, formatted as
            "event__start-end--event2__start-end".

    Returns:
        new_caption (str): Caption with event names replaced by descriptions,
            passed through str.capitalize() (first character upper-cased,
            the rest lower-cased).
        new_onset (str): Onset string rebuilt from the parsed events with
            event names replaced by descriptions.

    Notes:
        - Synonyms are fetched using get_event_synonyms().
        - One random synonym is chosen per unique event. An event that is
          missing from the synonym table, or whose phrase list is empty,
          keeps its own name.
        - The caption is rewritten in a single regex pass, so text inserted
          for one event is never re-matched by another event's name, and
          longer names are tried before shorter ones that they contain.
        - A trailing "s" / "es" after a matched name is appended to the
          replacement; matching is case-insensitive.
        - Only onset segments that match the expected pattern end up in
          new_onset.
    """
    event_pattern = r"([a-zA-Z_()\s]+?)__((?:[\d\.\-]+_?)+)(?=--|$)"
    events = re.findall(event_pattern, onset)
    synonyms_dict = get_event_synonyms()

    # Choose a random synonym for each unique event. `or` (rather than a
    # .get default) also covers an empty phrase list, on which
    # random.choice would raise IndexError.
    replacements = {}
    for event_name, _ in events:
        if event_name not in replacements:
            candidates = synonyms_dict.get(event_name) or [event_name]
            replacements[event_name] = random.choice(candidates)

    # Replace event names in the onset string
    new_onset = "--".join(
        f"{replacements[event]}__{timestamps}" for event, timestamps in events
    )

    # Caption form of every name (underscores -> spaces). setdefault keeps the
    # first event's synonym if two names collapse to the same caption text.
    caption_map = {}
    for orig, repl in replacements.items():
        caption_map.setdefault(orig.replace("_", " "), repl.replace("_", " "))

    new_caption = caption
    if caption_map:  # an empty alternation would match at every position
        # Longest first so "dog barking" wins over "dog"; sorted() is stable,
        # so equal-length names keep their onset order.
        names = sorted(caption_map, key=len, reverse=True)
        alternation = "|".join(f"({re.escape(name)})" for name in names)
        suffix_group = len(names) + 1
        caption_pattern = re.compile(
            rf"(?<!\w)(?:{alternation})(es|s)?(?!\w)",
            flags=re.IGNORECASE,
        )

        def _substitute(match):
            # Exactly one per-name group participates in a match; its index
            # identifies which event was found regardless of letter case.
            suffix = match.group(suffix_group) or ""
            for index, name in enumerate(names, start=1):
                if match.group(index) is not None:
                    return caption_map[name] + suffix
            return match.group(0)

        new_caption = caption_pattern.sub(_substitute, caption)

    return new_caption.capitalize(), new_onset
def match_plural(match_obj, replacement):
    """
    Append the plural suffix captured by a match to a replacement string.

    Args:
        match_obj (re.Match): Match whose group 1 holds the optional plural
            suffix ("s" or "es"), or None when no suffix was matched.
        replacement (str): Replacement string for the event name.

    Returns:
        str: The replacement followed by the captured suffix, or the
            replacement unchanged when group 1 did not match.
    """
    plural_suffix = match_obj.group(1)
    # An unmatched optional group yields None; treat it as "no suffix".
    return replacement + (plural_suffix if plural_suffix else "")
if __name__ == "__main__":
    # Small demo: rewrite one onset/caption pair and print before and after.
    sample_onset = "wind_chime__0.78-2.78"
    sample_caption = "wind chime one times"
    print("Original onset:", sample_onset)
    print("Original caption:", sample_caption)
    rewritten_caption, rewritten_onset = replace_event_synonyms(
        sample_caption, sample_onset
    )
    print("Modified onset:", rewritten_onset)
    print("Modified caption:", rewritten_caption)