| |
| |
| |
| |
|
|
| import numpy as np |
| import os |
| import tgt |
|
|
|
|
def get_alignment(tier, cfg):
    """Extract phones and per-phone frame durations from a TextGrid tier.

    Leading silences are skipped entirely and trailing silences are trimmed
    off; internal silences between real phones are kept.

    Args:
        tier: a tgt interval tier (iterated via its ``_objects`` intervals,
            each with ``start_time``, ``end_time``, ``text``).
        cfg: mapping with ``"sample_rate"`` and ``"hop_size"`` entries.

    Returns:
        ``(phones, durations, start_time, end_time)`` where ``durations`` are
        integer frame counts aligned with ``phones``, and the times bound the
        non-silence region in seconds.
    """
    sr = cfg["sample_rate"]
    hop = cfg["hop_size"]

    silences = {"sil", "sp", "spn"}

    phones, durations = [], []
    start_time = end_time = 0
    last_phone_end = 0  # index one past the last non-silence phone collected

    for interval in tier._objects:
        s, e, label = interval.start_time, interval.end_time, interval.text

        # Skip leading silences; the first real phone fixes start_time.
        if not phones:
            if label in silences:
                continue
            start_time = s

        phones.append(label)
        if label not in silences:
            # Ordinary phone: extend the non-silence region.
            end_time = e
            last_phone_end = len(phones)

        # Frame count from rounded frame boundaries, so adjacent phones
        # neither overlap nor leave gaps.
        durations.append(
            int(np.round(e * sr / hop) - np.round(s * sr / hop))
        )

    # Trim trailing silences past the last real phone.
    return phones[:last_phone_end], durations[:last_phone_end], start_time, end_time
|
|
|
|
def get_duration(utt, wav, cfg):
    """Load the MFA TextGrid for an utterance and compute phone durations.

    Args:
        utt: utterance metadata dict with ``"Singer"``, ``"Uid"`` and
            ``"Dataset"`` keys.
        wav: unused; kept for signature compatibility with existing callers.
        cfg: config object with a ``processed_dir`` attribute and a
            ``"sample_rate"`` entry.

    Returns:
        ``(durations, text, start_sample, end_sample)`` where ``durations``
        are per-phone frame counts, ``text`` is the phone sequence formatted
        as ``"{p1 p2 ...}"``, and the sample indices bound the non-silence
        region — or ``None`` when the alignment has no non-silence phones
        (``start >= end``).
    """
    speaker = utt["Singer"]
    basename = utt["Uid"]
    dataset = utt["Dataset"]
    sample_rate = cfg["sample_rate"]

    # NOTE(review): the original also built an unused wav path and read the
    # raw transcript from the matching .lab file without using it, which made
    # this fail when the .lab file was absent; both dead steps are removed.
    tg_path = os.path.join(
        cfg.processed_dir, dataset, "TextGrid", speaker, "{}.TextGrid".format(basename)
    )

    # Parse the alignment and trim leading/trailing silences.
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name("phones"), cfg
    )
    text = "{" + " ".join(phone) + "}"
    if start >= end:
        return None

    return duration, text, int(sample_rate * start), int(sample_rate * end)
|
|