|
|
|
|
|
|
|
|
|
|
|
import numpy as np |
|
import os |
|
import tgt |
|
|
|
|
|
def get_alignment(tier, cfg):
    """Turn an aligned phone tier into phones and per-phone frame durations.

    Leading and trailing silence intervals are dropped; silence intervals
    that sit between two ordinary phones are kept.

    Args:
        tier: Object whose ``_objects`` attribute is an iterable of intervals,
            each exposing ``start_time``, ``end_time`` and ``text``.
        cfg: Mapping providing ``"sample_rate"`` and ``"hop_size"``.

    Returns:
        A tuple ``(phones, durations, start_time, end_time)``:

        * ``phones`` -- interval labels with edge silences removed.
        * ``durations`` -- one ``int`` per entry of ``phones``.
        * ``start_time`` -- ``start_time`` of the first non-silent interval,
          or ``0`` if there is none.
        * ``end_time`` -- ``end_time`` of the last non-silent interval,
          or ``0`` if there is none.
    """
    sample_rate = cfg["sample_rate"]
    hop_size = cfg["hop_size"]

    # Labels treated as silence.
    sil_phones = ("sil", "sp", "spn")

    phones = []
    durations = []
    start_time = 0
    end_time = 0
    # Number of collected entries up to and including the last non-silent one.
    end_idx = 0

    for interval in tier._objects:
        s, e, p = interval.start_time, interval.end_time, interval.text
        is_silence = p in sil_phones

        # Trim leading silences: nothing is collected until the first
        # ordinary phone, whose start becomes the utterance start.
        if not phones:
            if is_silence:
                continue
            start_time = s

        phones.append(p)
        if not is_silence:
            # Only ordinary phones move the end marker, so any silences
            # collected after the last one are cut off below.
            end_time = e
            end_idx = len(phones)

        # Round each boundary before subtracting so that consecutive
        # durations add up to the rounded total without drift.
        # NOTE(review): presumably times are in seconds, which makes these
        # hop-sized frame counts -- confirm against the TextGrid producer.
        durations.append(
            int(
                np.round(e * sample_rate / hop_size)
                - np.round(s * sample_rate / hop_size)
            )
        )

    # Trim trailing silences.
    return phones[:end_idx], durations[:end_idx], start_time, end_time
|
|
|
|
|
def get_duration(utt, wav, cfg):
    """Load an utterance's TextGrid and return its phone durations and span.

    Only the TextGrid file is read, from
    ``<cfg.processed_dir>/<Dataset>/TextGrid/<Singer>/<Uid>.TextGrid``.

    Args:
        utt: Mapping with the keys ``"Singer"``, ``"Uid"`` and ``"Dataset"``.
        wav: Unused; kept so the signature stays compatible with callers.
        cfg: Config object providing ``cfg["sample_rate"]`` (and whatever
            ``get_alignment`` reads) plus the attribute ``cfg.processed_dir``.

    Returns:
        ``None`` when the alignment start is not strictly before its end
        (for example when the tier holds no non-silent phone). Otherwise a
        tuple ``(duration, text, start, end)`` where ``duration`` is the list
        of per-phone durations, ``text`` is the phones joined by spaces and
        wrapped in braces, and ``start`` / ``end`` are the alignment start and
        end times multiplied by the sample rate and truncated to ``int``.
    """
    speaker = utt["Singer"]
    basename = utt["Uid"]
    dataset = utt["Dataset"]
    sample_rate = cfg["sample_rate"]

    tg_path = os.path.join(
        cfg.processed_dir, dataset, "TextGrid", speaker, "{}.TextGrid".format(basename)
    )

    # Get alignments from the "phones" tier.
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name("phones"), cfg
    )

    # An empty or inverted span means there is nothing usable to return.
    if start >= end:
        return None

    text = "{" + " ".join(phone) + "}"
    return duration, text, int(sample_rate * start), int(sample_rate * end)
|
|