# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os

import numpy as np
import tgt


def get_alignment(tier, cfg):
    """Extract phones and per-phone frame durations from an alignment tier.

    Leading and trailing silence phones ("sil", "sp", "spn") are dropped;
    silence phones that occur between ordinary phones are kept.

    Args:
        tier: Object whose ``_objects`` attribute is an iterable of intervals,
            each exposing ``start_time``, ``end_time`` and ``text``.
        cfg: Mapping providing ``"sample_rate"`` and ``"hop_size"``.

    Returns:
        A tuple ``(phones, durations, start_time, end_time)``:
        ``phones`` is the list of kept phone labels, ``durations`` the
        matching list of integer frame counts, ``start_time`` the start of
        the first non-silent phone and ``end_time`` the end of the last
        non-silent phone. Both times stay ``0`` when the tier contains no
        non-silent phone.
    """
    sample_rate = cfg["sample_rate"]
    hop_size = cfg["hop_size"]

    sil_phones = ("sil", "sp", "spn")

    phones = []
    durations = []
    start_time = 0
    end_time = 0
    # Number of entries to keep; everything after the last non-silent phone
    # is trailing silence and gets cut off below.
    end_idx = 0

    for interval in tier._objects:
        s, e, p = interval.start_time, interval.end_time, interval.text
        is_silence = p in sil_phones

        # Trim leading silences: nothing has been kept yet.
        if not phones:
            if is_silence:
                continue
            start_time = s

        phones.append(p)
        if not is_silence:
            # Remember where the last ordinary phone ends.
            end_time = e
            end_idx = len(phones)

        # Round both boundaries to frame indices *before* subtracting so that
        # consecutive intervals tile the frame axis without gaps or overlaps.
        # NOTE(review): assumes interval times are in seconds -- confirm.
        durations.append(
            int(
                np.round(e * sample_rate / hop_size)
                - np.round(s * sample_rate / hop_size)
            )
        )

    # Trim trailing silences
    phones = phones[:end_idx]
    durations = durations[:end_idx]

    return phones, durations, start_time, end_time


def get_duration(utt, wav, cfg):
    """Load an utterance's TextGrid and return its phone durations.

    The TextGrid is read from
    ``<cfg.processed_dir>/<Dataset>/TextGrid/<Singer>/<Uid>.TextGrid`` and its
    ``"phones"`` tier is passed to :func:`get_alignment`.

    Args:
        utt: Mapping with the keys ``"Singer"``, ``"Uid"`` and ``"Dataset"``.
        wav: Unused; kept so existing callers keep working.
        cfg: Config object supporting both ``cfg["sample_rate"]`` /
            ``cfg["hop_size"]`` item access and the ``cfg.processed_dir``
            attribute.

    Returns:
        ``None`` if the trimmed alignment is empty (``start >= end``).
        Otherwise a tuple ``(duration, text, start, end)`` where ``duration``
        is the list of per-phone frame counts, ``text`` is the phones joined
        by spaces and wrapped in braces, and ``start`` / ``end`` are the
        trimmed boundaries multiplied by the sample rate and truncated to
        ``int`` (sample indices, assuming the TextGrid times are in seconds).
    """
    speaker = utt["Singer"]
    basename = utt["Uid"]
    dataset = utt["Dataset"]
    sample_rate = cfg["sample_rate"]

    tg_path = os.path.join(
        cfg.processed_dir, dataset, "TextGrid", speaker, "{}.TextGrid".format(basename)
    )

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name("phones"), cfg
    )

    # Nothing but silence was found: there is no usable segment.
    if start >= end:
        return None

    text = "{" + " ".join(phone) + "}"
    return duration, text, int(sample_rate * start), int(sample_rate * end)