Spaces:

amphion
/

NaturalSpeech2

Running on T4

File size: 2,224 Bytes

b725c5a

# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import os
import tgt


def get_alignment(tier, cfg):
    """Turn a phone tier into phone labels and per-phone frame durations.

    Silence intervals ("sil", "sp", "spn") at the very beginning and at the
    very end of the tier are dropped; silences that sit between two ordinary
    phones are kept.

    Args:
        tier: Object whose ``_objects`` attribute is an iterable of intervals,
            each exposing ``start_time``, ``end_time`` and ``text``.
        cfg: Config supporting ``cfg["sample_rate"]`` and ``cfg["hop_size"]``.

    Returns:
        A tuple ``(phones, durations, start_time, end_time)`` where
        ``phones`` and ``durations`` are equally long lists (labels and
        frame counts), ``start_time`` is the onset of the first ordinary
        phone and ``end_time`` is the offset of the last ordinary phone.
        Both times stay ``0`` when the tier holds no ordinary phone.
    """
    sample_rate = cfg["sample_rate"]
    hop_size = cfg["hop_size"]

    silence_labels = ("sil", "sp", "spn")

    labels, frame_counts = [], []
    first_onset = 0
    last_offset = 0
    # Number of leading entries to keep; everything after the last ordinary
    # phone is trailing silence.
    keep = 0

    for interval in tier._objects:
        onset, offset, label = interval.start_time, interval.end_time, interval.text
        is_silence = label in silence_labels

        # Nothing collected yet: skip leading silences, otherwise remember
        # where the speech starts.
        if not labels:
            if is_silence:
                continue
            first_onset = onset

        labels.append(label)
        if not is_silence:
            last_offset = offset
            keep = len(labels)

        # Round both boundaries to frame indices before subtracting so that
        # consecutive intervals add up without drift.
        onset_frame = np.round(onset * sample_rate / hop_size)
        offset_frame = np.round(offset * sample_rate / hop_size)
        frame_counts.append(int(offset_frame - onset_frame))

    # Cut off trailing silences.
    return labels[:keep], frame_counts[:keep], first_onset, last_offset


def get_duration(utt, wav, cfg):
    """Read the TextGrid of one utterance and derive its phone durations.

    Args:
        utt: Utterance metadata providing the keys ``"Singer"``, ``"Uid"``
            and ``"Dataset"``.
        wav: Not used by this function; kept so existing callers keep working.
        cfg: Config exposing ``processed_dir`` as an attribute and
            ``"sample_rate"`` via subscript; it is also forwarded to
            ``get_alignment``.

    Returns:
        ``None`` when the alignment contains no usable span
        (``start >= end``). Otherwise a tuple
        ``(duration, text, start_sample, end_sample)`` where ``duration`` is
        the list of per-phone frame counts, ``text`` is the phone sequence
        formatted as ``"{p1 p2 ...}"`` and the two sample indices are the
        start/end times multiplied by the sample rate and truncated to int.

    Raises:
        OSError: If the ``.lab`` transcript cannot be opened.
    """
    speaker = utt["Singer"]
    basename = utt["Uid"]
    dataset = utt["Dataset"]
    sample_rate = cfg["sample_rate"]

    text_path = os.path.join(
        cfg.processed_dir, dataset, "raw_data", speaker, "{}.lab".format(basename)
    )
    tg_path = os.path.join(
        cfg.processed_dir, dataset, "TextGrid", speaker, "{}.TextGrid".format(basename)
    )

    # NOTE(review): the transcript line does not contribute to the returned
    # value. The read is kept so that a missing or unreadable .lab file still
    # raises here exactly as it did before.
    with open(text_path, "r") as f:
        f.readline()

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name("phones"), cfg
    )

    # Bail out before formatting anything when no non-silent span was found.
    if start >= end:
        return None

    text = "{" + " ".join(phone) + "}"
    return duration, text, int(sample_rate * start), int(sample_rate * end)