# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os

import numpy as np
import tgt


def get_alignment(tier, cfg):
    """Extract phone labels and frame-level durations from a TextGrid phone tier,
    trimming leading and trailing silences."""
    sample_rate = cfg["sample_rate"]
    hop_size = cfg["hop_size"]

    # Silence-like labels produced by the forced aligner
    sil_phones = ["sil", "sp", "spn"]

    phones = []
    durations = []
    start_time = 0
    end_time = 0
    end_idx = 0

    # Walk the tier's intervals in order (start time, end time, phone label)
    for t in tier._objects:
        s, e, p = t.start_time, t.end_time, t.text

        # Skip leading silences; the first non-silent phone sets start_time
        if not phones:
            if p in sil_phones:
                continue
            else:
                start_time = s

        if p not in sil_phones:
            # Ordinary phone: remember the latest non-silent end time and index
            phones.append(p)
            end_time = e
            end_idx = len(phones)
        else:
            # Silent phone: keep it for now; trailing silences are cut below
            phones.append(p)

        durations.append(
            int(
                np.round(e * sample_rate / hop_size)
                - np.round(s * sample_rate / hop_size)
            )
        )

    # Trim trailing silences
    phones = phones[:end_idx]
    durations = durations[:end_idx]

    return phones, durations, start_time, end_time


def get_duration(utt, wav, cfg):
    """Load the TextGrid alignment for one utterance and return its frame-level
    durations, brace-wrapped phone string, and trimmed start/end in samples.

    Returns None if the alignment contains no non-silent phones. ``wav`` is
    currently unused.
    """
    speaker = utt["Singer"]
    basename = utt["Uid"]
    dataset = utt["Dataset"]
    sample_rate = cfg["sample_rate"]

    # Resolve the raw wav, transcript (.lab), and TextGrid paths for this utterance
    wav_path = os.path.join(
        cfg.processed_dir, dataset, "raw_data", speaker, "{}.wav".format(basename)
    )
    text_path = os.path.join(
        cfg.processed_dir, dataset, "raw_data", speaker, "{}.lab".format(basename)
    )
    tg_path = os.path.join(
        cfg.processed_dir, dataset, "TextGrid", speaker, "{}.TextGrid".format(basename)
    )

    # Read the raw transcript (read here but not used below)
    with open(text_path, "r") as f:
        raw_text = f.readline().strip("\n")

    # Parse the TextGrid and align its "phones" tier
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name("phones"), cfg
    )
    text = "{" + " ".join(phone) + "}"
    # start >= end means no non-silent phones were found
    if start >= end:
        return None

    # Frame durations, phone string, and trimmed boundaries converted to samples
    return duration, text, int(sample_rate * start), int(sample_rate * end)
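

# Minimal usage sketch, assuming a hypothetical layout under cfg.processed_dir:
# the dataset/singer/uid values and config numbers below are illustrative only,
# and the .wav/.lab/TextGrid files are expected to already exist on disk.
if __name__ == "__main__":
    from types import SimpleNamespace

    class _DemoCfg(SimpleNamespace):
        # get_duration mixes attribute access (cfg.processed_dir) with item
        # access (cfg["sample_rate"]), so this demo config supports both.
        def __getitem__(self, key):
            return getattr(self, key)

    demo_cfg = _DemoCfg(
        processed_dir="data/processed", sample_rate=22050, hop_size=256
    )
    demo_utt = {"Dataset": "demo_dataset", "Singer": "demo_singer", "Uid": "demo_0001"}

    result = get_duration(demo_utt, wav=None, cfg=demo_cfg)
    if result is not None:
        duration, text, start_sample, end_sample = result
        print(len(duration), "phones:", text, start_sample, end_sample)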