File size: 4,701 Bytes
df2accb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import numpy as np
import torch
import torchaudio


def save_feature(process_dir, feature_dir, item, feature, overrides=True):
    """Persist one feature array as ``<item>.npy``.

    Args:
        process_dir (str): root directory for processed features
        feature_dir (str): sub-directory for this feature type (mel, energy, ...)
        item (str): uid used as the output file stem
        feature (tensor): feature data to store
        overrides (bool, optional): whether to override existing files. Defaults to True.
    """
    target_dir = os.path.join(process_dir, feature_dir)
    os.makedirs(target_dir, exist_ok=True)
    out_path = os.path.join(target_dir, item + ".npy")

    # Write when the file is absent, or when overriding is permitted.
    if overrides or not os.path.exists(out_path):
        np.save(out_path, feature)


def save_txt(process_dir, feature_dir, item, feature, overrides=True):
    """Persist text features as ``<item>.txt``.

    Args:
        process_dir (str): root directory for processed features
        feature_dir (str): sub-directory for this feature type
        item (str): uid used as the output file stem
        feature: string or iterable of strings, written via ``writelines``
            (no newlines are added)
        overrides (bool, optional): whether to override existing files. Defaults to True.
    """
    process_dir = os.path.join(process_dir, feature_dir)
    os.makedirs(process_dir, exist_ok=True)
    out_path = os.path.join(process_dir, item + ".txt")

    # Skip only when the file exists and overriding is disabled.
    if overrides or not os.path.exists(out_path):
        # Context manager closes the handle even if writelines raises,
        # fixing the leaked file handle of the original open()/close() pair.
        with open(out_path, "w") as f:
            f.writelines(feature)


def save_audio(path, waveform, fs, add_silence=False, turn_up=False, volume_peak=0.9):
    """Write a waveform to ``path`` as 16-bit PCM mono audio.

    Args:
        path: output audio file path.
        waveform: audio samples; numpy array expected when ``turn_up`` or
            ``add_silence`` is used (uses .max()/.dtype and np.concatenate).
        fs: sample rate in Hz.
        add_silence: if True, pad 0.05 s of silence at both ends.
        turn_up: if True, rescale so the absolute peak reaches ``volume_peak``.
        volume_peak: target peak amplitude used by ``turn_up``.
    """
    if turn_up:
        # Scale so the loudest sample lands exactly on volume_peak.
        peak = max(waveform.max(), abs(waveform.min()))
        waveform = waveform * (volume_peak / peak)

    if add_silence:
        # 0.05 s of zeros prepended and appended.
        pad = np.zeros((fs // 20,), dtype=waveform.dtype)
        waveform = np.concatenate([pad, waveform, pad])

    waveform = torch.as_tensor(waveform, dtype=torch.float32, device="cpu")
    if waveform.dim() == 1:
        # torchaudio expects a leading channel dimension.
        waveform = waveform.unsqueeze(0)
    elif waveform.size(0) != 1:
        # Down-mix multi-channel audio to mono.
        waveform = waveform.mean(dim=0, keepdim=True)
    torchaudio.save(path, waveform, fs, encoding="PCM_S", bits_per_sample=16)


async def async_load_audio(path, sample_rate: int = 24000):
    r"""
    Args:
        path: The source loading path.
        sample_rate: The target sample rate, will automatically resample if necessary.

    Returns:
        waveform: The waveform object. Should be [1 x sequence_len].

    Raises:
        ValueError: if the loaded waveform contains NaN or Inf values.
    """

    async def use_torchaudio_load(path):
        return torchaudio.load(path)

    waveform, sr = await use_torchaudio_load(path)
    # Down-mix to mono, keeping the channel dimension.
    waveform = torch.mean(waveform, dim=0, keepdim=True)

    if sr != sample_rate:
        waveform = torchaudio.functional.resample(waveform, sr, sample_rate)

    # BUG FIX: Python `or` on a multi-element tensor raises
    # "Boolean value of Tensor with more than one element is ambiguous";
    # combine the masks element-wise with `|` instead.
    if torch.any(torch.isnan(waveform) | torch.isinf(waveform)):
        raise ValueError("NaN or Inf found in waveform.")
    return waveform


async def async_save_audio(
    path,
    waveform,
    sample_rate: int = 24000,
    add_silence: bool = False,
    volume_peak: float = 0.9,
):
    r"""
    Args:
        path: The target saving path.
        waveform: The waveform object. Should be [n_channel x sequence_len].
        sample_rate: Sample rate.
        add_silence: If ``true``, concat 0.05s silence to beginning and end.
        volume_peak: Turn up volume for larger number, vice versa.
    """

    async def use_torchaudio_save(path, waveform, sample_rate):
        torchaudio.save(
            path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16
        )

    waveform = torch.as_tensor(waveform, device="cpu", dtype=torch.float32)
    shape = waveform.size()[:-1]

    # NOTE(review): an all-zero waveform gives peak 0 and ratio inf;
    # confirm callers never pass pure silence.
    ratio = abs(volume_peak) / max(waveform.max(), abs(waveform.min()))
    waveform = waveform * ratio

    if add_silence:
        silence_len = sample_rate // 20
        # BUG FIX: Tensor.type() returns a type *string* (e.g.
        # "torch.FloatTensor"), which torch.zeros rejects as a dtype and
        # raises TypeError; the .dtype attribute is the correct value.
        silence = torch.zeros((*shape, silence_len), dtype=waveform.dtype)
        # torch.cat is the long-standing name; `concatenate` is only a
        # recent alias, so `cat` keeps older torch versions working.
        waveform = torch.cat((silence, waveform, silence), dim=-1)

    if waveform.dim() == 1:
        waveform = waveform[None]

    await use_torchaudio_save(path, waveform, sample_rate)


def load_mel_extrema(cfg, dataset_name, split):
    """Load the precomputed mel min/max statistics for one dataset split.

    Args:
        cfg: config object exposing ``OUTPUT_PATH`` and ``data.process_version``.
        dataset_name (str): dataset folder under the preprocess root.
        split (str): split identifier; only the suffix after the last ``_``
            selects the statistics folder.

    Returns:
        tuple: ``(mel_min, mel_max)`` numpy arrays loaded from disk.
    """
    split_tag = split.split("_")[-1]
    stats_dir = os.path.join(
        cfg.OUTPUT_PATH,
        "preprocess/{}_version".format(cfg.data.process_version),
        dataset_name,
        "mel_min_max",
        split_tag,
    )
    mel_min = np.load(os.path.join(stats_dir, "mel_min.npy"))
    mel_max = np.load(os.path.join(stats_dir, "mel_max.npy"))
    return mel_min, mel_max