File size: 5,928 Bytes
b725c5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os
import numpy as np
import torch
import torchaudio


def save_feature(process_dir, feature_dir, item, feature, overrides=True):
    """Persist one feature array as ``<process_dir>/<feature_dir>/<item>.npy``.

    Args:
        process_dir (str): root directory holding processed features
        feature_dir (str): sub-directory for one feature type (mel, energy, ...)
        item (str): uid used as the file stem
        feature (tensor): feature data to store
        overrides (bool, optional): whether to overwrite an existing file. Defaults to True.
    """
    target_dir = os.path.join(process_dir, feature_dir)
    os.makedirs(target_dir, exist_ok=True)
    target_path = os.path.join(target_dir, item + ".npy")

    # Write unless the file already exists and overriding is disabled.
    if overrides or not os.path.exists(target_path):
        np.save(target_path, feature)


def save_txt(process_dir, feature_dir, item, feature, overrides=True):
    """Persist text lines as ``<process_dir>/<feature_dir>/<item>.txt``.

    Args:
        process_dir (str): root directory holding processed features
        feature_dir (str): sub-directory for one feature type
        item (str): uid used as the file stem
        feature (str or iterable of str): lines to write (``writelines`` adds no newlines)
        overrides (bool, optional): whether to overwrite an existing file. Defaults to True.
    """
    process_dir = os.path.join(process_dir, feature_dir)
    os.makedirs(process_dir, exist_ok=True)
    out_path = os.path.join(process_dir, item + ".txt")

    # Skip when the file exists and overriding is disabled.
    if not overrides and os.path.exists(out_path):
        return

    # Context manager closes the handle even if writelines raises
    # (the original left the file open on failure).
    with open(out_path, "w") as f:
        f.writelines(feature)


def save_audio(path, waveform, fs, add_silence=False, turn_up=False, volume_peak=0.9):
    """Save audio to ``path`` as 16-bit PCM, with optional volume boost and padding.

    Args:
        path (str): path to save audio
        waveform (numpy array): waveform to save
        fs (int): sampling rate
        add_silence (bool, optional): pad silence at beginning and end. Defaults to False.
        turn_up (bool, optional): rescale so the absolute peak reaches volume_peak. Defaults to False.
        volume_peak (float, optional): target peak amplitude. Defaults to 0.9.
    """
    if turn_up:
        # Scale so the loudest sample (positive or negative) hits volume_peak.
        peak = max(waveform.max(), abs(waveform.min()))
        waveform = waveform * (volume_peak / peak)

    if add_silence:
        # fs // 20 samples == 0.05 s of zeros on each side.
        pad = np.zeros((fs // 20,), dtype=waveform.dtype)
        waveform = np.concatenate([pad, waveform, pad])

    tensor = torch.as_tensor(waveform, dtype=torch.float32, device="cpu")
    if tensor.dim() == 1:
        # Mono vector -> add a leading channel dimension.
        tensor = tensor.unsqueeze(0)
    elif tensor.size(0) != 1:
        # Downmix multi-channel audio to mono by averaging channels.
        tensor = torch.mean(tensor, dim=0, keepdim=True)
    torchaudio.save(path, tensor, fs, encoding="PCM_S", bits_per_sample=16)


def save_torch_audio(process_dir, feature_dir, item, wav_torch, fs, overrides=True):
    """Save a torch waveform as ``<process_dir>/<feature_dir>/<item>.wav`` without processing.

    Args:
        process_dir (str): directory to store features
        feature_dir (_type_): directory to store one type of features (mel, energy, ...)
        item (str): uid used as the file stem
        wav_torch (tensor): waveform, 1-D [time] or 2-D [channel, time]
        fs (int): sampling rate
        overrides (bool, optional): whether to override existing files. Defaults to True.
            NOTE(review): currently unused; kept for interface compatibility.
    """
    # Bug fix: the original tested ``wav_torch.shape != 2`` -- comparing a
    # torch.Size to an int is always True, so even 2-D [channel, time] tensors
    # got unsqueezed to 3-D. Only promote 1-D inputs to [1, time].
    if wav_torch.dim() == 1:
        wav_torch = wav_torch.unsqueeze(0)

    process_dir = os.path.join(process_dir, feature_dir)
    os.makedirs(process_dir, exist_ok=True)
    out_path = os.path.join(process_dir, item + ".wav")

    torchaudio.save(out_path, wav_torch, fs)


async def async_load_audio(path, sample_rate: int = 24000):
    r"""Load audio asynchronously, downmix to mono, and resample if necessary.

    Args:
        path: The source loading path.
        sample_rate: The target sample rate, will automatically resample if necessary.

    Returns:
        waveform: The waveform object. Should be [1 x sequence_len].

    Raises:
        ValueError: if the loaded waveform contains NaN or Inf values.
    """

    async def use_torchaudio_load(path):
        return torchaudio.load(path)

    waveform, sr = await use_torchaudio_load(path)
    # Downmix multi-channel audio to a single channel.
    waveform = torch.mean(waveform, dim=0, keepdim=True)

    if sr != sample_rate:
        waveform = torchaudio.functional.resample(waveform, sr, sample_rate)

    # Bug fix: Python's ``or`` on multi-element tensors raises
    # "Boolean value of Tensor with more than one element is ambiguous".
    # Use the element-wise ``|`` operator instead.
    if torch.any(torch.isnan(waveform) | torch.isinf(waveform)):
        raise ValueError("NaN or Inf found in waveform.")
    return waveform


async def async_save_audio(
    path,
    waveform,
    sample_rate: int = 24000,
    add_silence: bool = False,
    volume_peak: float = 0.9,
):
    r"""Normalize volume, optionally pad silence, and save audio as 16-bit PCM.

    Args:
        path: The target saving path.
        waveform: The waveform object. Should be [n_channel x sequence_len].
        sample_rate: Sample rate.
        add_silence: If ``true``, concat 0.05s silence to beginning and end.
        volume_peak: Turn up volume for larger number, vice versa.
    """

    async def use_torchaudio_save(path, waveform, sample_rate):
        torchaudio.save(
            path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16
        )

    waveform = torch.as_tensor(waveform, device="cpu", dtype=torch.float32)
    shape = waveform.size()[:-1]

    # Rescale so the absolute peak equals |volume_peak|.
    ratio = abs(volume_peak) / max(waveform.max(), abs(waveform.min()))
    waveform = waveform * ratio

    if add_silence:
        silence_len = sample_rate // 20  # 0.05 s of zeros on each side
        # Bug fix: ``waveform.type()`` returns a string (e.g. "torch.FloatTensor"),
        # which is not a valid ``dtype`` argument and raised a TypeError here.
        # Use ``waveform.dtype`` instead.
        silence = torch.zeros((*shape, silence_len), dtype=waveform.dtype)
        waveform = torch.cat((silence, waveform, silence), dim=-1)

    if waveform.dim() == 1:
        waveform = waveform[None]

    await use_torchaudio_save(path, waveform, sample_rate)


def load_mel_extrema(cfg, dataset_name, split):
    """Load the precomputed mel min/max statistics for one dataset split.

    Args:
        cfg: config object providing ``OUTPUT_PATH`` and ``data.process_version``
        dataset_name (str): dataset sub-directory name
        split (str): split identifier; only the part after the last "_" is used

    Returns:
        tuple: (mel_min, mel_max) arrays loaded from ``mel_min_max``
    """
    split_tag = split.split("_")[-1]
    stats_dir = os.path.join(
        cfg.OUTPUT_PATH,
        "preprocess/{}_version".format(cfg.data.process_version),
        dataset_name,
        "mel_min_max",
        split_tag,
    )
    mel_min = np.load(os.path.join(stats_dir, "mel_min.npy"))
    mel_max = np.load(os.path.join(stats_dir, "mel_max.npy"))
    return mel_min, mel_max