# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import numpy as np
import torch
import torchaudio
def save_feature(process_dir, feature_dir, item, feature, overrides=True):
    """Save one feature array to <process_dir>/<feature_dir>/<item>.npy.

    Args:
        process_dir (str): root directory for processed features
        feature_dir (str): subdirectory for one type of feature (mel, energy, ...)
        item (str): uid used as the output file stem
        feature (array-like): feature data, passed directly to np.save
        overrides (bool, optional): whether to override existing files. Defaults to True.
    """
    process_dir = os.path.join(process_dir, feature_dir)
    os.makedirs(process_dir, exist_ok=True)
    out_path = os.path.join(process_dir, item + ".npy")
    # Write when the file is absent, or when overwriting is allowed.
    # (Collapses the original two branches that both called np.save.)
    if overrides or not os.path.exists(out_path):
        np.save(out_path, feature)
def save_txt(process_dir, feature_dir, item, feature, overrides=True):
    """Save text lines to <process_dir>/<feature_dir>/<item>.txt.

    Args:
        process_dir (str): root directory for processed features
        feature_dir (str): subdirectory for one type of feature
        item (str): uid used as the output file stem
        feature (iterable of str): lines passed to file.writelines
            (no newlines are added; include them in the strings)
        overrides (bool, optional): whether to override existing files. Defaults to True.
    """
    process_dir = os.path.join(process_dir, feature_dir)
    os.makedirs(process_dir, exist_ok=True)
    out_path = os.path.join(process_dir, item + ".txt")
    # Write when the file is absent, or when overwriting is allowed.
    if overrides or not os.path.exists(out_path):
        # Context manager guarantees the handle is closed even if writelines raises;
        # explicit UTF-8 avoids platform-dependent default encodings.
        with open(out_path, "w", encoding="utf-8") as f:
            f.writelines(feature)
def save_audio(path, waveform, fs, add_silence=False, turn_up=False, volume_peak=0.9):
    """Save a waveform to ``path`` as 16-bit signed PCM via torchaudio.

    Args:
        path (str): output file path
        waveform (np.ndarray): samples; 1-D mono or (channels, samples)
        fs (int): sample rate in Hz
        add_silence (bool): if True, pad 0.05s of silence at both ends
        turn_up (bool): if True, rescale so the absolute peak equals volume_peak
        volume_peak (float): target peak amplitude used when turn_up is True
    """
    if turn_up:
        # Normalize so the loudest sample sits at volume_peak.
        peak = max(waveform.max(), abs(waveform.min()))
        # Guard: an all-zero waveform would otherwise divide by zero.
        if peak > 0:
            waveform = waveform * (volume_peak / peak)
    if add_silence:
        silence_len = fs // 20  # 0.05 s at each end
        silence = np.zeros((silence_len,), dtype=waveform.dtype)
        waveform = np.concatenate([silence, waveform, silence])

    waveform = torch.as_tensor(waveform, dtype=torch.float32, device="cpu")
    if len(waveform.size()) == 1:
        # torchaudio.save expects (channels, samples).
        waveform = waveform[None, :]
    elif waveform.size(0) != 1:
        # Downmix stereo/multi-channel to mono by averaging channels.
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    torchaudio.save(path, waveform, fs, encoding="PCM_S", bits_per_sample=16)
async def async_load_audio(path, sample_rate: int = 24000):
    r"""
    Args:
        path: The source loading path.
        sample_rate: The target sample rate, will automatically resample if necessary.

    Returns:
        waveform: The waveform object. Should be [1 x sequence_len].

    Raises:
        ValueError: if the loaded/resampled waveform contains NaN or Inf.
    """

    async def use_torchaudio_load(path):
        return torchaudio.load(path)

    waveform, sr = await use_torchaudio_load(path)
    # Downmix to mono: [n_channel x T] -> [1 x T].
    waveform = torch.mean(waveform, dim=0, keepdim=True)
    if sr != sample_rate:
        waveform = torchaudio.functional.resample(waveform, sr, sample_rate)
    # BUG FIX: Python `or` on multi-element tensors raises
    # "Boolean value of Tensor ... is ambiguous"; reduce each check
    # with torch.any() before combining.
    if torch.any(torch.isnan(waveform)) or torch.any(torch.isinf(waveform)):
        raise ValueError("NaN or Inf found in waveform.")
    return waveform
async def async_save_audio(
    path,
    waveform,
    sample_rate: int = 24000,
    add_silence: bool = False,
    volume_peak: float = 0.9,
):
    r"""
    Args:
        path: The target saving path.
        waveform: The waveform object. Should be [n_channel x sequence_len].
        sample_rate: Sample rate.
        add_silence: If ``true``, concat 0.05s silence to beginning and end.
        volume_peak: Turn up volume for larger number, vice versa.
    """

    async def use_torchaudio_save(path, waveform, sample_rate):
        torchaudio.save(
            path, waveform, sample_rate, encoding="PCM_S", bits_per_sample=16
        )

    waveform = torch.as_tensor(waveform, device="cpu", dtype=torch.float32)
    shape = waveform.size()[:-1]

    # Peak-normalize to |volume_peak|; guard against an all-zero waveform,
    # which would otherwise divide by zero.
    peak = max(waveform.max(), abs(waveform.min()))
    if peak > 0:
        waveform = waveform * (abs(volume_peak) / peak)

    if add_silence:
        silence_len = sample_rate // 20  # 0.05 s at each end
        # BUG FIX: Tensor.type() returns a type *string* ('torch.FloatTensor'),
        # which torch.zeros rejects; the dtype attribute is required here.
        silence = torch.zeros((*shape, silence_len), dtype=waveform.dtype)
        waveform = torch.cat((silence, waveform, silence), dim=-1)

    if waveform.dim() == 1:
        # torchaudio.save expects (channels, samples).
        waveform = waveform[None]

    await use_torchaudio_save(path, waveform, sample_rate)
def load_mel_extrema(cfg, dataset_name, split):
    """Load the precomputed per-dataset mel min/max statistics.

    Args:
        cfg: config object providing ``OUTPUT_PATH`` and ``data.process_version``
        dataset_name (str): name of the dataset subdirectory
        split (str): split identifier; only the part after the last "_" is used

    Returns:
        tuple: (mel_min, mel_max) arrays loaded from "mel_min.npy"/"mel_max.npy".
    """
    split_tag = split.split("_")[-1]
    stats_dir = os.path.join(
        cfg.OUTPUT_PATH,
        "preprocess/{}_version".format(cfg.data.process_version),
        dataset_name,
        "mel_min_max",
        split_tag,
    )
    mel_min = np.load(os.path.join(stats_dir, "mel_min.npy"))
    mel_max = np.load(os.path.join(stats_dir, "mel_max.npy"))
    return mel_min, mel_max