# Style-Bert-VITS2-SU / spec_gen.py
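"""Pre-compute and cache spectrograms for the audio files in a filelist.

For each wav path listed in filelists/train.list, this script computes either a
linear spectrogram (cached as <name>.spec.pt) or, when use_mel_spec_posterior is
enabled, a mel spectrogram (cached as <name>.mel.pt) and saves it next to the
audio file so it can be reloaded later instead of being recomputed.
"""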
import torch
from tqdm import tqdm
from multiprocessing import Pool
from mel_processing import spectrogram_torch, mel_spectrogram_torch
from utils import load_wav_to_torch

class AudioProcessor:
    def __init__(
        self,
        max_wav_value,
        use_mel_spec_posterior,
        filter_length,
        n_mel_channels,
        sampling_rate,
        hop_length,
        win_length,
        mel_fmin,
        mel_fmax,
    ):
        self.max_wav_value = max_wav_value
        self.use_mel_spec_posterior = use_mel_spec_posterior
        self.filter_length = filter_length
        self.n_mel_channels = n_mel_channels
        self.sampling_rate = sampling_rate
        self.hop_length = hop_length
        self.win_length = win_length
        self.mel_fmin = mel_fmin
        self.mel_fmax = mel_fmax

    def process_audio(self, filename):
        audio, sampling_rate = load_wav_to_torch(filename)
        audio_norm = audio / self.max_wav_value
        audio_norm = audio_norm.unsqueeze(0)
        spec_filename = filename.replace(".wav", ".spec.pt")
        if self.use_mel_spec_posterior:
            spec_filename = spec_filename.replace(".spec.pt", ".mel.pt")
        try:
            # Reuse the cached spectrogram if it has already been computed.
            spec = torch.load(spec_filename)
        except Exception:
            if self.use_mel_spec_posterior:
                spec = mel_spectrogram_torch(
                    audio_norm,
                    self.filter_length,
                    self.n_mel_channels,
                    self.sampling_rate,
                    self.hop_length,
                    self.win_length,
                    self.mel_fmin,
                    self.mel_fmax,
                    center=False,
                )
            else:
                spec = spectrogram_torch(
                    audio_norm,
                    self.filter_length,
                    self.sampling_rate,
                    self.hop_length,
                    self.win_length,
                    center=False,
                )
            spec = torch.squeeze(spec, 0)
            torch.save(spec, spec_filename)
        return spec, audio_norm

# Usage example: pre-compute spectrograms for every entry in the training list.
processor = AudioProcessor(
    max_wav_value=32768.0,
    use_mel_spec_posterior=False,
    filter_length=2048,
    n_mel_channels=128,
    sampling_rate=44100,
    hop_length=512,
    win_length=2048,
    mel_fmin=0.0,
    mel_fmax=None,  # "null" in the JSON config corresponds to None in Python
)

if __name__ == "__main__":
    with open("filelists/train.list", "r", encoding="utf-8") as f:
        # The first "|"-separated field of each line is the audio path.
        filepaths = [line.split("|")[0] for line in f]

    # Process the files in parallel with a pool of 32 worker processes.
    with Pool(processes=32) as pool:
        with tqdm(total=len(filepaths)) as pbar:
            for _ in pool.imap_unordered(processor.process_audio, filepaths):
                pbar.update()