import argparse
import logging
import multiprocessing
import os
import random
from concurrent.futures import ProcessPoolExecutor
from glob import glob
from random import shuffle

import librosa
import numpy as np
import torch
from tqdm import tqdm

import diffusion.logger.utils as du
import utils
from diffusion.vocoder import Vocoder
from modules.mel_processing import spectrogram_torch

logging.getLogger("numba").setLevel(logging.WARNING)
logging.getLogger("matplotlib").setLevel(logging.WARNING)

hps = utils.get_hparams_from_file("configs/config.json")
dconfig = du.load_config("configs/diffusion.yaml")
sampling_rate = hps.data.sampling_rate
hop_length = hps.data.hop_length
speech_encoder = hps["model"]["speech_encoder"]


def process_one(filename, hmodel, f0p, diff=False, mel_extractor=None):
    wav, sr = librosa.load(filename, sr=sampling_rate)
    audio_norm = torch.FloatTensor(wav)
    audio_norm = audio_norm.unsqueeze(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Content features: the speech encoder expects 16 kHz audio.
    soft_path = filename + ".soft.pt"
    if not os.path.exists(soft_path):
        wav16k = librosa.resample(wav, orig_sr=sampling_rate, target_sr=16000)
        wav16k = torch.from_numpy(wav16k).to(device)
        c = hmodel.encoder(wav16k)
        torch.save(c.cpu(), soft_path)

    # F0 curve plus voiced/unvoiced mask, saved together as an object array.
    f0_path = filename + ".f0.npy"
    if not os.path.exists(f0_path):
        f0_predictor = utils.get_f0_predictor(
            f0p,
            sampling_rate=sampling_rate,
            hop_length=hop_length,
            device=None,
            threshold=0.05,
        )
        f0, uv = f0_predictor.compute_f0_uv(wav)
        np.save(f0_path, np.asanyarray((f0, uv), dtype=object))

    # Linear spectrogram.
    spec_path = filename.replace(".wav", ".spec.pt")
    if not os.path.exists(spec_path):
        # This can't be replaced by torch.FloatTensor(wav), because
        # load_wav_to_torch returns a tensor that needs to be normalized.
        if sr != hps.data.sampling_rate:
            raise ValueError(
                "{} SR doesn't match target {} SR".format(sr, hps.data.sampling_rate)
            )
        # audio_norm = audio / hps.data.max_wav_value
        spec = spectrogram_torch(
            audio_norm,
            hps.data.filter_length,
            hps.data.sampling_rate,
            hps.data.hop_length,
            hps.data.win_length,
            center=False,
        )
        spec = torch.squeeze(spec, 0)
        torch.save(spec, spec_path)

    # Volume envelope, needed by the diffusion model and the volume embedding.
    if diff or hps.model.vol_embedding:
        volume_path = filename + ".vol.npy"
        volume_extractor = utils.Volume_Extractor(hop_length)
        if not os.path.exists(volume_path):
            volume = volume_extractor.extract(audio_norm)
            np.save(volume_path, volume.to("cpu").numpy())

    # Mel spectrograms (plain and randomly augmented) for diffusion training.
    if diff:
        mel_path = filename + ".mel.npy"
        if not os.path.exists(mel_path) and mel_extractor is not None:
            mel_t = mel_extractor.extract(audio_norm.to(device), sampling_rate)
            mel = mel_t.squeeze().to("cpu").numpy()
            np.save(mel_path, mel)

        aug_mel_path = filename + ".aug_mel.npy"
        aug_vol_path = filename + ".aug_vol.npy"
        # Random gain in the log10 domain, capped so the peak stays <= 1,
        # plus a random key shift for pitch augmentation.
        max_amp = float(torch.max(torch.abs(audio_norm))) + 1e-5
        max_shift = min(1, np.log10(1 / max_amp))
        log10_vol_shift = random.uniform(-1, max_shift)
        keyshift = random.uniform(-5, 5)
        if mel_extractor is not None:
            aug_mel_t = mel_extractor.extract(
                audio_norm * (10 ** log10_vol_shift), sampling_rate, keyshift=keyshift
            )
            aug_mel = aug_mel_t.squeeze().to("cpu").numpy()
            if not os.path.exists(aug_mel_path):
                np.save(aug_mel_path, np.asanyarray((aug_mel, keyshift), dtype=object))
        aug_vol = volume_extractor.extract(audio_norm * (10 ** log10_vol_shift))
        if not os.path.exists(aug_vol_path):
            np.save(aug_vol_path, aug_vol.to("cpu").numpy())


def process_batch(file_chunk, f0p, diff=False, mel_extractor=None):
    print("Loading speech encoder for content...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    hmodel = utils.get_speech_encoder(speech_encoder, device=device)
    print("Loaded speech encoder.")
    for filename in tqdm(file_chunk):
        process_one(filename, hmodel, f0p, diff, mel_extractor)


def parallel_process(filenames, num_processes, f0p, diff, mel_extractor):
    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        tasks = []
        # Split the file list into one contiguous chunk per worker.
        for i in range(num_processes):
            start = int(i * len(filenames) / num_processes)
            end = int((i + 1) * len(filenames) / num_processes)
            file_chunk = filenames[start:end]
            tasks.append(
                executor.submit(process_batch, file_chunk, f0p, diff, mel_extractor)
            )
        for task in tqdm(tasks):
            task.result()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--in_dir", type=str, default="dataset/44k", help="path to input dir"
    )
    parser.add_argument(
        "--use_diff", action="store_true", help="Whether to use the diffusion model"
    )
    parser.add_argument(
        "--f0_predictor",
        type=str,
        default="dio",
        help="Select the F0 predictor: crepe, pm, dio, harvest or rmvpe; default dio (note: crepe applies a mean filter to the raw F0)",
    )
    parser.add_argument(
        "--num_processes",
        type=int,
        default=1,
        help="Number of worker processes; setting it to the number of CPU cores is advised (0 uses all cores)",
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args = parser.parse_args()

    f0p = args.f0_predictor
    print(speech_encoder)
    print(f0p)
    print(args.use_diff)
    if args.use_diff:
        print("use_diff")
        print("Loading Mel Extractor...")
        mel_extractor = Vocoder(
            dconfig.vocoder.type, dconfig.vocoder.ckpt, device=device
        )
        print("Loaded Mel Extractor.")
    else:
        mel_extractor = None

    # Expects one speaker subdirectory level: <in_dir>/<speaker>/<clip>.wav
    filenames = glob(f"{args.in_dir}/*/*.wav", recursive=True)  # [:10]
    shuffle(filenames)
    multiprocessing.set_start_method("spawn", force=True)

    num_processes = args.num_processes
    if num_processes == 0:
        num_processes = os.cpu_count()

    parallel_process(filenames, num_processes, f0p, args.use_diff, mel_extractor)
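# Usage sketch: the flags below mirror the argparse options defined above; the
# script filename is an assumption, and the chosen values are illustrative only.
#   python preprocess_hubert_f0.py --in_dir dataset/44k --f0_predictor dio \
#       --use_diff --num_processes 4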