from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder.wavernn import inference as rnn_vocoder
from vocoder.hifigan import inference as gan_vocoder
from pathlib import Path
import numpy as np
import soundfile as sf
import torch
import sys
import os
import re
import cn2an

# Select the vocoder: HiFi-GAN by default; swap in rnn_vocoder to use WaveRNN instead.
vocoder = gan_vocoder


def gen_one_wav(synthesizer, in_fpath, embed, texts, file_name, seq):
    # Use the same speaker embedding for every text chunk.
    embeds = [embed] * len(texts)
    # If you want the attention layer alignments, you can retrieve them here by
    # passing return_alignments=True.
    specs = synthesizer.synthesize_spectrograms(texts, embeds, style_idx=-1,
                                                min_stop_token=4, steps=400)
    breaks = [spec.shape[1] for spec in specs]
    spec = np.concatenate(specs, axis=1)

    # Synthesize the waveform. Remember that the longer the spectrogram, the
    # more time-efficient the vocoder.
    generated_wav, output_sample_rate = vocoder.infer_waveform(spec)

    # Re-insert 150 ms of silence between the chunks concatenated above.
    b_ends = np.cumsum(np.array(breaks) * synthesizer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [generated_wav[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * synthesizer.sample_rate))] * len(breaks)
    generated_wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

    ## Post-generation
    # Trim excess silences to compensate for gaps in spectrograms (issue #53),
    # then peak-normalize to 0.97.
    generated_wav = encoder.preprocess_wav(generated_wav)
    generated_wav = generated_wav / np.abs(generated_wav).max() * 0.97

    # Save it on the disk. The reference audio's name (without extension) is
    # embedded in the output filename.
    ref_name = os.path.splitext(os.path.basename(in_fpath))[0]
    filename = "%s_%d_%s.wav" % (file_name, seq, ref_name)
    sf.write(filename, generated_wav, synthesizer.sample_rate)
    print("\nSaved output as %s\n\n" % filename)


def generate_wav(enc_model_fpath, syn_model_fpath, voc_model_fpath,
                 in_fpath, input_txt, file_name):
    ## Print some environment information (for debugging purposes)
    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
              "%.1fGB total memory.\n" %
              (torch.cuda.device_count(), device_id, gpu_properties.name,
               gpu_properties.major, gpu_properties.minor,
               gpu_properties.total_memory / 1e9))
    else:
        print("Using CPU for inference.\n")

    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(enc_model_fpath)
    synthesizer = Synthesizer(syn_model_fpath)
    vocoder.load_model(voc_model_fpath)

    # Compute the speaker embedding from the reference audio.
    encoder_wav = synthesizer.load_preprocess_wav(in_fpath)
    embed, partial_embeds, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    texts = input_txt.split("\n")
    seq = 0
    each_num = 1500  # maximum number of characters per generated wav file

    # Split the text on punctuation and batch the pieces into chunks of at most
    # each_num characters; each chunk becomes one output file.
    punctuation = '!,。、,'  # full- and half-width marks to split on
    processed_texts = []
    cur_num = 0
    for text in texts:
        for processed_text in re.sub(r'[{}]+'.format(punctuation), '\n', text).split('\n'):
            if processed_text:
                processed_texts.append(processed_text.strip())
                cur_num += len(processed_text.strip())
        if cur_num > each_num:
            seq = seq + 1
            gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)
            processed_texts = []
            cur_num = 0

    # Flush whatever is left over.
    if len(processed_texts) > 0:
        seq = seq + 1
        gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)


if len(sys.argv) >= 3:
    print("Reading from:", sys.argv[1])
    with open(sys.argv[1], "r") as f:
        my_txt = f.read()

    txt_file_name = sys.argv[1]
    wav_file_name = sys.argv[2]

    # Convert Arabic numerals to Chinese numerals so the synthesizer can read them.
    output = cn2an.transform(my_txt, "an2cn")
    print(output)
    generate_wav(
        Path("encoder/saved_models/pretrained.pt"),
        Path("synthesizer/saved_models/mandarin.pt"),
        Path("vocoder/saved_models/pretrained/g_hifigan.pt"),
        wav_file_name, output, txt_file_name
    )
else:
    print("Usage: python %s <text_file> <reference_wav>" % sys.argv[0])
    sys.exit(1)
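
# A minimal invocation sketch (assumptions: this file is saved as gen_voice.py in
# the repository root, and the three pretrained checkpoints referenced above have
# been downloaded into encoder/saved_models/, synthesizer/saved_models/ and
# vocoder/saved_models/pretrained/; the input file names are placeholders):
#
#   python gen_voice.py story.txt reference.wav
#
# story.txt supplies the text to synthesize and reference.wav the voice to clone;
# the script writes one "story.txt_<seq>_reference.wav" file per ~1500-character
# chunk of the (punctuation-split) input text.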