Spaces:
Build error
Build error
File size: 4,734 Bytes
4817bcc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
from encoder.params_model import model_embedding_size as speaker_embedding_size
from utils.argutils import print_args
from utils.modelutils import check_model_paths
from synthesizer.inference import Synthesizer
from encoder import inference as encoder
from vocoder.wavernn import inference as rnn_vocoder
from vocoder.hifigan import inference as gan_vocoder
from pathlib import Path
import numpy as np
import soundfile as sf
import librosa
import argparse
import torch
import sys
import os
import re
import cn2an
import glob
from audioread.exceptions import NoBackendError
# Vocoder backend used by gen_one_wav() and generate_wav(): the module imported
# from vocoder.hifigan. The vocoder.wavernn module is imported above as
# rnn_vocoder but is not selected here.
vocoder = gan_vocoder
def gen_one_wav(synthesizer, in_fpath, embed, texts, file_name, seq):
    """Synthesize ``texts`` with one speaker embedding and save a single WAV.

    Each entry of ``texts`` is turned into a spectrogram conditioned on the
    same embedding. The spectrograms are vocoded in one pass, a 0.15 s pause
    is inserted after every utterance, and the result is passed through
    ``encoder.preprocess_wav``, peak-normalised to 0.97 and written to
    ``"<file_name>_<seq>_<basename(in_fpath)>.wav"``.

    Args:
        synthesizer: Loaded synthesizer; must provide
            ``synthesize_spectrograms``, ``hparams.hop_size`` and
            ``sample_rate``.
        in_fpath: Path of the reference voice recording. Only its base name
            is used here, as the last component of the output file name.
        embed: Speaker embedding applied to every text.
        texts: Text fragments to synthesize, in order.
        file_name: Prefix of the output file name.
        seq: Integer sequence number embedded in the output file name.
    """
    embeds = [embed] * len(texts)
    # If you know what the attention layer alignments are, you can retrieve
    # them here by passing return_alignments=True.
    specs = synthesizer.synthesize_spectrograms(
        texts, embeds, style_idx=-1, min_stop_token=4, steps=400
    )
    frames_per_spec = [spec.shape[1] for spec in specs]

    # Vocode all utterances as one long spectrogram: the longer the
    # spectrogram, the more time-efficient the vocoder.
    generated_wav, _ = vocoder.infer_waveform(np.concatenate(specs, axis=1))

    # Cut the waveform back into utterances (each spectrogram column is taken
    # to span hop_size samples) and follow every utterance with a pause.
    seg_ends = np.cumsum(np.array(frames_per_spec) * synthesizer.hparams.hop_size)
    seg_starts = np.concatenate(([0], seg_ends[:-1]))
    pause = np.zeros(int(0.15 * synthesizer.sample_rate))
    pieces = []
    for start, end in zip(seg_starts, seg_ends):
        pieces.append(generated_wav[start:end])
        pieces.append(pause)
    generated_wav = np.concatenate(pieces)

    # Trim excess silences to compensate for gaps in spectrograms (issue #53).
    generated_wav = encoder.preprocess_wav(generated_wav)

    # Peak-normalise, but leave an empty or all-zero signal untouched: dividing
    # by a zero peak would fill the output with NaNs, and max() of an empty
    # array raises.
    peak = np.abs(generated_wav).max() if np.size(generated_wav) else 0
    if peak > 0:
        generated_wav = generated_wav / peak * 0.97

    # Save it on the disk.
    speaker_tag = os.path.basename(in_fpath)
    filename = "%s_%d_%s.wav" % (file_name, seq, speaker_tag)
    sf.write(filename, generated_wav, synthesizer.sample_rate)
    print("\nSaved output as %s\n\n" % filename)
def _iter_text_batches(input_txt, punctuation, max_chars):
    """Yield lists of cleaned text fragments, one list per output file.

    ``input_txt`` is split into lines, and each line is split again on runs of
    the characters in ``punctuation``. Fragments are stripped, and fragments
    that are empty after stripping are dropped. A batch is emitted once its
    accumulated character count exceeds ``max_chars``; any remainder is
    emitted at the end.
    """
    splitter = re.compile(r'[{}]+'.format(punctuation))
    batch = []
    batch_chars = 0
    for line in input_txt.split("\n"):
        for fragment in splitter.split(line):
            # Strip before testing for emptiness so whitespace-only fragments
            # are not queued as empty strings.
            fragment = fragment.strip()
            if not fragment:
                continue
            batch.append(fragment)
            batch_chars += len(fragment)
        # NOTE(review): the size limit is checked once per input line, not per
        # fragment, so a single very long line is never split across files --
        # confirm this is the intended granularity.
        if batch_chars > max_chars:
            yield batch
            batch = []
            batch_chars = 0
    if batch:
        yield batch


def generate_wav(enc_model_fpath, syn_model_fpath, voc_model_fpath, in_fpath, input_txt, file_name):
    """Clone the voice in ``in_fpath`` and read ``input_txt`` aloud into WAV files.

    Loads the encoder, synthesizer and vocoder, computes a speaker embedding
    from the reference recording, splits the text into punctuation-delimited
    fragments and hands them to ``gen_one_wav`` in batches of roughly 1500
    characters. Each batch gets the next sequence number, starting at 1.

    Args:
        enc_model_fpath: Path of the speaker-encoder weights.
        syn_model_fpath: Path of the synthesizer weights.
        voc_model_fpath: Path of the vocoder weights.
        in_fpath: Path of the reference voice recording.
        input_txt: Full text to synthesize; may contain several lines.
        file_name: Prefix for the generated file names (see ``gen_one_wav``).
    """
    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
        ## Print some environment information (for debugging purposes)
        print("Found %d GPUs available. Using GPU %d (%s) of compute capability %d.%d with "
              "%.1fGb total memory.\n" %
              (torch.cuda.device_count(),
               device_id,
               gpu_properties.name,
               gpu_properties.major,
               gpu_properties.minor,
               gpu_properties.total_memory / 1e9))
    else:
        print("Using CPU for inference.\n")

    print("Preparing the encoder, the synthesizer and the vocoder...")
    encoder.load_model(enc_model_fpath)
    synthesizer = Synthesizer(syn_model_fpath)
    vocoder.load_model(voc_model_fpath)

    # Only the utterance-level embedding is needed; the other two values
    # returned with return_partials=True are discarded.
    encoder_wav = synthesizer.load_preprocess_wav(in_fpath)
    embed, _, _ = encoder.embed_utterance(encoder_wav, return_partials=True)

    each_num = 1500  # approximate number of characters per output file
    # NOTE(review): ',' appears twice in this set; full-width variants may have
    # been intended for some of these characters -- confirm against the
    # upstream file.
    punctuation = '!,。、,'  # punctuate and split/clean text

    batches = _iter_text_batches(input_txt, punctuation, each_num)
    for seq, processed_texts in enumerate(batches, start=1):
        gen_one_wav(synthesizer, in_fpath, embed, processed_texts, file_name, seq)
# Script entry point.
# Usage: <script> <text file> <reference voice recording>
#   argv[1] -- text file to read aloud; its path is also used as the prefix of
#              the generated WAV file names.
#   argv[2] -- reference voice recording, passed to generate_wav as in_fpath.
if len(sys.argv) >= 3:
    txt_file_name = sys.argv[1]
    wav_file_name = sys.argv[2]

    print("reading from :", txt_file_name)
    # Read the whole file in one call, with an explicit encoding so the text
    # does not depend on the platform's default locale encoding.
    with open(txt_file_name, "r", encoding="utf-8") as f:
        my_txt = f.read()

    # Rewrite Arabic numerals as Chinese numerals ("an2cn" mode) before the
    # text is handed to the synthesizer.
    output = cn2an.transform(my_txt, "an2cn")
    print(output)

    generate_wav(
        Path("encoder/saved_models/pretrained.pt"),
        Path("synthesizer/saved_models/mandarin.pt"),
        Path("vocoder/saved_models/pretrained/g_hifigan.pt"),
        wav_file_name,
        output,
        txt_file_name,
    )
else:
    print("please input the file name")
    # sys.exit rather than the site-injected exit(), which is not guaranteed
    # to exist (e.g. under "python -S").
    sys.exit(1)
|