import argparse
import os
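# Hide all CUDA devices so PyTorch runs inference on the CPU.
# This must be set before torch is imported below.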
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import sys
sys.path.append('rtvc/')
from pathlib import Path
import time
import spacy
import matplotlib.pyplot as plt
import librosa
import numpy as np
import soundfile as sf
import torch
import noisereduce as nr
import io
from scipy.io.wavfile import write
import base64
import streamlit as st
from rtvc.encoder import inference as encoder
from rtvc.encoder.params_data import *
from rtvc.encoder.params_model import model_embedding_size as speaker_embedding_size
from rtvc.synthesizer.inference import Synthesizer_infer
from rtvc.utils.argutils import print_args
from rtvc.utils.default_models import ensure_default_models
from rtvc.vocoder import inference as vocoder
from rtvc.vocoder.display import save_attention_multiple, save_spectrogram, save_stop_tokens
from rtvc.synthesizer.utils.cleaners import english_cleaners_predict
from rtvc.speed_changer.fixSpeed import *


def tts(text, embed_name, nlp, autoplay=True):
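    """Synthesize `text` in the voice of the saved speaker embedding
    `embeds/{embed_name}.npy` and return the audio as a base64-encoded WAV string.
    """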
    run_id = "default"
    models_dir = Path("rtvc/saved_models")
    embed_path = f"embeds/{embed_name}.npy"
    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)
    ensure_default_models(run_id, models_dir)
    synthesizer = Synthesizer_infer(list(models_dir.glob(f"{run_id}/synthesizer.pt"))[0])
    # vocoder.load_model(list(models_dir.glob(f"{run_id}/vocoder.pt"))[0])
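    # The neural vocoder checkpoint is left unloaded: waveform generation below
    # uses Griffin-Lim, and the vocoder module is only used for denoising.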
    ## Generating the spectrogram
    # The synthesizer works in batch, so you need to put your data in a list or numpy array
    def split_text(text):
        text = english_cleaners_predict(text)
        texts = [i.text.strip() for i in nlp(text).sents]  # split the paragraph into sentences
        return texts

    texts = split_text(text)
    print(f"the list of input texts:\n{texts}")
    embed = np.load(embed_path)
    specs = []
    alignments = []
    stop_tokens = []
    for sentence in texts:
        spec, align, stop_token = synthesizer.synthesize_spectrograms([sentence], [embed], require_visualization=True)
        specs.append(spec[0])
        alignments.append(align[0])
        stop_tokens.append(stop_token[0])
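    # Record each sentence's spectrogram length (in frames) so silence can be
    # re-inserted at the sentence boundaries after the waveform is generated.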
    breaks = [spec.shape[1] for spec in specs]
    spec = np.concatenate(specs, axis=1)

    ## Save synthesizer visualization results
    if not os.path.exists("syn_results"):
        os.mkdir("syn_results")
    save_attention_multiple(alignments, "syn_results/attention")
    save_stop_tokens(stop_tokens, "syn_results/stop_tokens")
    save_spectrogram(spec, "syn_results/mel")
    print("Created the mel spectrogram")

    ## Generating the waveform
    print("Synthesizing the waveform:")
    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    wav = synthesizer.griffin_lim(spec)
    wav = vocoder.waveform_denoising(wav)

    # Add breaks
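    # Convert per-sentence frame counts into sample offsets via the hop size,
    # re-split the waveform at those boundaries, and interleave 0.15 s of
    # silence between consecutive sentences.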
    b_ends = np.cumsum(np.array(breaks) * Synthesizer_infer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer_infer.sample_rate))] * len(breaks)
    wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    # generated_wav = encoder.preprocess_wav(generated_wav)
    wav = wav / np.abs(wav).max() * 0.97  # normalize to just under full scale to avoid clipping

    if autoplay:
        # Play the audio (non-blocking)
        import sounddevice as sd
        try:
            sd.stop()
            sd.play(wav, synthesizer.sample_rate)
            time_span = len(wav) // synthesizer.sample_rate + 1
            time.sleep(time_span)
        except sd.PortAudioError as e:
            print("\nCaught exception: %s" % repr(e))
            print("Continuing without audio playback.\n")
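    # Serialize the waveform to an in-memory WAV file and return it
    # base64-encoded so it can be embedded in an HTML <audio> tag.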
    byte_io = io.BytesIO()
    write(byte_io, synthesizer.sample_rate, wav.astype(np.float32))
    byte_io.seek(0)  # rewind: write() leaves the cursor at the end, so read() would return nothing
    result_bytes = byte_io.read()
    return base64.b64encode(result_bytes).decode()


if __name__ == "__main__":
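    # Minimal demo: synthesize one sentence with the "Adele" embedding and
    # render the result in a Streamlit page as an autoplaying audio element.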
text = "Adkins was raised by a young single mother in various working-class neighbourhoods of London." | |
embed_name = "Adele" | |
nlp = spacy.load('en_core_web_sm') | |
b64 = tts(text, embed_name, nlp, autoplay=False) | |
md = f""" | |
<audio controls autoplay> | |
<source src="data:audio/wav;base64,{b64}" type="audio/wav"> | |
Your browser does not support the audio element. | |
</audio> | |
""" | |
st.markdown(md, unsafe_allow_html=True) |