Spaces:
Runtime error
Runtime error
File size: 4,891 Bytes
6bc94ac 436ce71 6bc94ac d2b6583 6bc94ac 436ce71 6bc94ac 436ce71 6bc94ac 436ce71 6bc94ac 15303cb 6bc94ac 436ce71 6bc94ac |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
import argparse
from ctypes import alignment
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import sys
sys.path.append('rtvc/')
from pathlib import Path
import time
import spacy
import matplotlib.pyplot as plt
import librosa
import numpy as np
import soundfile as sf
import torch
import noisereduce as nr
import io
from scipy.io.wavfile import write
import base64
from rtvc.encoder import inference as encoder
from rtvc.encoder.params_data import *
from rtvc.encoder.params_model import model_embedding_size as speaker_embedding_size
from rtvc.synthesizer.inference import Synthesizer_infer
from rtvc.utils.argutils import print_args
from rtvc.utils.default_models import ensure_default_models
from rtvc.vocoder import inference as vocoder
from rtvc.vocoder.display import save_attention_multiple, save_spectrogram, save_stop_tokens
from rtvc.synthesizer.utils.cleaners import english_cleaners_predict
from rtvc.speed_changer.fixSpeed import *
def _insert_pauses(wav, frame_counts, hop_size, sample_rate, pause_seconds=0.15):
    """Cut *wav* at per-sentence boundaries and append a silent pause to each piece.

    Args:
        wav: Waveform covering all sentences back to back.
        frame_counts: Number of spectrogram frames produced for each sentence.
        hop_size: Number of waveform samples per spectrogram frame.
        sample_rate: Samples per second, used to size the pause.
        pause_seconds: Length of the silence inserted after every sentence.

    Returns:
        The waveform with one block of zeros after every sentence segment.
    """
    seg_ends = np.cumsum(np.array(frame_counts) * hop_size)
    seg_starts = np.concatenate(([0], seg_ends[:-1]))
    pause = np.zeros(int(pause_seconds * sample_rate))

    pieces = []
    for start, end in zip(seg_starts, seg_ends):
        pieces.append(wav[start:end])
        pieces.append(pause)
    return np.concatenate(pieces)


def _peak_normalize(wav):
    """Scale *wav* so its largest absolute sample is 1.

    All-zero audio is returned unchanged instead of being divided by zero,
    which would otherwise fill the result with NaNs.
    """
    peak = np.abs(wav).max()
    if peak > 0:
        return wav / peak
    return wav


def tts(text, embed_name, nlp, autoplay=True):
    """Synthesize speech for *text* using a stored speaker embedding.

    The text is cleaned with ``english_cleaners_predict``, split into
    sentences with *nlp*, and each sentence is turned into a spectrogram by
    the synthesizer. The concatenated spectrogram is converted to audio with
    ``synthesizer.griffin_lim`` (the neural vocoder model is not loaded),
    passed through ``vocoder.waveform_denoising``, padded with a short pause
    after every sentence and peak-normalized.

    Attention, stop-token and mel outputs are handed to the ``save_*``
    helpers with path prefixes under ``syn_results/``.

    Args:
        text: Text to speak; may contain several sentences.
        embed_name: Stem of the embedding file, loaded from
            ``embeds/<embed_name>.npy``.
        nlp: Callable applied to the cleaned text whose result exposes
            ``.sents`` (e.g. a loaded spaCy pipeline).
        autoplay: When true, play the result through ``sounddevice`` and wait
            for roughly the clip duration before returning.

    Returns:
        A ``(wav, sample_rate)`` tuple: the normalized waveform and
        ``synthesizer.sample_rate``.

    Raises:
        FileNotFoundError: If the synthesizer checkpoint or embedding file is
            missing.
        ValueError: If *text* yields no sentences to synthesize.
    """
    run_id = "default"
    models_dir = Path("rtvc/saved_models")
    embed_path = f"embeds/{embed_name}.npy"

    ensure_default_models(run_id, models_dir)
    synthesizer_path = models_dir / run_id / "synthesizer.pt"
    if not synthesizer_path.is_file():
        # Fail with the offending path rather than an IndexError from an empty glob.
        raise FileNotFoundError(f"Synthesizer checkpoint not found: {synthesizer_path}")
    synthesizer = Synthesizer_infer(synthesizer_path)

    ## Generating the spectrogram
    # Split the cleaned paragraph into sentences; each one is synthesized on
    # its own so that a pause can be inserted between them afterwards.
    cleaned = english_cleaners_predict(text)
    texts = [sent.text.strip() for sent in nlp(cleaned).sents]
    print(f"the list of inputs texts:\n{texts}")
    if not texts:
        raise ValueError("tts() received text that contains no sentences to synthesize")

    embed = np.load(embed_path)

    specs = []
    alignments = []
    stop_tokens = []
    for sentence in texts:
        # The synthesizer works in batch, so inputs are wrapped in lists and
        # the single result is taken back out with [0].
        spec, align, stop_token = synthesizer.synthesize_spectrograms(
            [sentence], [embed], require_visualization=True
        )
        specs.append(spec[0])
        alignments.append(align[0])
        stop_tokens.append(stop_token[0])

    # Frame count per sentence, needed later to find the sentence boundaries
    # inside the single concatenated waveform.
    frame_counts = [sentence_spec.shape[1] for sentence_spec in specs]
    spec = np.concatenate(specs, axis=1)

    ## Save synthesizer visualization results
    os.makedirs("syn_results", exist_ok=True)
    save_attention_multiple(alignments, "syn_results/attention")
    save_stop_tokens(stop_tokens, "syn_results/stop_tokens")
    save_spectrogram(spec, "syn_results/mel")
    print("Created the mel spectrogram")

    ## Generating the waveform
    print("Synthesizing the waveform:")
    wav = synthesizer.griffin_lim(spec)
    wav = vocoder.waveform_denoising(wav)

    # Add breaks between sentences, then bring the peak amplitude to 1.
    wav = _insert_pauses(
        wav,
        frame_counts,
        Synthesizer_infer.hparams.hop_size,
        Synthesizer_infer.sample_rate,
    )
    wav = _peak_normalize(wav)

    if autoplay:
        import sounddevice as sd

        try:
            sd.stop()
            sd.play(wav, synthesizer.sample_rate)
            # Wait about one clip length (rounded up to whole seconds) so the
            # audio is not cut off when the caller returns or the script exits.
            time.sleep(len(wav) // synthesizer.sample_rate + 1)
        except sd.PortAudioError as e:
            print("\nCaught exception: %s" % repr(e))
            print("Continuing without audio playback. Suppress this message with the \"--no_sound\" flag.\n")

    return wav, synthesizer.sample_rate
if __name__ == "__main__":
    # Demo run: speak a multi-sentence paragraph with the "Adele" speaker
    # embedding, using spaCy's small English pipeline for sentence splitting.
    text = "Adkins was raised by a young single mother in various working-class neighbourhoods of London. As a child, she enjoyed singing contemporary pop music and learned to play the guitar and the clarinet. However, it was not until her early teens, when she discovered rhythm-and-blues singer Etta James and other mid-20th-century performers, that she began to consider a musical career. While she honed her talents at a government-funded secondary school for the performing arts, a friend began posting songs Adkins had written and recorded onto the social networking Web site Myspace. Her music eventually caught the attention of record labels, and in 2006, several months after graduating, she signed a contract with XL Recordings."
    embed_name = "Adele"
    nlp = spacy.load('en_core_web_sm')
    tts(text, embed_name, nlp)