# NOTE: captured from a Hugging Face Spaces page listing (reported status:
# "Runtime error"); the notebook source begins below.
#@title Tacotron2 GPU Synthesizer | |
#@markdown --- | |
#!pip install -q torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 torchtext==0.14.1 torchdata==0.5.1 --extra-index-url https://download.pytorch.org/whl/cu117 -U | |
#@markdown If the audio sounds too artificial, you can lower the superres_strength | |
#@markdown Choose Emotions: neutral, sad, happy, surprise, angry (in lower case)
#Add new characters here. | |
#import subprocess | |
# Define the Git repository URL and target directory | |
#repository_url = "https://github.com/example/repository.git" | |
#target_directory = "/path/to/target/directory" | |
# Execute the git clone command | |
#subprocess.run(["git", "clone", repository_url, target_directory]) | |
import logging

# Quiet noisy third-party libraries so notebook output stays readable.
for _noisy_logger in ("matplotlib", "numba", "librosa"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)
#Universal HiFi-GAN (has some robotic noise): 1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW | |
#Emotion = "angry" #@param {type:"string"} | |
hifigan_id = "universal"

# Ask which emotional Tacotron2 checkpoint to load.
Emotion = input("select your Emotion ['neutral','sad','happy','angry','surprise']")

# Google Drive file ids for the per-emotion Tacotron2 checkpoints.
Angry_tacotron_id = "1sJXE_fcCqfekZFZlF2kO01hrrI95FvwZ" #@param {type:"string"}
Sad_tacotron_id = "1rWBPz-gVGAkYFLzaVoZgu8JnoWqVA3bb" #@param {type:"string"}
Happy_tacotron_id = "1YDsuzEkiM-il7cESyux0KhOnvj5cDcYM" #@param {type:"string"}
Surprise_tacotron_id = "1e1h85cItOQaj0KO8q4hyp-vM4MITIdUj" #@param {type:"string"}
Neutral_tacotron_id = "104G09OHfu22uaRKaHqlSCYbrNpWQx4Pl" #@param {type:"string"}

# BUG FIX: the original lowercased `Emotion` only AFTER comparing it against
# lowercase strings, so an answer like "Angry" matched no branch and left
# `tacotron_id` undefined (NameError) instead of raising the intended error.
# Normalize first, then dispatch through a table; unknown emotions fall
# through to "" and hit the explicit Exception below.
Emotion = Emotion.strip().lower()
_EMOTION_TO_TACOTRON_ID = {
    "angry": Angry_tacotron_id,
    "sad": Sad_tacotron_id,
    "happy": Happy_tacotron_id,
    "surprise": Surprise_tacotron_id,
    "neutral": Neutral_tacotron_id,
}
tacotron_id = _EMOTION_TO_TACOTRON_ID.get(Emotion, "")
#@markdown ---
if tacotron_id != "":
    TACOTRON2_ID = tacotron_id
else:
    raise Exception("No ID provided.")

if hifigan_id in {"", "universal"}:
    HIFIGAN_ID = "universal"
    print("Using universal Hifi-Gan model.")
else:
    HIFIGAN_ID = hifigan_id
# Check if Initialized | |
# One-time setup guard: on the first run `initialized` is undefined, so the
# NameError branch performs the full environment setup; on later runs the
# try succeeds and setup is skipped (see `initialized = "Ready"` further down).
try:
    initialized
except NameError:
    print("Setting up, please wait.\n")
    #!pip install tqdm -q
    from tqdm.notebook import tqdm
    # 5 progress steps: repos, imports, pronunciation dict, HiFi-GAN, Tacotron2.
    with tqdm(total=5, leave=False) as pbar:
        import os
        from os.path import exists, join, basename, splitext
        #!pip install resampy
        #!pip install git+https://github.com/wkentaro/gdown.git
        git_repo_url = 'https://github.com/justinjohn0306/TTS-TT2.git'
        project_name = splitext(basename(git_repo_url))[0]
        if not exists(project_name):
            # clone and install
            !git clone -q --recursive {git_repo_url}
            !git clone -q --recursive https://github.com/justinjohn0306/hifi-gan
            #!pip install -q unidecode
        pbar.update(1) # downloaded TT2 and HiFi-GAN
        # Make both clones importable as plain modules.
        import sys
        sys.path.append('hifi-gan')
        sys.path.append(project_name)
        import time
        import matplotlib
        import matplotlib.pylab as plt
        import gdown
        # Base URL for downloading checkpoints from Google Drive by file id.
        d = 'https://drive.google.com/uc?id='
        %matplotlib inline
        import IPython.display as ipd
        import numpy as np
        import torch
        import json
        # Project-local imports resolved from the TTS-TT2 / hifi-gan clones above.
        from hparams import create_hparams
        from model import Tacotron2
        from layers import TacotronSTFT
        from audio_processing import griffin_lim
        from text import text_to_sequence
        from env import AttrDict
        from meldataset import mel_spectrogram, MAX_WAV_VALUE
        from models import Generator
        from denoiser import Denoiser
        import resampy
        import scipy.signal
        pbar.update(1) # initialized Dependancies
        # Plot size in pixels; divided by 100 below to get matplotlib inches.
        graph_width = 900
        graph_height = 360
def plot_data(data, figsize=(int(graph_width/100), int(graph_height/100))): | |
%matplotlib inline | |
fig, axes = plt.subplots(1, len(data), figsize=figsize) | |
for i in range(len(data)): | |
axes[i].imshow(data[i], aspect='auto', origin='lower', | |
interpolation='none', cmap='inferno') | |
fig.canvas.draw() | |
plt.show() | |
# Setup Pronounciation Dictionary | |
!wget 'https://github.com/justinjohn0306/tacotron2/releases/download/assets/merged.dict.txt' | |
thisdict = {} | |
for line in reversed((open('merged.dict.txt', "r").read()).splitlines()): | |
thisdict[(line.split(" ",1))[0]] = (line.split(" ",1))[1].strip() | |
pbar.update(1) # Downloaded and Set up Pronounciation Dictionary | |
def ARPA(text, punctuation=r"!?,.;", EOS_Token=True, dictionary=None):
    """Convert known words in `text` to {ARPAbet} tokens.

    Each space-separated word has trailing punctuation peeled off, is looked
    up (uppercased) in the pronunciation dictionary, and — when found — is
    replaced by "{ARPABET}"; unknown words are left as-is. Punctuation is
    re-attached and, when EOS_Token is set, a trailing ";" is ensured.

    dictionary: mapping of UPPERCASE word -> ARPAbet string; defaults to the
    module-level `thisdict` (new backward-compatible parameter).
    """
    if dictionary is None:
        dictionary = thisdict
    out = ''
    for word_ in text.split(" "):
        word = word_
        end_chars = ''
        # Peel trailing punctuation so the bare word can be looked up;
        # keep it in `end_chars` to re-attach afterwards.
        while any(ch in word for ch in punctuation) and len(word) > 1:
            if word[-1] in punctuation:
                end_chars = word[-1] + end_chars
                word = word[:-1]
            else:
                break  # punctuation is internal (e.g. "don't") — stop peeling
        try:
            word = "{" + str(dictionary[word.upper()]) + "}"
        except KeyError:
            pass  # unknown word: keep the original spelling
        out = (out + " " + word + end_chars).strip()
    # FIX: guard `out` so empty input returns "" instead of IndexError.
    if EOS_Token and out and out[-1] != ";":
        out += ";"
    return out
def get_hifigan(MODEL_ID, conf_name):
    """Download a HiFi-GAN vocoder checkpoint and load it onto the GPU.

    MODEL_ID: sentinel 1 -> bundled super-resolution checkpoint;
    "universal" -> universal vocoder; anything else is treated as a
    Google Drive file id and fetched with gdown.
    conf_name: JSON config name inside the hifi-gan repo clone.
    Returns (generator, config AttrDict, denoiser).
    """
    # Download HiFi-GAN
    hifigan_pretrained_model = 'hifimodel_' + conf_name
    #gdown.download(d+MODEL_ID, hifigan_pretrained_model, quiet=False)
    if MODEL_ID == 1:
        # Super-resolution model, used via get_hifigan(1, "config_32k").
        !wget "https://github.com/justinjohn0306/tacotron2/releases/download/assets/Superres_Twilight_33000" -O $hifigan_pretrained_model
    elif MODEL_ID == "universal":
        !wget "https://github.com/justinjohn0306/tacotron2/releases/download/assets/g_02500000" -O $hifigan_pretrained_model
    else:
        gdown.download(d+MODEL_ID, hifigan_pretrained_model, quiet=False)
    # Load HiFi-GAN
    conf = os.path.join("hifi-gan", conf_name + ".json")
    with open(conf) as f:
        json_config = json.loads(f.read())
    h = AttrDict(json_config)
    torch.manual_seed(h.seed)
    hifigan = Generator(h).to(torch.device("cuda"))
    state_dict_g = torch.load(hifigan_pretrained_model, map_location=torch.device("cuda"))
    hifigan.load_state_dict(state_dict_g["generator"])
    hifigan.eval()
    # Weight norm is a training-time wrapper; removing it speeds inference.
    hifigan.remove_weight_norm()
    denoiser = Denoiser(hifigan, mode="normal")
    return hifigan, h, denoiser
# Download character HiFi-GAN (22 kHz) used for the main vocoding pass.
hifigan, h, denoiser = get_hifigan(HIFIGAN_ID, "config_v1")
# Download super-resolution HiFi-GAN (sentinel id 1, 32 kHz config).
hifigan_sr, h2, denoiser_sr = get_hifigan(1, "config_32k")
pbar.update(1) # Downloaded and Set up HiFi-GAN
def has_MMI(STATE_DICT):
    """Return True when any key in STATE_DICT contains "mi." (an MMI model)."""
    for key in STATE_DICT.keys():
        if "mi." in key:
            return True
    return False
def get_Tactron2(MODEL_ID):
    """Download a Tacotron2 checkpoint from Google Drive and load it on GPU.

    NOTE(review): keeps the original "Tactron2" spelling — both call sites
    in this file use it, so renaming would break them.
    Returns (model in eval/half precision on CUDA, hparams).
    """
    # Download Tacotron2
    tacotron2_pretrained_model = 'MLPTTS'
    gdown.download(d+MODEL_ID, tacotron2_pretrained_model, quiet=False)
    if not exists(tacotron2_pretrained_model):
        raise Exception("Tacotron2 model failed to download!")
    # Load Tacotron2 and Config
    hparams = create_hparams()
    hparams.sampling_rate = 22050
    hparams.max_decoder_steps = 3000 # Max Duration
    hparams.gate_threshold = 0.25 # Model must be 25% sure the clip is over before ending generation
    model = Tacotron2(hparams)
    state_dict = torch.load(tacotron2_pretrained_model)['state_dict']
    if has_MMI(state_dict):
        raise Exception("ERROR: This notebook does not currently support MMI models.")
    model.load_state_dict(state_dict)
    # half() keeps VRAM usage down; inference only, so eval mode is set.
    _ = model.cuda().eval().half()
    return model, hparams
# Load the Tacotron2 checkpoint for the chosen emotion, and remember which
# id is loaded so a later re-run can detect a change and reload.
model, hparams = get_Tactron2(TACOTRON2_ID)
previous_tt2_id = TACOTRON2_ID
pbar.update(1) # Downloaded and Set up Tacotron2
# Extra Info
def end_to_end_infer(text, pronounciation_dictionary, show_graphs):
    """Synthesize each non-empty line of `text` and display playable audio.

    pronounciation_dictionary: when falsy, only a trailing ";" is ensured on
    the raw text; when truthy, the line is run through ARPA() first. (The
    caller below passes `not pronounciation_dictionary`, inverting the
    Colab checkbox.)
    show_graphs: also plot the mel spectrogram and attention alignment.
    Side effects: displays an IPython audio widget per line; no return value.
    """
    for i in [x for x in text.split("\n") if len(x)]:
        if not pronounciation_dictionary:
            # Ensure a terminating ";" so the gate reliably ends generation.
            if i[-1] != ";": i=i+";"
        else: i = ARPA(i)
        with torch.no_grad(): # save VRAM by not including gradients
            # Text -> token ids -> (1, T) long tensor on GPU.
            sequence = np.array(text_to_sequence(i, ['english_cleaners']))[None, :]
            sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
            if show_graphs:
                plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
                           alignments.float().data.cpu().numpy()[0].T))
            # Vocode the mel with the character HiFi-GAN, then denoise.
            y_g_hat = hifigan(mel_outputs_postnet.float())
            audio = y_g_hat.squeeze()
            audio = audio * MAX_WAV_VALUE
            audio_denoised = denoiser(audio.view(1, -1), strength=35)[:, 0]
            # Resample to 32k
            audio_denoised = audio_denoised.cpu().numpy().reshape(-1)
            # Normalization factor is divided back out after mixing below.
            normalize = (MAX_WAV_VALUE / np.max(np.abs(audio_denoised))) ** 0.9
            audio_denoised = audio_denoised * normalize
            wave = resampy.resample(
                audio_denoised,
                h.sampling_rate,
                h2.sampling_rate,
                filter="sinc_window",
                window=scipy.signal.windows.hann,
                num_zeros=8,
            )
            wave_out = wave.astype(np.int16)
            # HiFi-GAN super-resolution
            wave = wave / MAX_WAV_VALUE
            wave = torch.FloatTensor(wave).to(torch.device("cuda"))
            new_mel = mel_spectrogram(
                wave.unsqueeze(0),
                h2.n_fft,
                h2.num_mels,
                h2.sampling_rate,
                h2.hop_size,
                h2.win_size,
                h2.fmin,
                h2.fmax,
            )
            y_g_hat2 = hifigan_sr(new_mel)
            audio2 = y_g_hat2.squeeze()
            audio2 = audio2 * MAX_WAV_VALUE
            # NOTE(review): this uses `denoiser` (22 kHz vocoder) here, even
            # though `denoiser_sr` exists — possibly intended; confirm.
            audio2_denoised = denoiser(audio2.view(1, -1), strength=35)[:, 0]
            # High-pass filter, mixing and denormalizing
            audio2_denoised = audio2_denoised.cpu().numpy().reshape(-1)
            # Keep only the band above ~10.5 kHz from the super-res pass.
            b = scipy.signal.firwin(
                101, cutoff=10500, fs=h2.sampling_rate, pass_zero=False
            )
            y = scipy.signal.lfilter(b, [1.0], audio2_denoised)
            y *= superres_strength # user-tunable level of the added HF band
            y_out = y.astype(np.int16)
            # Pad the HF band to the base waveform's length, then mix.
            y_padded = np.zeros(wave_out.shape)
            y_padded[: y_out.shape[0]] = y_out
            sr_mix = wave_out + y_padded
            sr_mix = sr_mix / normalize
            print("")
            ipd.display(ipd.Audio(sr_mix.astype(np.int16), rate=h2.sampling_rate))
from IPython.display import clear_output
clear_output()
# Mark setup as done so re-running the cell skips the except-NameError path.
initialized = "Ready"
# If a different emotion/checkpoint was selected since the last run, reload.
if previous_tt2_id != TACOTRON2_ID:
    print("Updating Models")
    model, hparams = get_Tactron2(TACOTRON2_ID)
    hifigan, h, denoiser = get_hifigan(HIFIGAN_ID, "config_v1")
    previous_tt2_id = TACOTRON2_ID
pronounciation_dictionary = False #@param {type:"boolean"}
# disables automatic ARPAbet conversion, useful for inputting your own ARPAbet pronounciations or just for testing
show_graphs = False #@param {type:"boolean"}
max_duration = 20 #@param {type:"integer"}
# Scales seconds to decoder steps; ~80 steps/second here — TODO confirm ratio.
model.decoder.max_decoder_steps = max_duration * 80
stop_threshold = 0.5 #@param {type:"number"}
model.decoder.gate_threshold = stop_threshold
superres_strength = 10 #@param {type:"number"}
#@markdown ---
print(f"Current Config:\npronounciation_dictionary: {pronounciation_dictionary}\nshow_graphs: {show_graphs}\nmax_duration (in seconds): {max_duration}\nstop_threshold: {stop_threshold}\nsuperres_strength: {superres_strength}\n\n")
time.sleep(1)
print("Enter/Paste your text.")
contents = []
# Interactive loop: synthesize each entered line until EOF or Ctrl-C.
while True:
    try:
        print("-"*50)
        line = input("Enter your text here: ")
        if line == "":
            continue  # ignore blank entries, prompt again
        end_to_end_infer(line, not pronounciation_dictionary, show_graphs)
        #Emotion=input("select your emotion")
    except EOFError:
        break
    except KeyboardInterrupt:
        print("Stopping...")
        break