sudip1310's picture
Update app.py
e899374
raw
history blame contribute delete
No virus
13 kB
#@title Tacotron2 GPU Synthesizer
#@markdown ---
#!pip install -q torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 torchtext==0.14.1 torchdata==0.5.1 --extra-index-url https://download.pytorch.org/whl/cu117 -U
#@markdown If the audio sounds too artificial, you can lower the superres_strength
#@markdown Choose an emotion: neutral, sad, happy, surprise, angry (in lower case)
#Add new characters here.
#import subprocess
# Define the Git repository URL and target directory
#repository_url = "https://github.com/example/repository.git"
#target_directory = "/path/to/target/directory"
# Execute the git clone command
#subprocess.run(["git", "clone", repository_url, target_directory])
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('librosa').setLevel(logging.WARNING)
#Universal HiFi-GAN (has some robotic noise): 1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW
#Emotion = "angry" #@param {type:"string"}
hifigan_id = "universal"
# Normalise the answer BEFORE matching it: previously the lower-casing happened
# after the comparison chain, so "Angry" or " sad" never matched anything.
Emotion = input("select your Emotion ['neutral','sad','happy','angry','surprise']").strip().lower()
# Google Drive file IDs of the per-emotion Tacotron2 checkpoints.
Angry_tacotron_id = "1sJXE_fcCqfekZFZlF2kO01hrrI95FvwZ" #@param {type:"string"}
Sad_tacotron_id = "1rWBPz-gVGAkYFLzaVoZgu8JnoWqVA3bb" #@param {type:"string"}
Happy_tacotron_id = "1YDsuzEkiM-il7cESyux0KhOnvj5cDcYM" #@param {type:"string"}
Surprise_tacotron_id = "1e1h85cItOQaj0KO8q4hyp-vM4MITIdUj" #@param {type:"string"}
Neutral_tacotron_id = "104G09OHfu22uaRKaHqlSCYbrNpWQx4Pl" #@param {type:"string"}
_TACOTRON_IDS_BY_EMOTION = {
    "angry": Angry_tacotron_id,
    "sad": Sad_tacotron_id,
    "happy": Happy_tacotron_id,
    "surprise": Surprise_tacotron_id,
    "neutral": Neutral_tacotron_id,
}
# An unrecognised emotion used to leave `tacotron_id` undefined and surface
# later as an unrelated NameError; report the real problem here instead.
try:
    tacotron_id = _TACOTRON_IDS_BY_EMOTION[Emotion]
except KeyError:
    raise ValueError(
        f"Unknown emotion {Emotion!r}; expected one of: "
        f"{', '.join(_TACOTRON_IDS_BY_EMOTION)}"
    ) from None
#@markdown ---
# An empty ID means no Tacotron2 checkpoint was chosen; stop before any
# download is attempted.
if tacotron_id == "":
    raise Exception("No ID provided.")
TACOTRON2_ID = tacotron_id
# Both "" and "universal" select the universal HiFi-GAN vocoder; any other
# value is passed through unchanged as the vocoder ID.
if hifigan_id in {"", "universal"}:
    HIFIGAN_ID = "universal"
    print("Using universal Hifi-Gan model.")
else:
    HIFIGAN_ID = hifigan_id
# Check if Initialized
# `initialized` is only assigned once setup has finished, so evaluating the
# bare name raises NameError on the first run and the setup below executes.
try:
initialized
except NameError:
print("Setting up, please wait.\n")
#!pip install tqdm -q
from tqdm.notebook import tqdm
# Progress bar with five steps; each pbar.update(1) below marks one completed
# setup stage.
with tqdm(total=5, leave=False) as pbar:
import os
from os.path import exists, join, basename, splitext
#!pip install resampy
#!pip install git+https://github.com/wkentaro/gdown.git
git_repo_url = 'https://github.com/justinjohn0306/TTS-TT2.git'
# Repository directory name = last URL component without its ".git" suffix.
project_name = splitext(basename(git_repo_url))[0]
# Clone only when the project directory is not already present.
if not exists(project_name):
# clone and install
!git clone -q --recursive {git_repo_url}
!git clone -q --recursive https://github.com/justinjohn0306/hifi-gan
#!pip install -q unidecode
pbar.update(1) # downloaded TT2 and HiFi-GAN
import sys
# Make the modules inside both cloned repositories importable.
sys.path.append('hifi-gan')
sys.path.append(project_name)
import time
import matplotlib
import matplotlib.pylab as plt
import gdown
# URL prefix to which a Google Drive file ID is appended for gdown downloads.
d = 'https://drive.google.com/uc?id='
%matplotlib inline
import IPython.display as ipd
import numpy as np
import torch
import json
# NOTE(review): the project imports below are presumably resolved from the two
# cloned repositories added to sys.path above - confirm which repo owns each.
from hparams import create_hparams
from model import Tacotron2
from layers import TacotronSTFT
from audio_processing import griffin_lim
from text import text_to_sequence
from env import AttrDict
from meldataset import mel_spectrogram, MAX_WAV_VALUE
from models import Generator
from denoiser import Denoiser
import resampy
import scipy.signal
pbar.update(1) # initialized Dependancies
# Pixel dimensions that plot_data divides by 100 to get its default figsize.
graph_width = 900
graph_height = 360
def plot_data(data, figsize=(int(graph_width/100), int(graph_height/100))):
    """Render every array in *data* as a heat map, side by side in one row.

    Args:
        data: Sequence of 2-D arrays; one subplot column is created per entry.
        figsize: Figure size passed to ``plt.subplots``. The default is
            ``graph_width`` / ``graph_height`` divided by 100, evaluated once
            when the function is defined.
    """
    # `%matplotlib inline` is IPython-only syntax; this is its plain-Python
    # equivalent, skipped silently when no IPython shell is running.
    try:
        get_ipython().run_line_magic("matplotlib", "inline")
    except NameError:
        pass
    # squeeze=False always yields a 2-D array of axes, so a single-entry
    # `data` no longer fails on indexing a bare Axes object.
    fig, axes = plt.subplots(1, len(data), figsize=figsize, squeeze=False)
    for axis, image in zip(axes[0], data):
        axis.imshow(image, aspect='auto', origin='lower',
                    interpolation='none', cmap='inferno')
    fig.canvas.draw()
    plt.show()
# Setup Pronounciation Dictionary
import subprocess  # local import: replaces the notebook-only `!wget` shell escape

# check=True surfaces a failed download here instead of as a confusing
# file-not-found error on the open() below.
subprocess.run(
    ["wget", "https://github.com/justinjohn0306/tacotron2/releases/download/assets/merged.dict.txt"],
    check=True,
)
thisdict = {}
# Read through a context manager so the file handle is closed promptly.
with open('merged.dict.txt', "r") as dict_file:
    dict_lines = dict_file.read().splitlines()
# Iterating in reverse makes the FIRST occurrence of a word in the file win,
# because earlier lines are assigned last and overwrite later duplicates.
for line in reversed(dict_lines):
    word, separator, pronunciation = line.partition(" ")
    if not separator:
        # Blank or malformed line without a "word pronunciation" pair.
        continue
    thisdict[word] = pronunciation.strip()
pbar.update(1) # Downloaded and Set up Pronounciation Dictionary
def ARPA(text, punctuation=r"!?,.;", EOS_Token=True, dictionary=None):
    """Replace known words in *text* with their dictionary pronunciation.

    The text is split on single spaces. Trailing punctuation is peeled off
    each token, the remaining word is looked up in upper case, and a hit is
    substituted as ``{pronunciation}`` with the punctuation re-attached.
    Words that are not in the dictionary are kept unchanged.

    Args:
        text: Input sentence.
        punctuation: Characters treated as trailing punctuation.
        EOS_Token: When true, make sure the result ends with ``";"``.
        dictionary: Mapping of upper-case word to pronunciation. Defaults to
            the module-level ``thisdict``.

    Returns:
        The converted string. Empty input yields ``";"`` when ``EOS_Token`` is
        true (it used to raise IndexError) and ``""`` otherwise.
    """
    lookup = thisdict if dictionary is None else dictionary
    out = ''
    for token in text.split(" "):
        word, end_chars = token, ''
        # Strip trailing punctuation but always leave at least one character.
        while len(word) > 1 and word[-1] in punctuation:
            end_chars = word[-1] + end_chars
            word = word[:-1]
        try:
            word = "{" + str(lookup[word.upper()]) + "}"
        except KeyError:
            pass  # unknown word: keep the original spelling
        # strip() drops the separator added for empty tokens (repeated spaces).
        out = (out + " " + word + end_chars).strip()
    if EOS_Token and not out.endswith(";"):
        out += ";"
    return out
def get_hifigan(MODEL_ID, conf_name):
    """Download a HiFi-GAN checkpoint and load it on CUDA for inference.

    Args:
        MODEL_ID: ``1`` fetches the ``Superres_Twilight_33000`` release asset,
            ``"universal"`` fetches ``g_02500000``; any other value is appended
            to the Google Drive URL prefix ``d`` and fetched with gdown.
        conf_name: Base name (without ``.json``) of the config file inside
            ``hifi-gan/``; also used as the suffix of the local checkpoint
            file name.

    Returns:
        Tuple ``(hifigan, h, denoiser)``: the generator in eval mode with
        weight norm removed, its config as an ``AttrDict``, and a ``Denoiser``
        built from that generator.

    Raises:
        subprocess.CalledProcessError: If ``wget`` exits with an error.
    """
    import subprocess  # local import: replaces the notebook-only `!wget` shell escape

    # Download HiFi-GAN
    assets_url = "https://github.com/justinjohn0306/tacotron2/releases/download/assets/"
    hifigan_pretrained_model = 'hifimodel_' + conf_name
    if MODEL_ID == 1:
        subprocess.run(
            ["wget", assets_url + "Superres_Twilight_33000", "-O", hifigan_pretrained_model],
            check=True,
        )
    elif MODEL_ID == "universal":
        subprocess.run(
            ["wget", assets_url + "g_02500000", "-O", hifigan_pretrained_model],
            check=True,
        )
    else:
        gdown.download(d + MODEL_ID, hifigan_pretrained_model, quiet=False)

    # Load HiFi-GAN
    conf = os.path.join("hifi-gan", conf_name + ".json")
    with open(conf) as f:
        json_config = json.loads(f.read())
    h = AttrDict(json_config)
    torch.manual_seed(h.seed)
    device = torch.device("cuda")
    hifigan = Generator(h).to(device)
    # NOTE(review): torch.load unpickles the downloaded file; only use
    # checkpoints from sources you trust.
    state_dict_g = torch.load(hifigan_pretrained_model, map_location=device)
    hifigan.load_state_dict(state_dict_g["generator"])
    hifigan.eval()
    hifigan.remove_weight_norm()
    denoiser = Denoiser(hifigan, mode="normal")
    return hifigan, h, denoiser
# Download character HiFi-GAN
# Vocoder that turns Tacotron2 mel spectrograms into audio (config_v1).
hifigan, h, denoiser = get_hifigan(HIFIGAN_ID, "config_v1")
# Download super-resolution HiFi-GAN
# MODEL_ID 1 makes get_hifigan fetch the Superres_Twilight_33000 asset; it is
# loaded with the config_32k configuration.
hifigan_sr, h2, denoiser_sr = get_hifigan(1, "config_32k")
pbar.update(1) # Downloaded and Set up HiFi-GAN
def has_MMI(STATE_DICT):
    """Return True when any key of *STATE_DICT* contains the substring ``"mi."``.

    Used to detect MMI checkpoints, which the loader rejects.
    """
    return any("mi." in key for key in STATE_DICT)
def get_Tactron2(MODEL_ID):
    """Download a Tacotron2 checkpoint from Google Drive and load it on CUDA.

    Args:
        MODEL_ID: Google Drive file ID; appended to the URL prefix ``d``.

    Returns:
        Tuple ``(model, hparams)``: the Tacotron2 model on CUDA, in eval mode
        and half precision, plus the hyper-parameters used to build it.

    Raises:
        Exception: If the download fails or the checkpoint is an MMI model.
    """
    # Download Tacotron2
    tacotron2_pretrained_model = 'MLPTTS'
    downloaded = gdown.download(d + MODEL_ID, tacotron2_pretrained_model, quiet=False)
    # The output file name is fixed, so a file left by an earlier download
    # would satisfy exists() alone. Some gdown versions signal failure by
    # returning None, so that is checked as well.
    if downloaded is None or not exists(tacotron2_pretrained_model):
        raise Exception("Tacotron2 model failed to download!")

    # Load Tacotron2 and Config
    hparams = create_hparams()
    hparams.sampling_rate = 22050
    hparams.max_decoder_steps = 3000 # Max Duration
    hparams.gate_threshold = 0.25 # Model must be 25% sure the clip is over before ending generation
    model = Tacotron2(hparams)
    # NOTE(review): torch.load unpickles the downloaded file; only use
    # checkpoints from sources you trust.
    state_dict = torch.load(tacotron2_pretrained_model)['state_dict']
    if has_MMI(state_dict):
        raise Exception("ERROR: This notebook does not currently support MMI models.")
    model.load_state_dict(state_dict)
    model.cuda().eval().half()
    return model, hparams
# Load the Tacotron2 checkpoint selected by TACOTRON2_ID.
model, hparams = get_Tactron2(TACOTRON2_ID)
# Remember which checkpoint is loaded so a later run can detect a changed ID
# and reload the models.
previous_tt2_id = TACOTRON2_ID
pbar.update(1) # Downloaded and Set up Tacotron2
# Extra Info
def end_to_end_infer(text, pronounciation_dictionary, show_graphs):
    """Synthesise and display one audio clip per non-empty line of *text*.

    Steps per line: Tacotron2 produces a mel spectrogram, HiFi-GAN turns it
    into a waveform, and the waveform is denoised and resampled from
    ``h.sampling_rate`` to ``h2.sampling_rate``. The super-resolution HiFi-GAN
    then produces a second waveform, which is high-pass filtered, scaled by
    ``superres_strength``, and mixed into the resampled audio.

    Reads the module-level objects ``model``, ``hifigan``, ``hifigan_sr``,
    ``denoiser``, ``h``, ``h2`` and ``superres_strength`` at call time.

    Args:
        text: Text to speak; each non-empty line becomes its own clip.
        pronounciation_dictionary: When truthy, each line is converted with
            ``ARPA()``; otherwise only a trailing ``";"`` is ensured.
        show_graphs: When truthy, plot the mel spectrogram and alignment.
    """
    for line in text.split("\n"):
        if not line:
            continue
        if pronounciation_dictionary:
            line = ARPA(line)
        elif not line.endswith(";"):
            line += ";"

        with torch.no_grad(): # save VRAM by not including gradients
            sequence = np.array(text_to_sequence(line, ['english_cleaners']))[None, :]
            # torch.autograd.Variable is a deprecated no-op wrapper, so the
            # tensor is used directly.
            sequence = torch.from_numpy(sequence).cuda().long()
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
            if show_graphs:
                plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
                           alignments.float().data.cpu().numpy()[0].T))

            y_g_hat = hifigan(mel_outputs_postnet.float())
            audio = y_g_hat.squeeze() * MAX_WAV_VALUE
            audio_denoised = denoiser(audio.view(1, -1), strength=35)[:, 0]

            # Resample to 32k
            audio_denoised = audio_denoised.cpu().numpy().reshape(-1)
            peak = np.max(np.abs(audio_denoised))
            # A silent clip has peak 0; skip the gain instead of dividing by
            # zero and filling the output with inf/NaN.
            normalize = (MAX_WAV_VALUE / peak) ** 0.9 if peak > 0 else 1.0
            audio_denoised = audio_denoised * normalize
            wave = resampy.resample(
                audio_denoised,
                h.sampling_rate,
                h2.sampling_rate,
                filter="sinc_window",
                window=scipy.signal.windows.hann,
                num_zeros=8,
            )
            wave_out = wave.astype(np.int16)

            # HiFi-GAN super-resolution
            wave = wave / MAX_WAV_VALUE
            wave = torch.FloatTensor(wave).to(torch.device("cuda"))
            new_mel = mel_spectrogram(
                wave.unsqueeze(0),
                h2.n_fft,
                h2.num_mels,
                h2.sampling_rate,
                h2.hop_size,
                h2.win_size,
                h2.fmin,
                h2.fmax,
            )
            y_g_hat2 = hifigan_sr(new_mel)
            audio2 = y_g_hat2.squeeze() * MAX_WAV_VALUE
            # NOTE(review): this uses `denoiser` (built from the first
            # vocoder), not `denoiser_sr` - confirm that is intended.
            audio2_denoised = denoiser(audio2.view(1, -1), strength=35)[:, 0]

            # High-pass filter, mixing and denormalizing
            audio2_denoised = audio2_denoised.cpu().numpy().reshape(-1)
            b = scipy.signal.firwin(
                101, cutoff=10500, fs=h2.sampling_rate, pass_zero=False
            )
            y = scipy.signal.lfilter(b, [1.0], audio2_denoised)
            y *= superres_strength
            y_out = y.astype(np.int16)
            y_padded = np.zeros(wave_out.shape)
            # Copy only the overlapping part so a super-resolution output
            # longer than the resampled audio cannot raise a shape error.
            overlap = min(y_out.shape[0], y_padded.shape[0])
            y_padded[:overlap] = y_out[:overlap]
            sr_mix = wave_out + y_padded
            sr_mix = sr_mix / normalize

            print("")
            ipd.display(ipd.Audio(sr_mix.astype(np.int16), rate=h2.sampling_rate))
from IPython.display import clear_output
# Clear the setup output printed so far.
clear_output()
# Defining this name makes the "Check if Initialized" test skip setup on the
# next run in the same session.
initialized = "Ready"
# Reload both networks when the requested Tacotron2 checkpoint differs from
# the one currently loaded.
if previous_tt2_id != TACOTRON2_ID:
    print("Updating Models")
    model, hparams = get_Tactron2(TACOTRON2_ID)
    hifigan, h, denoiser = get_hifigan(HIFIGAN_ID, "config_v1")
    previous_tt2_id = TACOTRON2_ID
# NOTE: the input loop passes `not pronounciation_dictionary` to
# end_to_end_infer, so False here means ARPAbet conversion IS applied.
pronounciation_dictionary = False #@param {type:"boolean"}
# disables automatic ARPAbet conversion, useful for inputting your own ARPAbet pronounciations or just for testing
show_graphs = False #@param {type:"boolean"}
max_duration = 20 #@param {type:"integer"}
# Converts seconds to decoder steps; assumes about 80 decoder steps per second
# of audio - TODO confirm against the model's hop size.
model.decoder.max_decoder_steps = max_duration * 80
stop_threshold = 0.5 #@param {type:"number"}
# Overrides the gate_threshold of 0.25 set when the model was loaded.
model.decoder.gate_threshold = stop_threshold
# Gain applied to the high-pass-filtered super-resolution signal before mixing.
superres_strength = 10 #@param {type:"number"}
#@markdown ---
print(f"Current Config:\npronounciation_dictionary: {pronounciation_dictionary}\nshow_graphs: {show_graphs}\nmax_duration (in seconds): {max_duration}\nstop_threshold: {stop_threshold}\nsuperres_strength: {superres_strength}\n\n")
time.sleep(1)
print("Enter/Paste your text.")
contents = []
# Read one line at a time and synthesise it until input ends (EOF) or the
# user interrupts.
while True:
    try:
        print("-"*50)
        line = input("Enter your text here: ")
        # Whitespace-only input is treated like empty input: it carries no
        # text to speak and used to crash the inference step.
        if not line.strip():
            continue
        # The flag is inverted on purpose: end_to_end_infer applies ARPAbet
        # conversion when its second argument is truthy.
        end_to_end_infer(line, not pronounciation_dictionary, show_graphs)
        #Emotion=input("select your emotion")
    except EOFError:
        break
    except KeyboardInterrupt:
        print("Stopping...")
        break