#@title Tacotron2 GPU Synthesizer
#@markdown ---
#!pip install -q torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 torchtext==0.14.1 torchdata==0.5.1 --extra-index-url https://download.pytorch.org/whl/cu117 -U
#@markdown If the audio sounds too artificial, you can lower superres_strength.
#@markdown Choose an emotion: neutral, sad, happy, surprise, angry (in lower case).
# Add new characters here.
#import subprocess
## Define the Git repository URL and target directory
#repository_url = "https://github.com/example/repository.git"
#target_directory = "/path/to/target/directory"
## Execute the git clone command
#subprocess.run(["git", "clone", repository_url, target_directory])
import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)
logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('librosa').setLevel(logging.WARNING)

# Universal HiFi-GAN (has some robotic noise): 1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW
#Emotion = "angry" #@param {type:"string"}
hifigan_id = "universal"
Emotion = input("Select your emotion ['neutral', 'sad', 'happy', 'angry', 'surprise']: ").strip().lower()  # normalize case before matching
Angry_tacotron_id = "1sJXE_fcCqfekZFZlF2kO01hrrI95FvwZ" #@param {type:"string"}
Sad_tacotron_id = "1rWBPz-gVGAkYFLzaVoZgu8JnoWqVA3bb" #@param {type:"string"}
Happy_tacotron_id = "1YDsuzEkiM-il7cESyux0KhOnvj5cDcYM" #@param {type:"string"}
Surprise_tacotron_id = "1e1h85cItOQaj0KO8q4hyp-vM4MITIdUj" #@param {type:"string"}
Neutral_tacotron_id = "104G09OHfu22uaRKaHqlSCYbrNpWQx4Pl" #@param {type:"string"}
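# Each *_tacotron_id above is a Google Drive file ID for an emotion-specific
# Tacotron2 checkpoint; the chosen emotion decides which one gets downloaded.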
if Emotion == "angry":
    tacotron_id = Angry_tacotron_id
elif Emotion == "sad":
    tacotron_id = Sad_tacotron_id
elif Emotion == "happy":
    tacotron_id = Happy_tacotron_id
elif Emotion == "surprise":
    tacotron_id = Surprise_tacotron_id
elif Emotion == "neutral":
    tacotron_id = Neutral_tacotron_id
else:
    tacotron_id = ""  # unrecognized emotion; caught by the check below
#@markdown ---
if tacotron_id != "":
    TACOTRON2_ID = tacotron_id
else:
    raise Exception("No ID provided.")

if hifigan_id in {"", "universal"}:
    HIFIGAN_ID = "universal"
    print("Using universal HiFi-GAN model.")
else:
    HIFIGAN_ID = hifigan_id
# Check if already initialized (lets the cell be re-run without re-downloading).
try:
    initialized
except NameError:
    print("Setting up, please wait.\n")
    #!pip install tqdm -q
    from tqdm.notebook import tqdm
    with tqdm(total=5, leave=False) as pbar:
        import os
        from os.path import exists, join, basename, splitext
        #!pip install resampy
        #!pip install git+https://github.com/wkentaro/gdown.git
        git_repo_url = 'https://github.com/justinjohn0306/TTS-TT2.git'
        project_name = splitext(basename(git_repo_url))[0]
        if not exists(project_name):
            # clone and install
            !git clone -q --recursive {git_repo_url}
            !git clone -q --recursive https://github.com/justinjohn0306/hifi-gan
            #!pip install -q unidecode
        pbar.update(1)  # downloaded TT2 and HiFi-GAN
        import sys
        sys.path.append('hifi-gan')
        sys.path.append(project_name)
        import time
        import matplotlib
        import matplotlib.pylab as plt
        import gdown
        d = 'https://drive.google.com/uc?id='  # Google Drive download URL prefix
        %matplotlib inline
        import IPython.display as ipd
        import numpy as np
        import torch
        import json
        from hparams import create_hparams
        from model import Tacotron2
        from layers import TacotronSTFT
        from audio_processing import griffin_lim
        from text import text_to_sequence
        from env import AttrDict
        from meldataset import mel_spectrogram, MAX_WAV_VALUE
        from models import Generator
        from denoiser import Denoiser
        import resampy
        import scipy.signal
        pbar.update(1)  # initialized dependencies
        graph_width = 900
        graph_height = 360
        def plot_data(data, figsize=(int(graph_width / 100), int(graph_height / 100))):
            %matplotlib inline
            fig, axes = plt.subplots(1, len(data), figsize=figsize)
            for i in range(len(data)):
                axes[i].imshow(data[i], aspect='auto', origin='lower',
                               interpolation='none', cmap='inferno')
            fig.canvas.draw()
            plt.show()
        # Set up pronunciation dictionary (word -> ARPAbet phonemes)
        !wget 'https://github.com/justinjohn0306/tacotron2/releases/download/assets/merged.dict.txt'
        thisdict = {}
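        # merged.dict.txt is CMUdict-style: an UPPERCASE word, a space, then its
        # ARPAbet phonemes (e.g. "HELLO HH AH0 L OW1"). Iterating in reverse means
        # entries nearer the top of the file win when a word appears twice.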
        with open('merged.dict.txt', "r") as f:
            for line in reversed(f.read().splitlines()):
                word, pron = line.split(" ", 1)
                thisdict[word] = pron.strip()
        pbar.update(1)  # downloaded and set up pronunciation dictionary
        def ARPA(text, punctuation=r"!?,.;", EOS_Token=True):
            out = ''
            for word_ in text.split(" "):
                word = word_
                end_chars = ''
                # Peel trailing punctuation off the word so it can be re-attached
                # after the dictionary lookup.
                while any(elem in word for elem in punctuation) and len(word) > 1:
                    if word[-1] in punctuation:
                        end_chars = word[-1] + end_chars
                        word = word[:-1]
                    else:
                        break
                try:
                    word_arpa = thisdict[word.upper()]
                    word = "{" + str(word_arpa) + "}"
                except KeyError:
                    pass
                out = (out + " " + word + end_chars).strip()
            if EOS_Token and out and out[-1] != ";":
                out += ";"
            return out
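        # Example (assuming merged.dict.txt carries the usual CMUdict entries):
        #   ARPA("Hello world.")  ->  "{HH AH0 L OW1} {W ER1 L D}.;"
        # Words missing from the dictionary pass through unchanged.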
        def get_hifigan(MODEL_ID, conf_name):
            # Download HiFi-GAN
            hifigan_pretrained_model = 'hifimodel_' + conf_name
            #gdown.download(d+MODEL_ID, hifigan_pretrained_model, quiet=False)
            if MODEL_ID == 1:
                !wget "https://github.com/justinjohn0306/tacotron2/releases/download/assets/Superres_Twilight_33000" -O $hifigan_pretrained_model
            elif MODEL_ID == "universal":
                !wget "https://github.com/justinjohn0306/tacotron2/releases/download/assets/g_02500000" -O $hifigan_pretrained_model
            else:
                gdown.download(d + MODEL_ID, hifigan_pretrained_model, quiet=False)

            # Load HiFi-GAN
            conf = os.path.join("hifi-gan", conf_name + ".json")
            with open(conf) as f:
                json_config = json.loads(f.read())
            h = AttrDict(json_config)
            torch.manual_seed(h.seed)
            hifigan = Generator(h).to(torch.device("cuda"))
            state_dict_g = torch.load(hifigan_pretrained_model, map_location=torch.device("cuda"))
            hifigan.load_state_dict(state_dict_g["generator"])
            hifigan.eval()
            hifigan.remove_weight_norm()
            denoiser = Denoiser(hifigan, mode="normal")
            return hifigan, h, denoiser
        # Download character HiFi-GAN (22.05 kHz vocoder)
        hifigan, h, denoiser = get_hifigan(HIFIGAN_ID, "config_v1")
        # Download super-resolution HiFi-GAN (32 kHz)
        hifigan_sr, h2, denoiser_sr = get_hifigan(1, "config_32k")
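        # MODEL_ID == 1 is a sentinel meaning "fetch the bundled 32 kHz
        # super-resolution checkpoint"; h and h2 carry each vocoder's
        # hyperparameters (sampling rate, mel settings, ...).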
        pbar.update(1)  # downloaded and set up HiFi-GAN
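        # MMI-trained checkpoints carry extra "mi."-prefixed keys in their
        # state dict; this notebook cannot load those models.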
        def has_MMI(STATE_DICT):
            return any("mi." in key for key in STATE_DICT)
        def get_Tactron2(MODEL_ID):
            # Download Tacotron2
            tacotron2_pretrained_model = 'MLPTTS'
            gdown.download(d + MODEL_ID, tacotron2_pretrained_model, quiet=False)
            if not exists(tacotron2_pretrained_model):
                raise Exception("Tacotron2 model failed to download!")
            # Load Tacotron2 and config
            hparams = create_hparams()
            hparams.sampling_rate = 22050
            hparams.max_decoder_steps = 3000  # max duration
            hparams.gate_threshold = 0.25  # model must be 25% sure the clip is over before ending generation
            model = Tacotron2(hparams)
            state_dict = torch.load(tacotron2_pretrained_model)['state_dict']
            if has_MMI(state_dict):
                raise Exception("ERROR: This notebook does not currently support MMI models.")
            model.load_state_dict(state_dict)
            _ = model.cuda().eval().half()
            return model, hparams

        model, hparams = get_Tactron2(TACOTRON2_ID)
        previous_tt2_id = TACOTRON2_ID
        pbar.update(1)  # downloaded and set up Tacotron2
        # Main inference routine: text -> Tacotron2 mel -> HiFi-GAN audio,
        # with optional ARPAbet conversion and a super-resolution pass.
        def end_to_end_infer(text, pronounciation_dictionary, show_graphs):
            for i in [x for x in text.split("\n") if len(x)]:
                if not pronounciation_dictionary:
                    if i[-1] != ";":
                        i = i + ";"
                else:
                    i = ARPA(i)
                with torch.no_grad():  # save VRAM by not tracking gradients
                    sequence = np.array(text_to_sequence(i, ['english_cleaners']))[None, :]
                    # torch.autograd.Variable is a deprecated no-op wrapper; a plain tensor works.
                    sequence = torch.from_numpy(sequence).cuda().long()
                    mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
                    if show_graphs:
                        plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
                                   alignments.float().data.cpu().numpy()[0].T))
                    y_g_hat = hifigan(mel_outputs_postnet.float())
                    audio = y_g_hat.squeeze()
                    audio = audio * MAX_WAV_VALUE
                    audio_denoised = denoiser(audio.view(1, -1), strength=35)[:, 0]

                    # Resample to 32 kHz
                    audio_denoised = audio_denoised.cpu().numpy().reshape(-1)
                    normalize = (MAX_WAV_VALUE / np.max(np.abs(audio_denoised))) ** 0.9
                    audio_denoised = audio_denoised * normalize
                    wave = resampy.resample(
                        audio_denoised,
                        h.sampling_rate,
                        h2.sampling_rate,
                        filter="sinc_window",
                        window=scipy.signal.windows.hann,
                        num_zeros=8,
                    )
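                    # resampy's "sinc_window" filter with a Hann window upsamples the
                    # 22.05 kHz vocoder output to the 32 kHz rate the super-resolution
                    # model expects; no content above ~11 kHz exists yet at this point.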
                    wave_out = wave.astype(np.int16)

                    # HiFi-GAN super-resolution
                    wave = wave / MAX_WAV_VALUE
                    wave = torch.FloatTensor(wave).to(torch.device("cuda"))
                    new_mel = mel_spectrogram(
                        wave.unsqueeze(0),
                        h2.n_fft,
                        h2.num_mels,
                        h2.sampling_rate,
                        h2.hop_size,
                        h2.win_size,
                        h2.fmin,
                        h2.fmax,
                    )
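                    # The upsampled audio is converted back into a 32 kHz mel
                    # spectrogram so the super-resolution vocoder can re-synthesize
                    # it with high-frequency detail filled in.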
                    y_g_hat2 = hifigan_sr(new_mel)
                    audio2 = y_g_hat2.squeeze()
                    audio2 = audio2 * MAX_WAV_VALUE
                    audio2_denoised = denoiser(audio2.view(1, -1), strength=35)[:, 0]

                    # High-pass filter, mixing and denormalizing
                    audio2_denoised = audio2_denoised.cpu().numpy().reshape(-1)
                    b = scipy.signal.firwin(
                        101, cutoff=10500, fs=h2.sampling_rate, pass_zero=False
                    )
                    y = scipy.signal.lfilter(b, [1.0], audio2_denoised)
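                    # The 10.5 kHz high-pass keeps only the band the 22.05 kHz model
                    # could not produce (its Nyquist limit is ~11 kHz), so mixing it
                    # into wave_out adds treble without touching the original voice.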
                    y *= superres_strength
                    y_out = y.astype(np.int16)
                    y_padded = np.zeros(wave_out.shape)
                    y_padded[: y_out.shape[0]] = y_out
                    sr_mix = wave_out + y_padded
                    sr_mix = sr_mix / normalize

                    print("")
                    ipd.display(ipd.Audio(sr_mix.astype(np.int16), rate=h2.sampling_rate))

    from IPython.display import clear_output
    clear_output()
    initialized = "Ready"

if previous_tt2_id != TACOTRON2_ID:
    print("Updating Models")
    model, hparams = get_Tactron2(TACOTRON2_ID)
    hifigan, h, denoiser = get_hifigan(HIFIGAN_ID, "config_v1")
    previous_tt2_id = TACOTRON2_ID
pronounciation_dictionary = False #@param {type:"boolean"}
# Checking this box disables automatic ARPAbet conversion -- useful for typing
# your own ARPAbet pronunciations, or just for testing.
show_graphs = False #@param {type:"boolean"}
max_duration = 20 #@param {type:"integer"}
model.decoder.max_decoder_steps = max_duration * 80
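# 80 decoder steps per second is an approximation: at 22,050 Hz (assuming the
# default 256-sample hop) the decoder emits ~86 mel frames per second, so
# max_duration is a slightly conservative cap on clip length.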
stop_threshold = 0.5 #@param {type:"number"}
model.decoder.gate_threshold = stop_threshold
superres_strength = 10 #@param {type:"number"}
#@markdown ---
print(f"Current config:\n"
      f"pronounciation_dictionary: {pronounciation_dictionary}\n"
      f"show_graphs: {show_graphs}\n"
      f"max_duration (in seconds): {max_duration}\n"
      f"stop_threshold: {stop_threshold}\n"
      f"superres_strength: {superres_strength}\n\n")
time.sleep(1)
print("Enter/Paste your text.")
contents = []
while True:
    try:
        print("-" * 50)
        line = input("Enter your text here: ")
        if line == "":
            continue
        # The form checkbox is inverted here: unchecked (False) means the
        # pronunciation dictionary IS applied inside end_to_end_infer.
        end_to_end_infer(line, not pronounciation_dictionary, show_graphs)
        #Emotion = input("select your emotion")
    except EOFError:
        break
    except KeyboardInterrupt:
        print("Stopping...")
        break