# NOTE: captured from a Hugging Face Spaces page listing (reported status:
# "Runtime error"); the notebook source begins below.
#@title Tacotron2 GPU Synthesizer | |
#@markdown --- | |
#!pip install -q torch==1.13.1+cu117 torchvision==0.14.1+cu117 torchaudio==0.13.1 torchtext==0.14.1 torchdata==0.5.1 --extra-index-url https://download.pytorch.org/whl/cu117 -U | |
#@markdown If the audio sounds too artificial, you can lower the superres_strength | |
#@markdown Choose Emotions: neutral, sad, happy, surprise, angry (in lower case)
#Add new characters here. | |
#import subprocess | |
# Define the Git repository URL and target directory | |
#repository_url = "https://github.com/example/repository.git" | |
#target_directory = "/path/to/target/directory" | |
# Execute the git clone command | |
#subprocess.run(["git", "clone", repository_url, target_directory]) | |
import logging

# Quiet noisy third-party libraries so notebook output stays readable.
for _noisy_logger in ("matplotlib", "numba", "librosa"):
    logging.getLogger(_noisy_logger).setLevel(logging.WARNING)
#Universal HiFi-GAN (has some robotic noise): 1qpgI41wNXFcH-iKq1Y42JlBC9j0je8PW | |
#Emotion = "angry" #@param {type:"string"} | |
hifigan_id = "universal"

# Ask which emotional Tacotron2 checkpoint to load.
Emotion = input("select your Emotion ['neutral','sad','happy','angry','surprise']")

# Google Drive file ids for the per-emotion Tacotron2 checkpoints.
Angry_tacotron_id = "1sJXE_fcCqfekZFZlF2kO01hrrI95FvwZ" #@param {type:"string"}
Sad_tacotron_id = "1rWBPz-gVGAkYFLzaVoZgu8JnoWqVA3bb" #@param {type:"string"}
Happy_tacotron_id = "1YDsuzEkiM-il7cESyux0KhOnvj5cDcYM" #@param {type:"string"}
Surprise_tacotron_id = "1e1h85cItOQaj0KO8q4hyp-vM4MITIdUj" #@param {type:"string"}
Neutral_tacotron_id = "104G09OHfu22uaRKaHqlSCYbrNpWQx4Pl" #@param {type:"string"}

# BUG FIX: the original lowercased `Emotion` only AFTER comparing it against
# lowercase strings, so an answer like "Angry" matched no branch and left
# `tacotron_id` undefined (NameError) instead of raising the intended error.
# Normalize first, then dispatch through a table; unknown emotions fall
# through to "" and hit the explicit Exception below.
Emotion = Emotion.strip().lower()
_EMOTION_TO_TACOTRON_ID = {
    "angry": Angry_tacotron_id,
    "sad": Sad_tacotron_id,
    "happy": Happy_tacotron_id,
    "surprise": Surprise_tacotron_id,
    "neutral": Neutral_tacotron_id,
}
tacotron_id = _EMOTION_TO_TACOTRON_ID.get(Emotion, "")
#@markdown ---
if tacotron_id != "":
    TACOTRON2_ID = tacotron_id
else:
    raise Exception("No ID provided.")

if hifigan_id in {"", "universal"}:
    HIFIGAN_ID = "universal"
    print("Using universal Hifi-Gan model.")
else:
    HIFIGAN_ID = hifigan_id
# Check if Initialized | |
# One-time setup guard: on the first run `initialized` is undefined, so the
# NameError branch performs the full environment setup; on later runs the
# try succeeds and setup is skipped (see `initialized = "Ready"` further down).
try:
    initialized
except NameError:
    print("Setting up, please wait.\n")
    #!pip install tqdm -q
    from tqdm.notebook import tqdm
    # 5 progress steps: repos, imports, pronunciation dict, HiFi-GAN, Tacotron2.
    with tqdm(total=5, leave=False) as pbar:
        import os
        from os.path import exists, join, basename, splitext
        #!pip install resampy
        #!pip install git+https://github.com/wkentaro/gdown.git
        git_repo_url = 'https://github.com/justinjohn0306/TTS-TT2.git'
        project_name = splitext(basename(git_repo_url))[0]
        if not exists(project_name):
            # clone and install
            !git clone -q --recursive {git_repo_url}
            !git clone -q --recursive https://github.com/justinjohn0306/hifi-gan
            #!pip install -q unidecode
        pbar.update(1) # downloaded TT2 and HiFi-GAN
        # Make both clones importable as plain modules.
        import sys
        sys.path.append('hifi-gan')
        sys.path.append(project_name)
        import time
        import matplotlib
        import matplotlib.pylab as plt
        import gdown
        # Base URL for downloading checkpoints from Google Drive by file id.
        d = 'https://drive.google.com/uc?id='
        %matplotlib inline
        import IPython.display as ipd
        import numpy as np
        import torch
        import json
        # Project-local imports resolved from the TTS-TT2 / hifi-gan clones above.
        from hparams import create_hparams
        from model import Tacotron2
        from layers import TacotronSTFT
        from audio_processing import griffin_lim
        from text import text_to_sequence
        from env import AttrDict
        from meldataset import mel_spectrogram, MAX_WAV_VALUE
        from models import Generator
        from denoiser import Denoiser
        import resampy
        import scipy.signal
        pbar.update(1) # initialized Dependancies
        # Plot size in pixels; divided by 100 below to get matplotlib inches.
        graph_width = 900
        graph_height = 360
def plot_data(data, figsize=(int(graph_width/100), int(graph_height/100))): | |
%matplotlib inline | |
fig, axes = plt.subplots(1, len(data), figsize=figsize) | |
for i in range(len(data)): | |
axes[i].imshow(data[i], aspect='auto', origin='lower', | |
interpolation='none', cmap='inferno') | |
fig.canvas.draw() | |
plt.show() | |
# Setup Pronounciation Dictionary | |
!wget 'https://github.com/justinjohn0306/tacotron2/releases/download/assets/merged.dict.txt' | |
thisdict = {} | |
for line in reversed((open('merged.dict.txt', "r").read()).splitlines()): | |
thisdict[(line.split(" ",1))[0]] = (line.split(" ",1))[1].strip() | |
pbar.update(1) # Downloaded and Set up Pronounciation Dictionary | |
def ARPA(text, punctuation=r"!?,.;", EOS_Token=True, dictionary=None):
    """Convert known words in `text` to {ARPAbet} tokens.

    Each space-separated word has trailing punctuation peeled off, is looked
    up (uppercased) in the pronunciation dictionary, and — when found — is
    replaced by "{ARPABET}"; unknown words are left as-is. Punctuation is
    re-attached and, when EOS_Token is set, a trailing ";" is ensured.

    dictionary: mapping of UPPERCASE word -> ARPAbet string; defaults to the
    module-level `thisdict` (new backward-compatible parameter).
    """
    if dictionary is None:
        dictionary = thisdict
    out = ''
    for word_ in text.split(" "):
        word = word_
        end_chars = ''
        # Peel trailing punctuation so the bare word can be looked up;
        # keep it in `end_chars` to re-attach afterwards.
        while any(ch in word for ch in punctuation) and len(word) > 1:
            if word[-1] in punctuation:
                end_chars = word[-1] + end_chars
                word = word[:-1]
            else:
                break  # punctuation is internal (e.g. "don't") — stop peeling
        try:
            word = "{" + str(dictionary[word.upper()]) + "}"
        except KeyError:
            pass  # unknown word: keep the original spelling
        out = (out + " " + word + end_chars).strip()
    # FIX: guard `out` so empty input returns "" instead of IndexError.
    if EOS_Token and out and out[-1] != ";":
        out += ";"
    return out
def get_hifigan(MODEL_ID, conf_name):
    """Download a HiFi-GAN vocoder checkpoint and load it onto the GPU.

    MODEL_ID: sentinel 1 -> bundled super-resolution checkpoint;
    "universal" -> universal vocoder; anything else is treated as a
    Google Drive file id and fetched with gdown.
    conf_name: JSON config name inside the hifi-gan repo clone.
    Returns (generator, config AttrDict, denoiser).
    """
    # Download HiFi-GAN
    hifigan_pretrained_model = 'hifimodel_' + conf_name
    #gdown.download(d+MODEL_ID, hifigan_pretrained_model, quiet=False)
    if MODEL_ID == 1:
        # Super-resolution model, used via get_hifigan(1, "config_32k").
        !wget "https://github.com/justinjohn0306/tacotron2/releases/download/assets/Superres_Twilight_33000" -O $hifigan_pretrained_model
    elif MODEL_ID == "universal":
        !wget "https://github.com/justinjohn0306/tacotron2/releases/download/assets/g_02500000" -O $hifigan_pretrained_model
    else:
        gdown.download(d+MODEL_ID, hifigan_pretrained_model, quiet=False)
    # Load HiFi-GAN
    conf = os.path.join("hifi-gan", conf_name + ".json")
    with open(conf) as f:
        json_config = json.loads(f.read())
    h = AttrDict(json_config)
    torch.manual_seed(h.seed)
    hifigan = Generator(h).to(torch.device("cuda"))
    state_dict_g = torch.load(hifigan_pretrained_model, map_location=torch.device("cuda"))
    hifigan.load_state_dict(state_dict_g["generator"])
    hifigan.eval()
    # Weight norm is a training-time wrapper; removing it speeds inference.
    hifigan.remove_weight_norm()
    denoiser = Denoiser(hifigan, mode="normal")
    return hifigan, h, denoiser
# Download character HiFi-GAN (22 kHz) used for the main vocoding pass.
hifigan, h, denoiser = get_hifigan(HIFIGAN_ID, "config_v1")
# Download super-resolution HiFi-GAN (sentinel id 1, 32 kHz config).
hifigan_sr, h2, denoiser_sr = get_hifigan(1, "config_32k")
pbar.update(1) # Downloaded and Set up HiFi-GAN
def has_MMI(STATE_DICT):
    """Return True when any key in STATE_DICT contains "mi." (an MMI model)."""
    for key in STATE_DICT.keys():
        if "mi." in key:
            return True
    return False
def get_Tactron2(MODEL_ID):
    """Download a Tacotron2 checkpoint from Google Drive and load it on GPU.

    NOTE(review): keeps the original "Tactron2" spelling — both call sites
    in this file use it, so renaming would break them.
    Returns (model in eval/half precision on CUDA, hparams).
    """
    # Download Tacotron2
    tacotron2_pretrained_model = 'MLPTTS'
    gdown.download(d+MODEL_ID, tacotron2_pretrained_model, quiet=False)
    if not exists(tacotron2_pretrained_model):
        raise Exception("Tacotron2 model failed to download!")
    # Load Tacotron2 and Config
    hparams = create_hparams()
    hparams.sampling_rate = 22050
    hparams.max_decoder_steps = 3000 # Max Duration
    hparams.gate_threshold = 0.25 # Model must be 25% sure the clip is over before ending generation
    model = Tacotron2(hparams)
    state_dict = torch.load(tacotron2_pretrained_model)['state_dict']
    if has_MMI(state_dict):
        raise Exception("ERROR: This notebook does not currently support MMI models.")
    model.load_state_dict(state_dict)
    # half() keeps VRAM usage down; inference only, so eval mode is set.
    _ = model.cuda().eval().half()
    return model, hparams
# Load the Tacotron2 checkpoint for the chosen emotion, and remember which
# id is loaded so a later re-run can detect a change and reload.
model, hparams = get_Tactron2(TACOTRON2_ID)
previous_tt2_id = TACOTRON2_ID
pbar.update(1) # Downloaded and Set up Tacotron2
# Extra Info
def end_to_end_infer(text, pronounciation_dictionary, show_graphs):
    """Synthesize each non-empty line of `text` and display playable audio.

    pronounciation_dictionary: when falsy, only a trailing ";" is ensured on
    the raw text; when truthy, the line is run through ARPA() first. (The
    caller below passes `not pronounciation_dictionary`, inverting the
    Colab checkbox.)
    show_graphs: also plot the mel spectrogram and attention alignment.
    Side effects: displays an IPython audio widget per line; no return value.
    """
    for i in [x for x in text.split("\n") if len(x)]:
        if not pronounciation_dictionary:
            # Ensure a terminating ";" so the gate reliably ends generation.
            if i[-1] != ";": i=i+";"
        else: i = ARPA(i)
        with torch.no_grad(): # save VRAM by not including gradients
            # Text -> token ids -> (1, T) long tensor on GPU.
            sequence = np.array(text_to_sequence(i, ['english_cleaners']))[None, :]
            sequence = torch.autograd.Variable(torch.from_numpy(sequence)).cuda().long()
            mel_outputs, mel_outputs_postnet, _, alignments = model.inference(sequence)
            if show_graphs:
                plot_data((mel_outputs_postnet.float().data.cpu().numpy()[0],
                           alignments.float().data.cpu().numpy()[0].T))
            # Vocode the mel with the character HiFi-GAN, then denoise.
            y_g_hat = hifigan(mel_outputs_postnet.float())
            audio = y_g_hat.squeeze()
            audio = audio * MAX_WAV_VALUE
            audio_denoised = denoiser(audio.view(1, -1), strength=35)[:, 0]
            # Resample to 32k
            audio_denoised = audio_denoised.cpu().numpy().reshape(-1)
            # Normalization factor is divided back out after mixing below.
            normalize = (MAX_WAV_VALUE / np.max(np.abs(audio_denoised))) ** 0.9
            audio_denoised = audio_denoised * normalize
            wave = resampy.resample(
                audio_denoised,
                h.sampling_rate,
                h2.sampling_rate,
                filter="sinc_window",
                window=scipy.signal.windows.hann,
                num_zeros=8,
            )
            wave_out = wave.astype(np.int16)
            # HiFi-GAN super-resolution
            wave = wave / MAX_WAV_VALUE
            wave = torch.FloatTensor(wave).to(torch.device("cuda"))
            new_mel = mel_spectrogram(
                wave.unsqueeze(0),
                h2.n_fft,
                h2.num_mels,
                h2.sampling_rate,
                h2.hop_size,
                h2.win_size,
                h2.fmin,
                h2.fmax,
            )
            y_g_hat2 = hifigan_sr(new_mel)
            audio2 = y_g_hat2.squeeze()
            audio2 = audio2 * MAX_WAV_VALUE
            # NOTE(review): this uses `denoiser` (22 kHz vocoder) here, even
            # though `denoiser_sr` exists — possibly intended; confirm.
            audio2_denoised = denoiser(audio2.view(1, -1), strength=35)[:, 0]
            # High-pass filter, mixing and denormalizing
            audio2_denoised = audio2_denoised.cpu().numpy().reshape(-1)
            # Keep only the band above ~10.5 kHz from the super-res pass.
            b = scipy.signal.firwin(
                101, cutoff=10500, fs=h2.sampling_rate, pass_zero=False
            )
            y = scipy.signal.lfilter(b, [1.0], audio2_denoised)
            y *= superres_strength # user-tunable level of the added HF band
            y_out = y.astype(np.int16)
            # Pad the HF band to the base waveform's length, then mix.
            y_padded = np.zeros(wave_out.shape)
            y_padded[: y_out.shape[0]] = y_out
            sr_mix = wave_out + y_padded
            sr_mix = sr_mix / normalize
            print("")
            ipd.display(ipd.Audio(sr_mix.astype(np.int16), rate=h2.sampling_rate))
from IPython.display import clear_output
clear_output()
# Mark setup as done so re-running the cell skips the except-NameError path.
initialized = "Ready"
# If a different emotion/checkpoint was selected since the last run, reload.
if previous_tt2_id != TACOTRON2_ID:
    print("Updating Models")
    model, hparams = get_Tactron2(TACOTRON2_ID)
    hifigan, h, denoiser = get_hifigan(HIFIGAN_ID, "config_v1")
    previous_tt2_id = TACOTRON2_ID
pronounciation_dictionary = False #@param {type:"boolean"}
# disables automatic ARPAbet conversion, useful for inputting your own ARPAbet pronounciations or just for testing
show_graphs = False #@param {type:"boolean"}
max_duration = 20 #@param {type:"integer"}
# Scales seconds to decoder steps; ~80 steps/second here — TODO confirm ratio.
model.decoder.max_decoder_steps = max_duration * 80
stop_threshold = 0.5 #@param {type:"number"}
model.decoder.gate_threshold = stop_threshold
superres_strength = 10 #@param {type:"number"}
#@markdown ---
print(f"Current Config:\npronounciation_dictionary: {pronounciation_dictionary}\nshow_graphs: {show_graphs}\nmax_duration (in seconds): {max_duration}\nstop_threshold: {stop_threshold}\nsuperres_strength: {superres_strength}\n\n")
time.sleep(1)
print("Enter/Paste your text.")
contents = []
# Interactive loop: synthesize each entered line until EOF or Ctrl-C.
while True:
    try:
        print("-"*50)
        line = input("Enter your text here: ")
        if line == "":
            continue  # ignore blank entries, prompt again
        end_to_end_infer(line, not pronounciation_dictionary, show_graphs)
        #Emotion=input("select your emotion")
    except EOFError:
        break
    except KeyboardInterrupt:
        print("Stopping...")
        break