Spaces:

sudip1310
/

BANAO-Task2-Text-to-speech

Runtime error

App Files Files Community

BANAO-Task2-Text-to-speech / app.py

sudip1310

Update app.py

436153a over 1 year ago

raw

history blame

5.33 kB

	#git clone https://github.com/Emotional-Text-to-Speech/pytorch-dc-tts
	#git clone --recursive https://github.com/Emotional-Text-to-Speech/tacotron_pytorch.git
	#cd "tacotron_pytorch/" && pip install -e .
	#mkdir trained_models

	import os

	import gdown

	# Pre-trained checkpoints hosted on Google Drive, as (file id, local path),
	# listed in the order they are downloaded.
	_CHECKPOINTS = (
	    ('1rmhtEl3N3kAfnQM6J0vDGSCCHlHLK6kw', 'trained_models/angry_dctts.pth'),
	    ('1bP0eJ6z4onr2klolzU17Y8SaNspxQjF-', 'trained_models/neutral_dctts.pth'),
	    ('1WWE9zxS3FRgD0Y5yIdNmLY9-t5gnBsNt', 'trained_models/ssrn.pth'),
	    ('1N6Ykrd1IaPiNdos_iv0J6JbY2gBDghod', 'trained_models/disgust_tacotron.pth'),
	    ('1xMGnS0vvgW703a9lGXeJNLK1G140RbNI', 'trained_models/amused_tacotron.pth'),
	    ('1D6HGWYWvhdvLWQt4uOYqdmuVO7ZVLWNa', 'trained_models/sleepiness_tacotron.pth'),
	)

	# The `mkdir trained_models` setup step is commented out above, so make sure
	# the destination directory exists before anything is written into it.
	os.makedirs('trained_models', exist_ok=True)

	for _file_id, output in _CHECKPOINTS:
	    # Always use the direct-download `uc?id=` form. The "amused" checkpoint
	    # was previously requested through its `/file/d/<id>/view?usp=share_link`
	    # page URL, which gdown only resolves when `fuzzy=True` is passed.
	    url = 'https://drive.google.com/uc?id=' + _file_id
	    gdown.download(url, output, quiet=False)

	# NOTE(review): the next two lines are IPython/Colab line magics, not Python
	# syntax; they are only valid when this file is executed as a notebook cell.
	# `%tensorflow_version 1.x` presumably pins the Colab runtime to TensorFlow 1.x
	# -- TODO confirm it is still required.
	%tensorflow_version 1.x
	# `%pylab inline` appears to be what brings `rcParams`, `imshow`, `xlabel` and
	# `ylabel` into scope; they are used in this file without an explicit import.
	%pylab inline
	# Default matplotlib figure size for the plots drawn further down.
	rcParams["figure.figsize"] = (10,5)

	import os
	import sys
	import numpy as np
	sys.path.append('pytorch-dc-tts/')
	sys.path.append('pytorch-dc-tts/models')
	sys.path.append("tacotron_pytorch/")
	sys.path.append("tacotron_pytorch/lib/tacotron")

	# For the DC-TTS
	import torch
	from text2mel import Text2Mel
	from ssrn import SSRN
	from audio import save_to_wav, spectrogram2wav
	from utils import get_last_checkpoint_file_name, load_checkpoint_test, save_to_png, load_checkpoint
	from datasets.emovdb import vocab, get_test_data

	# For the Tacotron
	from text import text_to_sequence, symbols
	# from util import audio

	from tacotron_pytorch import Tacotron
	from synthesis import tts as _tts

	# For Audio/Display purposes
	import librosa.display
	import IPython
	from IPython.display import Audio
	from IPython.display import display
	from google.colab import widgets
	from google.colab import output
	import warnings
	warnings.filterwarnings('ignore')


	# Inference only: switch autograd off globally before any model is built.
	torch.set_grad_enabled(False)

	# DC-TTS text-to-mel network, in eval mode. Its weights are loaded further
	# down, once the emotion has been chosen.
	text2mel = Text2Mel(vocab).eval()

	# SSRN network used by tts_dctts(); its checkpoint is loaded once, here.
	ssrn = SSRN().eval()
	load_checkpoint('trained_models/ssrn.pth', ssrn, None)

	# Tacotron network; its weights are likewise loaded per emotion further down.
	_tacotron_options = {
	    'embedding_dim': 256,
	    'mel_dim': 80,
	    'linear_dim': 1025,
	    'r': 5,
	    'padding_idx': None,
	    'use_memory_mask': False,
	}
	model = Tacotron(n_vocab=len(symbols), **_tacotron_options)

	def visualize(alignment, spectrogram, Emotion):
	    """Plot the alignment and the spectrogram in two notebook tabs.

	    Args:
	        alignment: 2-D array-like; it is transposed before being shown.
	        spectrogram: 2-D array-like; it is transposed before plotting only
	            for the emotions handled by the Tacotron branch of this script
	            ('Disgust', 'Amused', 'Sleepiness').
	        Emotion: name of the selected emotion.

	    Reads the module-level ``fs`` and ``hop_length`` values and relies on
	    the pylab names ``imshow``, ``xlabel`` and ``ylabel`` being in scope.
	    """
	    axis_fontsize = 16
	    transposed_for = ('Disgust', 'Amused', 'Sleepiness')
	    tabs = widgets.TabBar(['Alignment', 'Spectrogram'], location='top')

	    with tabs.output_to('Alignment'):
	        imshow(alignment.T, aspect="auto", origin="lower", interpolation=None)
	        xlabel("Decoder timestamp", fontsize=axis_fontsize)
	        ylabel("Encoder timestamp", fontsize=axis_fontsize)

	    with tabs.output_to('Spectrogram'):
	        to_plot = spectrogram.T if Emotion in transposed_for else spectrogram
	        librosa.display.specshow(
	            to_plot,
	            sr=fs,
	            hop_length=hop_length,
	            x_axis="time",
	            y_axis="linear",
	        )
	        xlabel("Time", fontsize=axis_fontsize)
	        ylabel("Hz", fontsize=axis_fontsize)

	def tts_dctts(text2mel, ssrn, text):
	    """Synthesize ``text`` with the DC-TTS pair ``text2mel`` + ``ssrn``.

	    Frames are generated step by step (at most 210 steps): each step feeds
	    the frames produced so far back into ``text2mel``, and generation stops
	    early once the most-attended input position of the latest frame holds
	    the 'E' (end-of-sentence) symbol.

	    Args:
	        text2mel: callable invoked as ``text2mel(L, Y, monotonic_attention=True)``.
	        ssrn: callable invoked on the generated frames.
	        text: the sentence to synthesize.

	    Returns:
	        A 3-tuple ``(spectrogram2wav(Z[0].T), A[0], Y[0])`` where ``Z`` is the
	        second output of ``ssrn``, ``A`` the attention returned by the last
	        ``text2mel`` call and ``Y`` the generated frames; the caller unpacks
	        it as ``wav, align, mel``.
	    """
	    encoded_text = torch.from_numpy(get_test_data([text], len(text)))
	    start_frame = torch.from_numpy(np.zeros((1, 80, 1), np.float32))
	    eos_index = vocab.index('E')

	    frames = start_frame
	    attn = None
	    for _step in range(210):
	        _, new_frames, attn = text2mel(encoded_text, frames, monotonic_attention=True)
	        # Re-prepend the all-zero start frame before the next step.
	        frames = torch.cat((start_frame, new_frames), -1)
	        # Input position receiving the most attention for the newest frame.
	        _, focus = torch.max(attn[0, :, -1], 0)
	        if encoded_text[0, focus.item()] == eos_index:  # EOS
	            break

	    _, upsampled = ssrn(frames)
	    frames_np = frames.cpu().detach().numpy()
	    attn_np = attn.cpu().detach().numpy()
	    upsampled_np = upsampled.cpu().detach().numpy()

	    return spectrogram2wav(upsampled_np[0, :, :].T), attn_np[0, :, :], frames_np[0, :, :]


	def tts_tacotron(model, text):
	    """Synthesize ``text`` with the Tacotron ``model``.

	    Thin wrapper around the imported ``synthesis.tts`` helper.

	    Returns:
	        The ``(waveform, alignment, spectrogram)`` triple produced by it.
	    """
	    result = _tts(model, text)
	    # Unpack explicitly so anything other than a 3-item result fails here.
	    audio_out, attn_out, spec_out = result
	    return audio_out, attn_out, spec_out

	def present(waveform, Emotion, figures=False):
	    """Display an audio player for ``waveform``, optionally preceded by plots.

	    Args:
	        waveform: audio samples handed to ``IPython.display.Audio``.
	        Emotion: name of the selected emotion, forwarded to ``visualize``.
	        figures: either ``False`` (no plots) or an indexable whose first two
	            items are the alignment and the spectrogram to plot.

	    Reads the module-level ``fs`` as the playback rate.
	    """
	    # Deliberately an equality comparison against False, as in the original.
	    wants_plots = figures != False
	    if wants_plots:
	        alignment_fig = figures[0]
	        spectrogram_fig = figures[1]
	        visualize(alignment_fig, spectrogram_fig, Emotion)

	    player = Audio(waveform, rate=fs)
	    IPython.display.display(player)


	# Upper bound on the number of decoder steps for the Tacotron model.
	model.decoder.max_decoder_steps = 200

	# Audio settings read by visualize() and present(): `fs` is passed as the
	# sample rate, `hop_length` as the spectrogram hop length.
	hop_length = 250
	fs = 20000

	#@title Select the emotion and type the text

	%pylab inline

	Emotion = "Neutral" #@param ["Neutral", "Angry", "Disgust", "Sleepiness", "Amused"]
	Text = 'I am exhausted.' #@param {type:"string"}

	wav, align, mel = None, None, None

	if Emotion == "Neutral":
	load_checkpoint('trained_models/'+Emotion.lower()+'_dctts.pth', text2mel, None)
	wav, align, mel = tts_dctts(text2mel, ssrn, Text)
	elif Emotion == "Angry":
	load_checkpoint_test('trained_models/'+Emotion.lower()+'_dctts.pth', text2mel, None)
	wav, align, mel = tts_dctts(text2mel, ssrn, Text)
	# wav = wav.T
	elif Emotion == "Disgust" or Emotion == "Amused" or Emotion == "Sleepiness":
	checkpoint = torch.load('trained_models/'+Emotion.lower()+'_tacotron.pth', map_location=torch.device('cpu'))
	model.load_state_dict(checkpoint["state_dict"])
	wav, align, mel = tts_tacotron(model, Text)

	present(wav, Emotion, (align,mel))