Spaces:

pytorch
/

Tacotron2

Running

App Files Files Community

Tacotron2 / app.py

akhaliq HF Staff

Update app.py

cf4af18 almost 4 years ago

raw

history blame contribute delete

2.28 kB

	import torch
	import torchaudio
	import gradio as gr
	import matplotlib.pyplot as plt

	# All models in this script are placed on this device.
	device = "cpu"

	# torchaudio pipeline bundle; only its text processor and Tacotron2 model
	# are taken from it here.
	bundle = torchaudio.pipelines.TACOTRON2_WAVERNN_PHONE_LJSPEECH

	# Converts raw input text into the (tokens, lengths) pair consumed by
	# Tacotron2 in `inference`.
	processor = bundle.get_text_processor()

	# Text -> spectrogram model, moved to the target device.
	tacotron2 = bundle.get_tacotron2()
	tacotron2 = tacotron2.to(device)

	# WaveGlow vocoder setup.
	#
	# Workaround for a checkpoint that is mapped to GPU: build the network
	# without pretrained weights, then fetch the checkpoint ourselves and remap
	# its tensors onto `device` while loading.
	# See https://stackoverflow.com/a/61840832
	_WAVEGLOW_CHECKPOINT_URL = "https://api.ngc.nvidia.com/v2/models/nvidia/waveglowpyt_fp32/versions/1/files/nvidia_waveglowpyt_fp32_20190306.pth"  # noqa: E501

	waveglow = torch.hub.load(
	    "NVIDIA/DeepLearningExamples:torchhub",
	    "nvidia_waveglow",
	    model_math="fp32",
	    pretrained=False,
	)
	checkpoint = torch.hub.load_state_dict_from_url(
	    _WAVEGLOW_CHECKPOINT_URL,
	    map_location=device,
	    progress=False,
	)

	# Strip every "module." occurrence from the parameter names so they match
	# the names expected by the freshly built model.
	state_dict = {
	    name.replace("module.", ""): tensor
	    for name, tensor in checkpoint["state_dict"].items()
	}
	waveglow.load_state_dict(state_dict)

	# Drop weight normalisation, move to the target device and switch to
	# evaluation mode.
	waveglow = waveglow.remove_weightnorm(waveglow).to(device)
	waveglow.eval()

	def inference(text):
	    """Synthesize speech for ``text`` and render its spectrogram.

	    The text is converted to model inputs by the module-level ``processor``,
	    turned into a spectrogram by ``tacotron2`` and vocoded to a waveform by
	    ``waveglow``.  Both artefacts are written to fixed file names in the
	    current working directory, overwriting the output of any earlier call.

	    Args:
	        text: The sentence to synthesize.

	    Returns:
	        A ``(audio_path, image_path)`` tuple: the path of the generated WAV
	        file (saved with a 22050 Hz sample rate) and the path of the PNG
	        image of the spectrogram.
	    """
	    with torch.inference_mode():
	        processed, lengths = processor(text)
	        processed = processed.to(device)
	        lengths = lengths.to(device)
	        spec, _, _ = tacotron2.infer(processed, lengths)

	    # Draw on a dedicated figure and always close it.  Using the implicit
	    # global pyplot figure would keep every previous spectrogram alive in a
	    # long-running server and stack the images on the same axes.
	    fig, ax = plt.subplots()
	    try:
	        ax.imshow(spec[0].cpu().detach())
	        ax.axis('off')
	        fig.savefig("test.png", bbox_inches='tight')
	    finally:
	        plt.close(fig)

	    with torch.no_grad():
	        waveforms = waveglow.infer(spec)

	    # Only the first waveform of the batch is written out.
	    torchaudio.save("output_waveglow.wav", waveforms[0:1].cpu(), sample_rate=22050)
	    return "output_waveglow.wav", "test.png"

	# Static texts shown on the demo page.
	title = "TACOTRON 2"
	description = "Gradio demo for TACOTRON 2: The Tacotron 2 model for generating mel spectrograms from text. To use it, simply add your text or click on one of the examples to load them. Read more at the links below."
	# NOTE: the separator between the two links is a plain "|"; writing it as
	# "\|" is an invalid string escape and renders a stray backslash.
	article = "<p style='text-align: center'><a href='https://arxiv.org/abs/1712.05884' target='_blank'>Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions</a> | <a href='https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/SpeechSynthesis/Tacotron2' target='_blank'>Github Repo</a></p>"
	examples = [["life is like a box of chocolates"]]

	# One text input; two file outputs matching the (audio_path, image_path)
	# tuple returned by `inference`.
	gr.Interface(
	    inference,
	    "text",
	    [
	        gr.outputs.Audio(type="file", label="Audio"),
	        gr.outputs.Image(type="file", label="Spectrogram"),
	    ],
	    title=title,
	    description=description,
	    article=article,
	    examples=examples,
	).launch(enable_queue=True)