Spaces:

Yehor
/

radtts-uk-bigvgan-demo

Build error

App Files Files Community

radtts-uk-bigvgan-demo / app.py

Yehor

A fix

202457e 2 months ago

raw

history blame contribute delete

12 kB

	import os
	import sys
	import json
	import time

	from importlib.metadata import version
	from enum import Enum

	from huggingface_hub import hf_hub_download

	# Probe for the optional `spaces` package (used for ZeroGPU). The resulting
	# flag decides which inference wrapper the "Run" button is wired to.
	try:
	    import spaces  # it's for ZeroGPU
	except ImportError:
	    use_zerogpu = False
	    print("ZeroGPU is not available, skipping...")
	else:
	    use_zerogpu = True
	    print("ZeroGPU is available, changing inference call.")

	import gradio as gr

	import torch
	import torchaudio

	# BigVGAN
	import bigvgan

	# RAD-TTS code
	from radtts import RADTTS
	from data import Data
	from common import update_params

	# Decide once, at import time, where the models live. Both names are read
	# by the model-loading code and by `inference`.
	use_cuda = torch.cuda.is_available()
	device = "cuda" if use_cuda else "cpu"

	if use_cuda:
	    print("CUDA is available, setting correct inference_device variable.")


	def download_file_from_repo(
	    repo_id: str,
	    filename: str,
	    local_dir: str = ".",
	    repo_type: str = "model",
	) -> str:
	    """Download a single file from a Hugging Face Hub repository.

	    The target directory is created when it does not exist yet, then the
	    request is delegated to ``hf_hub_download``.

	    Args:
	        repo_id: Repository identifier on the Hub.
	        filename: Path of the file inside the repository.
	        local_dir: Directory handed to ``hf_hub_download`` as ``local_dir``.
	        repo_type: Repository type handed to ``hf_hub_download``.

	    Returns:
	        The path returned by ``hf_hub_download``.

	    Raises:
	        Exception: Wraps any error raised while creating the directory or
	            downloading; the original error is chained as ``__cause__``.
	    """
	    request = {
	        "repo_id": repo_id,
	        "filename": filename,
	        "local_dir": local_dir,
	        "cache_dir": None,
	        "force_download": False,
	        "repo_type": repo_type,
	    }

	    try:
	        os.makedirs(local_dir, exist_ok=True)
	        return hf_hub_download(**request)
	    except Exception as e:
	        raise Exception(f"An error occurred during download: {e}") from e


	# Fetch the RAD-TTS checkpoint at import time, using ./models/ as the
	# download directory.
	download_file_from_repo(
	    repo_id="Yehor/radtts-uk",
	    filename="radtts-pp-dap-model/model_dap_84000.pt",
	    local_dir="./models/",
	)

	# Init the model
	seed = 1234

	config = "configs/radtts-pp-dap-model.json"
	radtts_path = "models/radtts-pp-dap-model/model_dap_84000.pt"

	# Overrides applied on top of the JSON config; none are used here.
	params = []

	# Load the config. The encoding is explicit so that decoding does not depend
	# on the platform's default locale. From here on `config` is the parsed
	# dictionary rather than the path it started as.
	with open(config, encoding="utf-8") as f:
	    config = json.load(f)

	update_params(config, params)

	data_config = config["data_config"]
	model_config = config["model_config"]

	# Seed
	torch.manual_seed(seed)
	torch.cuda.manual_seed(seed)

	# Load vocoder: pretrained BigVGAN weights, weight norm removed, switched to
	# eval mode and moved to the selected device.
	vocoder_model = bigvgan.BigVGAN.from_pretrained(
	    "nvidia/bigvgan_v2_22khz_80band_fmax8k_256x",
	    use_cuda_kernel=False,
	)
	vocoder_model.remove_weight_norm()
	vocoder_model.eval()
	vocoder_model = vocoder_model.to(device)

	# Load RAD-TTS: build the model from the config, moving it to the GPU only
	# when CUDA was detected.
	radtts = RADTTS(**model_config)
	if use_cuda:
	    radtts = radtts.cuda()

	radtts.enable_inverse_cache()  # cache inverse matrix for 1x1 invertible convs

	# The checkpoint is deserialized onto the CPU; load_state_dict then copies
	# the values into the parameters the model already owns. strict=False
	# tolerates key mismatches between checkpoint and model.
	checkpoint_dict = torch.load(radtts_path, map_location="cpu")
	radtts.load_state_dict(checkpoint_dict["state_dict"], strict=False)
	radtts.eval()

	print(f"Loaded checkpoint '{radtts_path}'")

	# Dataset helper that `inference` uses for its text and speaker-id lookups.
	# The training file list is passed positionally, so both file-list entries
	# are left out of the keyword arguments.
	ignore_keys = ["training_files", "validation_files"]
	trainset_kwargs = {
	    key: value for key, value in data_config.items() if key not in ignore_keys
	}
	trainset = Data(data_config["training_files"], **trainset_kwargs)

	# Config
	# Passed to the "Run" button's click handler as its concurrency limit.
	concurrency_limit = 5

	title = "RAD-TTS++ Ukrainian"

	# Markdown table generated with https://www.tablesgenerator.com/markdown_tables
	# The cell separators must be plain pipes: a backslash before them is an
	# invalid Python escape sequence and keeps the text from being a table.
	authors_table = """
	## Authors

	Follow them on social networks and contact if you need any help or have any questions:

	| <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> Yehor Smoliakov |
	|-------------------------------------------------------------------------------------------------|
	| https://t.me/smlkw in Telegram |
	| https://x.com/yehor_smoliakov at X |
	| https://github.com/egorsmkv at GitHub |
	| https://huggingface.co/Yehor at Hugging Face |
	| or use egorsmkv@gmail.com |
	""".strip()

	# Shown at the top of the page.
	description_head = f"""
	# {title}

	## Overview

	Type your text in Ukrainian and select a voice to synthesize speech using [the RAD-TTS++ model](https://huggingface.co/Yehor/radtts-uk) and [BigVGAN v2](https://huggingface.co/nvidia/bigvgan_v2_22khz_80band_fmax8k_256x) with 22050 Hz.
	""".strip()

	# Shown below the examples.
	description_foot = f"""
	{authors_table}
	""".strip()

	tech_env = f"""
	#### Environment

	- Python: {sys.version}
	""".strip()

	tech_libraries = f"""
	#### Libraries

	- gradio: {version("gradio")}
	- torch: {version("torch")}
	- scipy: {version("scipy")}
	- numba: {version("numba")}
	- librosa: {version("librosa")}
	- unidecode: {version("unidecode")}
	- inflect: {version("inflect")}
	""".strip()


	class VoiceOption(Enum):
	    """Voices offered in the UI; each value is the label of its radio choice."""

	    Tetiana = "Tetiana (female) 👩"
	    Mykyta = "Mykyta (male) 👨"
	    Lada = "Lada (female) 👩"


	# UI label -> speaker name handed to the dataset's speaker lookup.
	# Each speaker name is the lower-cased enum member name.
	voice_mapping = {option.value: option.name.lower() for option in VoiceOption}


	# Sample inputs for the "Choose an example" widget, kept as (text, voice)
	# pairs and expanded below into the [text, label] rows the widget receives.
	_example_pairs = (
	    (
	        "Прокинувся ґазда вранці. Пішов, вичистив з-під коня, вичистив з-під бика, вичистив з-під овечок, вибрав молодняк, відніс його набік.",
	        VoiceOption.Mykyta,
	    ),
	    (
	        "Пішов взяв сіна, дав корові. Пішов взяв сіна, дав бикові. Ячміню коняці насипав. Зайшов почистив корову, зайшов почистив бика, зайшов почистив коня, за яйця його мацнув.",
	        VoiceOption.Lada,
	    ),
	    (
	        "Кінь ногою здригнув, на хазяїна ласкавим оком подивився. Тоді дядько пішов відкрив курей, гусей, качок, повиносив їм зерна, огірків нарізаних, нагодував. Коли чує – з хати дружина кличе. Зайшов. Дітки повмивані, сидять за столом, всі чекають тата. Взяв він ложку, перехрестив дітей, перехрестив лоба, почали снідати. Поснідали, він дістав пряників, роздав дітям. Діти зібралися, пішли в школу. Дядько вийшов, сів на призьбі, взяв сапку, почав мантачити. Мантачив-мантачив, коли – жінка виходить. Він їй ту сапку дає, ласкаво за сраку вщипнув, жінка до нього лагідно всміхнулася, пішла на город – сапати. Коли – йде пастух і товар кличе в череду. Повідмикав дядько овечок, коровку, бика, коня, все відпустив. Сів попри хати, дістав табАку, відірвав шмат газети, насипав, наслинив собі гарну таку цигарку. Благодать божа – і сонечко вже здійнялося над деревами. Дядько встромив цигарку в рота, дістав сірники, тільки чиркати – коли раптом з хати: Доброе утро! Московское время – шесть часов утра! Витяг дядько цигарку с рота, сплюнув набік, і сам собі каже: Ана маєш. Прокинулись, бляді!",
	        VoiceOption.Tetiana,
	    ),
	)

	examples = [
	    [sample_text, sample_voice.value] for sample_text, sample_voice in _example_pairs
	]


	def inference(text, voice):
	    """Synthesize *text* with the selected *voice* and report timing statistics.

	    Args:
	        text: Text to synthesize.
	        voice: UI label of the voice; must be a key of ``voice_mapping``.

	    Returns:
	        A two-item list: a ``gr.Audio`` pointing at the generated WAV file and
	        a summary string (real-time factor, timings, speed ratio, speech rate).

	    Raises:
	        gr.Error: If *text* is empty or consists only of whitespace.
	        KeyError: If *voice* is not a key of ``voice_mapping``.
	    """
	    import tempfile  # local import: only this function needs it

	    if not text or not text.strip():
	        raise gr.Error("Please paste your text.")

	    gr.Info("Starting...", duration=0.5)

	    speaker = voice_mapping[voice]

	    sample_rate = 22_050  # used both for saving and for the duration estimate

	    sigma = 0.8  # sampling sigma for decoder
	    sigma_tkndur = 0.666  # sampling sigma for duration
	    sigma_f0 = 1.0  # sampling sigma for f0
	    sigma_energy = 1.0  # sampling sigma for energy avg

	    token_dur_scaling = 1.0

	    f0_mean = 0
	    f0_std = 0
	    energy_mean = 0
	    energy_std = 0

	    def _place(tensor):
	        # Move inputs to the GPU only when CUDA was detected at start-up.
	        return tensor.cuda() if use_cuda else tensor

	    # The same speaker is used for the text and the attribute conditioning,
	    # so a single lookup is passed for all three speaker arguments.
	    speaker_id = _place(trainset.get_speaker_id(speaker))
	    tensor_text = _place(trainset.get_text(text))[None]

	    # Each request writes to its own file. A fixed file name would let
	    # concurrent requests overwrite each other's audio.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
	        audio_path = tmp_file.name

	    inference_start = time.perf_counter()

	    with torch.autocast(device, enabled=False), torch.inference_mode():
	        outputs = radtts.infer(
	            speaker_id,
	            tensor_text,
	            sigma,
	            sigma_tkndur,
	            sigma_f0,
	            sigma_energy,
	            token_dur_scaling,
	            token_duration_max=100,
	            speaker_id_text=speaker_id,
	            speaker_id_attributes=speaker_id,
	            f0_mean=f0_mean,
	            f0_std=f0_std,
	            energy_mean=energy_mean,
	            energy_std=energy_std,
	            use_cuda=use_cuda,
	        )

	        mel = outputs["mel"]

	        gr.Info("Synthesized MEL spectrogram, converting to WAVE.", duration=0.5)

	        wav_gen = vocoder_model(mel)
	        wav_gen_float = wav_gen.squeeze(0).cpu()

	        torchaudio.save(audio_path, wav_gen_float, sample_rate, encoding="PCM_S")

	    duration = len(wav_gen_float[0]) / sample_rate

	    elapsed_time = time.perf_counter() - inference_start
	    rtf = elapsed_time / duration

	    speed_ratio = duration / elapsed_time
	    # split() without arguments ignores repeated spaces and line breaks, so
	    # the word count is not inflated by empty fragments.
	    speech_rate = len(text.split()) / duration

	    rtf_value = f"Real-Time Factor: {round(rtf, 4)}, time: {round(elapsed_time, 4)} seconds, audio duration: {round(duration, 4)} seconds. Speed ratio: {round(speed_ratio, 2)}x. Speech rate: {round(speech_rate, 4)} words-per-second."

	    gr.Success("Finished!", duration=0.5)

	    return [gr.Audio(audio_path), rtf_value]


	# Define the GPU-decorated entry point only when the `spaces` import
	# succeeded. The flag is checked directly, rather than catching NameError, so
	# an unrelated NameError raised while decorating is not silently hidden.
	if use_zerogpu:

	    @spaces.GPU
	    def inference_zerogpu(text, voice):
	        """Run `inference` through the `spaces.GPU` decorator."""
	        return inference(text, voice)

	else:
	    print("ZeroGPU is not available, skipping...")


	def inference_cpu(text, voice):
	    """Undecorated entry point, wired to the button when ZeroGPU is not in use.

	    Delegates to `inference` and returns its result unchanged.
	    """
	    result = inference(text, voice)
	    return result


	# Page layout: header, output widgets, input widgets, run button, examples
	# and the informational footer.
	with gr.Blocks(
	    title=title,
	    analytics_enabled=False,
	    theme=gr.themes.Base(),
	) as demo:
	    gr.Markdown(description_head)

	    gr.Markdown("## Usage")

	    # Output widgets: the audio player and the timing summary.
	    with gr.Row(), gr.Column():
	        audio = gr.Audio(label="Synthesized audio")
	        rtf = gr.Markdown(
	            label="Real-Time Factor",
	            value="Here you will see how fast the model and the speaker is.",
	        )

	    # Input widgets: the text box and the voice selector.
	    with gr.Row(), gr.Column():
	        text = gr.Text(
	            label="Text",
	            value="Сл+ава Укра+їні! — українське вітання, національне гасло.",
	        )
	        voice = gr.Radio(
	            label="Voice",
	            choices=[option.value for option in VoiceOption],
	            value=VoiceOption.Tetiana.value,
	        )

	    # Use the ZeroGPU wrapper when the `spaces` package was importable.
	    run_handler = inference_zerogpu if use_zerogpu else inference_cpu
	    run_button = gr.Button("Run")
	    run_button.click(
	        run_handler,
	        inputs=[text, voice],
	        outputs=[audio, rtf],
	        concurrency_limit=concurrency_limit,
	    )

	    with gr.Row():
	        gr.Examples(
	            label="Choose an example",
	            inputs=[text, voice],
	            examples=examples,
	        )

	    gr.Markdown(description_foot)

	    gr.Markdown("### Gradio app uses:")
	    gr.Markdown(tech_env)
	    gr.Markdown(tech_libraries)

	# Script entry point: enable the request queue, then start the server.
	if __name__ == "__main__":
	    demo.queue().launch()