Spaces:

AlexK-PL
/

vits-v2-8khz-inference

Runtime error

App Files Files Community

vits-v2-8khz-inference / app.py

AlexK-PL

Update app.py

523eb17 verified 10 months ago

raw

history blame contribute delete

8.29 kB

	import tempfile
	import subprocess
	import time

	from typing import Optional
	from AinaTheme import AinaGradioTheme
	import gradio as gr
	import numpy as np
	import torch
	import os
	from TTS.utils.synthesizer import Synthesizer

	from dotenv import load_dotenv

	torch.manual_seed(0)
	np.random.seed(0)

	import json
	from copy import deepcopy

	import numpy as np
	import torch

	import torchaudio
	import torchaudio.transforms as T

	import random

	random.seed(0)
	torch.manual_seed(0)
	np.random.seed(0)

	# NOTE(review): not referenced anywhere else in the visible code.
	SAMPLE_RATE = 8000

	#############################################################################################################

	# Load optional overrides (MAX_INPUT_TEXT_LEN, DEFAULT_SPEAKER_ID,
	# DEFAULT_CHECKPOINT) from a .env file into the process environment.
	load_dotenv()

	MAX_INPUT_TEXT_LEN = int(os.environ.get("MAX_INPUT_TEXT_LEN", default=500))

	# Discover checkpoint files in the working directory, excluding 'speakers.pth'.
	# os.listdir() returns entries in arbitrary order, so sort by name to keep the
	# dropdown order and the fallback default checkpoint deterministic.
	model_files = sorted(
	    f for f in os.listdir(os.getcwd()) if f.endswith('.pth') and f != 'speakers.pth'
	)

	speakers_path = "speakers.pth"
	# NOTE(review): torch.load unpickles the file; only ship a trusted speakers.pth.
	speakers_list = list(torch.load(speakers_path).keys())

	default_speaker_list = speakers_list

	# Speakers are bucketed by dataset using the length of their name.
	# NOTE(review): names of length exactly 20 (and shorter than 3) land in no
	# bucket -- confirm whether one of the bounds should be inclusive.
	festcat_speakers = [s for s in speakers_list if len(s) == 3]
	google_speakers = [s for s in speakers_list if 3 < len(s) < 20]
	commonvoice_speakers = [s for s in speakers_list if len(s) > 20]

	# The second '_'-separated token of a checkpoint name selects its hop size.
	# Slicing ([1:2]) instead of indexing ([1]) keeps file names without an
	# underscore from raising IndexError; they simply match neither group.
	hop_128_checkpoints = [c for c in model_files if c.split('_')[1:2] == ["M"]]
	hop_96_checkpoints = [c for c in model_files if c.split('_')[1:2] == ["reduced"]]

	DEFAULT_SPEAKER_ID = os.environ.get("DEFAULT_SPEAKER_ID", default="pau")
	# Fall back to the last checkpoint by name; None when no checkpoint is present
	# (previously an IndexError at import time).
	DEFAULT_CHECKPOINT = os.environ.get(
	    "DEFAULT_CHECKPOINT",
	    default=model_files[-1] if model_files else None,
	)

	model_config = "config.json"  # by default 128 hop

	# model_file = model_files[0] # change this!!

	# model_path = os.path.join(os.getcwd(), model_file)
	# config_path = os.path.join(os.getcwd(), "config.json")

	# vocoder_path = None
	# vocoder_config_path = None

	# synthesizer = Synthesizer(
	# model_path, config_path, speakers_path, None, vocoder_path, vocoder_config_path,
	# )


	def get_phonetic_transcription(text: str) -> Optional[str]:
	    """Return the IPA transcription of *text* produced by ``espeak-ng`` (Catalan voice).

	    Args:
	        text: Text to transcribe. It is passed as a single argument (no shell).

	    Returns:
	        The stripped standard output of ``espeak-ng``, or ``None`` when the
	        command cannot be started or exits with a non-zero status.
	    """
	    # '-q' suppresses audio playback (only the phoneme output is wanted) and
	    # '--' ends option parsing so user text starting with '-' is never
	    # interpreted as an espeak-ng option.
	    command = ['espeak-ng', '-q', '--ipa', '-v', 'ca', '--', text]
	    try:
	        result = subprocess.run(
	            command,
	            stdout=subprocess.PIPE,
	            stderr=subprocess.PIPE,
	            text=True,
	            check=True,
	        )
	    except (subprocess.CalledProcessError, OSError, ValueError) as e:
	        # OSError covers a missing espeak-ng binary; ValueError covers
	        # arguments that cannot be passed to exec (e.g. embedded NUL bytes).
	        print(f"An error occurred: {e}")
	        return None
	    return result.stdout.strip()


	def tts_inference(text: str, speaker_idx: Optional[str] = None, model_file: Optional[str] = None):
	    """Synthesize *text* with the selected checkpoint and return a WAV file path.

	    Args:
	        text: Text to synthesize.
	        speaker_idx: Speaker name forwarded to ``Synthesizer.tts``.
	        model_file: Checkpoint file name, resolved against the current
	            working directory.

	    Returns:
	        Path of a temporary ``.wav`` file containing the audio. The file is
	        created with ``delete=False`` and is therefore not removed here.

	    Raises:
	        ValueError: If ``model_file`` is ``None`` or empty.
	    """
	    if not model_file:
	        raise ValueError("model_file is required")

	    model_path = os.path.join(os.getcwd(), model_file)
	    speakers_file_path = "speakers.pth"

	    # Checkpoints whose second '_'-separated token is "reduced" use the
	    # hop-96 config; everything else (including "M") uses the default one.
	    # Slicing avoids an IndexError for names without an underscore.
	    variant = model_file.split('_')[1:2]
	    config_path = "config_hop_96.json" if variant == ["reduced"] else "config.json"

	    vocoder_path = None
	    vocoder_config_path = None

	    # NOTE(review): the model is loaded from disk on every request.
	    synthesizer = Synthesizer(model_path, config_path, speakers_file_path, None,
	                              vocoder_path, vocoder_config_path)

	    t1 = time.time()
	    wavs = synthesizer.tts(text, speaker_idx)

	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
	        synthesizer.save_wav(wavs, fp)
	        # Log how long synthesis and saving took, in seconds.
	        print(round(time.time() - t1, 2))
	        output_audio = fp.name

	    return output_audio


	# Page heading and Markdown instructions rendered via gr.Markdown in the UI below.
	title = "🗣️ Catalan Multispeaker TTS Tester 🗣️"
	description = """
	1️⃣ Enter the text to synthesize.
	2️⃣ Select a voice from the dropdown menu.
	3️⃣ Enjoy!
	"""


	def submit_input(input_, speaker_id, model_chkpt):
	    """Handle a click on "Submit": synthesize the text and transcribe it.

	    Args:
	        input_: Text typed by the user (may be ``None``).
	        speaker_id: Speaker selected in the dropdown.
	        model_chkpt: Checkpoint file name selected in the dropdown.

	    Returns:
	        ``(audio_path, phonetic_transcription)``; both are ``None`` when the
	        input is missing, blank, or longer than ``MAX_INPUT_TEXT_LEN``.
	    """
	    if input_ is None or not input_.strip():
	        # Previously a missing input was reported as "exceeds the limit".
	        gr.Warning("Please enter some text to synthesize.")
	        return None, None

	    # Text of exactly MAX_INPUT_TEXT_LEN characters does not exceed the limit.
	    if len(input_) > MAX_INPUT_TEXT_LEN:
	        gr.Warning(f"Your text exceeds the {MAX_INPUT_TEXT_LEN}-character limit.")
	        return None, None

	    output_audio = tts_inference(input_, speaker_id, model_chkpt)
	    output_phonetic = get_phonetic_transcription(input_)
	    return output_audio, output_phonetic


	def change_interactive(text):
	    """Return a Gradio update enabling the target only when *text* is non-blank.

	    Args:
	        text: Current textbox value; ``None`` is treated like an empty string
	            instead of raising ``AttributeError``.

	    Returns:
	        ``gr.update(interactive=True)`` if *text* contains any non-whitespace
	        character, otherwise ``gr.update(interactive=False)``.
	    """
	    has_text = bool(text and text.strip())
	    return gr.update(interactive=has_text)


	def clean():
	    """Return a pair of ``None`` values, one per output to reset.

	    NOTE(review): the only visible wiring for this callback is commented out
	    and lists three outputs, while two values are returned here.
	    """
	    cleared = (None,) * 2
	    return cleared


	# Build the Gradio UI: a left panel with the text input, speaker/checkpoint
	# selectors and action buttons, and a right panel with the outputs.
	with gr.Blocks(**AinaGradioTheme().get_kwargs()) as app:
	    gr.Markdown(f"<h1 style='text-align: center; margin-bottom: 1rem'>{title}</h1>")
	    gr.Markdown(description)

	    with gr.Row(equal_height=False):

	        with gr.Column(variant='panel'):
	            input_ = gr.Textbox(
	                label="Text",
	                value="Introdueix el text a sintetitzar.",
	                lines=4
	            )

	            dataset = gr.Radio(["All", "Festcat", "Google TTS", "CommonVoice"], label="Speakers Dataset",
	                               value="All")

	            def update_speaker_list(dataset):
	                """Return a dropdown update listing the speakers of *dataset*."""
	                print("Updating speaker list based on dataset:", dataset)
	                speakers_by_dataset = {
	                    "Festcat": festcat_speakers,
	                    "Google TTS": google_speakers,
	                    "CommonVoice": commonvoice_speakers,
	                }
	                current_speakers = speakers_by_dataset.get(dataset, speakers_list)
	                # An empty group must not raise IndexError; leave nothing selected.
	                selected = current_speakers[0] if current_speakers else None
	                return gr.update(choices=current_speakers, value=selected)

	            def update_checkpoint_list(model_hop):
	                """Return a dropdown update listing the checkpoints for *model_hop*."""
	                print("Updating checkpoint list based on model config:", model_hop)
	                checkpoints_by_hop = {
	                    "hop_size_128": hop_128_checkpoints,
	                    "hop_size_96": hop_96_checkpoints,
	                }
	                current_checkpoints = checkpoints_by_hop.get(model_hop, model_files)
	                # An empty group must not raise IndexError; leave nothing selected.
	                selected = current_checkpoints[0] if current_checkpoints else None
	                return gr.update(choices=current_checkpoints, value=selected)

	            speaker_id = gr.Dropdown(label="Select a voice", choices=speakers_list, value=DEFAULT_SPEAKER_ID,
	                                     interactive=True)
	            dataset.change(fn=update_speaker_list, inputs=dataset, outputs=speaker_id)

	            model_hop = gr.Radio(["hop_size_128", "hop_size_96"], label="Model Type", value="hop_size_128")

	            model_chkpt = gr.Dropdown(label="Select a checkpoint", choices=model_files, value=DEFAULT_CHECKPOINT,
	                                      interactive=True)

	            model_hop.change(fn=update_checkpoint_list, inputs=model_hop, outputs=model_chkpt)

	            with gr.Row():
	                clear_btn = gr.ClearButton(value='Clean', components=[input_])
	                submit_btn = gr.Button(
	                    "Submit",
	                    variant="primary",
	                )

	        with gr.Column(variant='panel'):
	            output_audio = gr.Audio(label="Output", type="filepath", autoplay=True, show_share_button=False)

	            # 'readonly' is not a Textbox parameter; 'interactive=False' is the
	            # supported way to make the field display-only.
	            output_phonetic = gr.Textbox(label="Phonetic Transcription", interactive=False)

	    # Enable the submit button only while the text box holds non-blank text.
	    input_.change(fn=change_interactive, inputs=[input_], outputs=submit_btn)

	    submit_btn.click(fn=submit_input, inputs=[input_, speaker_id, model_chkpt],
	                     outputs=[output_audio, output_phonetic])

	# Serve one request at a time and keep the API closed.
	app.queue(concurrency_count=1, api_open=False)
	app.launch(show_api=False, server_name="0.0.0.0", server_port=7860)