"""Gradio demo app for KBD VITS text-to-speech.

On import this script downloads the male and female model files (PyTorch
checkpoint, config and ONNX export), builds a Gradio interface around
``tts`` and launches it.
"""

import json
import os
import tempfile

import gradio as gr
import numpy as np
import torch
from TTS.tts.configs.vits_config import VitsConfig
from TTS.tts.models.vits import Vits, VitsCharacters
from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.tokenizer import TTSTokenizer
from TTS.utils.audio.numpy_transforms import save_wav
from TTS.utils.download import download_url
from TTS.utils.synthesizer import Synthesizer

# Input longer than this many characters is truncated before synthesis.
MAX_TXT_LEN = 800
# Directory name template; formatted with the gender ("male" / "female").
BASE_DIR = "kbd-vits-tts-{}"
# Sample rate passed to save_wav for ONNX output.
# NOTE(review): presumably matches the models' configured rate - confirm.
ONNX_SAMPLE_RATE = 24000

MALE_MODEL_URL = "https://huggingface.co/anzorq/kbd-vits-tts-male/resolve/main/checkpoint_56000.pth"
MALE_CONFIG_URL = "https://huggingface.co/anzorq/kbd-vits-tts-male/resolve/main/config_35000.json"
FEMALE_MODEL_URL = "https://huggingface.co/anzorq/kbd-vits-tts-female/resolve/main/best_model_56351.pth"
FEMALE_CONFIG_URL = "https://huggingface.co/anzorq/kbd-vits-tts-female/resolve/main/config.json"
MALE_ONNX_MODEL_URL = "https://huggingface.co/anzorq/kbd-vits-tts-male/resolve/main/onnx/kbd_vits_male.onnx"
FEMALE_ONNX_MODEL_URL = "https://huggingface.co/anzorq/kbd-vits-tts-female/resolve/main/onnx/kbd_vits_female.onnx"


def download_model_and_config(gender: str) -> str:
    """Download the checkpoint, config and ONNX model for one voice.

    Args:
        gender: ``"male"`` selects the male URLs; any other value selects
            the female URLs. The value is also used verbatim to format
            ``BASE_DIR`` into the target directory name.

    Returns:
        The directory the files were downloaded into.
    """
    dir_path = BASE_DIR.format(gender)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(dir_path, exist_ok=True)

    is_male = gender == "male"
    model_url = MALE_MODEL_URL if is_male else FEMALE_MODEL_URL
    config_url = MALE_CONFIG_URL if is_male else FEMALE_CONFIG_URL
    onnx_model_url = MALE_ONNX_MODEL_URL if is_male else FEMALE_ONNX_MODEL_URL

    # Files are stored under fixed names so tts() can locate them
    # regardless of the remote file names.
    download_url(model_url, dir_path, "model.pth")
    download_url(config_url, dir_path, "config.json")
    download_url(onnx_model_url, dir_path, "model.onnx")
    return dir_path


download_model_and_config("male")
download_model_and_config("female")


def _new_wav_path() -> str:
    """Create an empty temporary ``.wav`` file and return its path."""
    # delete=False: the path is returned to the caller, so the file must
    # survive after the handle is closed.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        return temp_file.name


def _synthesize_onnx(text: str, model_dir: str, config_file: str) -> str:
    """Synthesize ``text`` with the ONNX model in ``model_dir``; return the WAV path."""
    config = VitsConfig()
    config.load_json(config_file)

    vits = Vits.init_from_config(config)
    vits.load_onnx(f"{model_dir}/model.onnx")

    # Token ids are produced by the model's own tokenizer; [None, :] adds a
    # leading axis so the input is 2-D.
    text_inputs = np.asarray(
        vits.tokenizer.text_to_ids(text),
        dtype=np.int64,
    )[None, :]
    audio = vits.inference_onnx(text_inputs)

    out_path = _new_wav_path()
    save_wav(wav=audio[0], path=out_path, sample_rate=ONNX_SAMPLE_RATE)
    return out_path


def _synthesize_pytorch(text: str, model_dir: str, config_file: str) -> str:
    """Synthesize ``text`` with the PyTorch checkpoint in ``model_dir``; return the WAV path."""
    synthesizer = Synthesizer(f"{model_dir}/model.pth", config_file)
    wavs = synthesizer.tts(text)

    out_path = _new_wav_path()
    synthesizer.save_wav(wavs, out_path)
    return out_path


def tts(text: str, voice: str = "Male", use_onnx: bool = True) -> str:
    """Convert ``text`` to speech and return the path of the generated WAV file.

    Args:
        text: Text to synthesize. Truncated to ``MAX_TXT_LEN`` characters.
        voice: ``"Male"`` selects the male model; any other value selects
            the female model.
        use_onnx: If true, run the ONNX model; otherwise run the PyTorch
            checkpoint through ``Synthesizer``.

    Returns:
        Path to a temporary ``.wav`` file. The file is not deleted here.
    """
    if len(text) > MAX_TXT_LEN:
        text = text[:MAX_TXT_LEN]
        print(f"Input text was cutoff since it went over the {MAX_TXT_LEN} character limit.")

    model_dir = BASE_DIR.format("male" if voice == "Male" else "female")
    config_file = f"{model_dir}/config.json"

    text = text.replace("I", "ӏ")  # Replace capital "I" with "Palochka" symbol
    text = text.lower()

    if use_onnx:
        return _synthesize_onnx(text, model_dir, config_file)
    return _synthesize_pytorch(text, model_dir, config_file)


iface = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(
            label="Text",
            value="Дауэ ущыт?",
        ),
        gr.Radio(
            choices=["Male", "Female"],
            value="Male",
            label="Voice",
        ),
        gr.Checkbox(
            label="Use ONNX",
            value=True,
        ),
    ],
    outputs=gr.Audio(label="Output", type='filepath'),
    title="KBD TTS",
    live=False,
)

iface.launch(share=False)