from tts import TTS_object
import soundfile as sf
import gradio as gr
import subprocess
import random
import torch
import os
import re

# Prepare paths
BASE_PATH = os.path.dirname(__file__)
MODEL_PATH_v1 = os.path.join(BASE_PATH, "xtts_v1.1")
MODEL_PATH_v2 = os.path.join(BASE_PATH, "xtts_v2")
OUTPUTS_PATH = os.path.join(BASE_PATH, "outputs")
REFERENCES_PATH = os.path.join(BASE_PATH, "references")

# Make sure the output directory exists before any synthesized .wav is written to it.
os.makedirs(OUTPUTS_PATH, exist_ok=True)

# HTML block rendered under the audio players. The original markup is not preserved
# here, so a bare "{}" placeholder stands in for it; it receives the shuffled
# model-to-slot listing built in predict() via HTML_BUTTON.format(...).
HTML_BUTTON = """
{}
"""
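# Only one checkpoint is loaded below, even though MODEL_PATH_v1 / MODEL_PATH_v2 are
# prepared above. A minimal sketch of registering both for a side-by-side comparison,
# assuming TTS_object(checkpoint_dir, device) is the constructor signature used here
# and that both directories hold valid checkpoints:
#
#   models = {
#       "xtts_v1.1": TTS_object(MODEL_PATH_v1, torch.device("cuda:0")),
#       "xtts_v2": TTS_object(MODEL_PATH_v2, torch.device("cuda:0")),
#   }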
""" models = { "xtts_v2_wajeez": TTS_object("models", torch.device('cuda:0')) } MODELS_COUNT = len(models) def predict(text, speaker): reference_file_path = os.path.join(REFERENCES_PATH, speaker + ".wav") output_paths = [] for model_name, model in models.items(): wav = model.inference(text, reference_file_path) path = os.path.join(OUTPUTS_PATH, model_name + ".wav") sf.write(path, wav, 24000) output_paths.append(path) random.shuffle(output_paths) actual_models = '\\n'.join([f"- The model number {i + 1} is {path.split('/')[-1][:-4]}" for i, path in enumerate(output_paths)]) return (text, *output_paths, HTML_BUTTON.format(actual_models)) # Get speakers from references path to prepare the speakers list speakers = ["Wajeez"] playground = gr.Interface( fn = predict, inputs = [ gr.Textbox( value = "مرحبا كيف حالك؟", label = "Input text", info = "One or two sentences at a time is better. Up to 200 text characters." ), gr.Dropdown( speakers, value="Wajeez", label = "Speaker / Reference source", info = "Choose your speaker or choose to upload / record a new speaker." ), ], outputs = [gr.Textbox( label = "Synthesized text", info = "The text used as input after preprocessing is done (if any)." )] + [gr.components.Audio(label = f'Model {i + 1}', type = 'filepath') for i in range(MODELS_COUNT)] + [gr.HTML()], cache_examples = False, allow_flagging = 'never' ) playground.launch()