from tts import TTS_object
import soundfile as sf
import gradio as gr
import random
import torch
import os
# Preparing paths
BASE_PATH = os.path.dirname(__file__)
MODEL_PATH_v1 = os.path.join(BASE_PATH, "xtts_v1.1")
MODEL_PATH_v2 = os.path.join(BASE_PATH, "xtts_v2")
OUTPUTS_PATH = os.path.join(BASE_PATH, "outputs")
REFERENCES_PATH = os.path.join(BASE_PATH, "references")
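# Make sure the outputs directory exists before synthesized audio is written there
# (assumption: it is not guaranteed to be created elsewhere).
os.makedirs(OUTPUTS_PATH, exist_ok=True)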
HTML_BUTTON = """
"""
# Models under comparison: display name -> TTS model loaded on the GPU.
models = {
    "xtts_v2_wajeez": TTS_object("models", torch.device('cuda:0'))
}
MODELS_COUNT = len(models)
def predict(text, speaker):
    """Synthesize `text` with every model, using the chosen speaker's reference audio."""
    reference_file_path = os.path.join(REFERENCES_PATH, speaker + ".wav")
    output_paths = []
    for model_name, model in models.items():
        wav = model.inference(text, reference_file_path)
        path = os.path.join(OUTPUTS_PATH, model_name + ".wav")
        sf.write(path, wav, 24000)
        output_paths.append(path)
    # Shuffle so listeners cannot tell which model produced which clip.
    random.shuffle(output_paths)
    actual_models = '\n'.join(
        f"- Model {i + 1} is {os.path.splitext(os.path.basename(path))[0]}"
        for i, path in enumerate(output_paths)
    )
    return (text, *output_paths, HTML_BUTTON.format(actual_models))
# Speakers list: each entry must have a matching <speaker>.wav reference file in REFERENCES_PATH.
speakers = ["Wajeez"]
playground = gr.Interface(
    fn=predict,
    inputs=[
        gr.Textbox(
            value="مرحبا كيف حالك؟",
            label="Input text",
            info="One or two sentences at a time works best. Up to 200 characters."
        ),
        gr.Dropdown(
            speakers,
            value="Wajeez",
            label="Speaker / Reference source",
            info="Choose your speaker or choose to upload / record a new speaker."
        ),
    ],
    outputs=[
        gr.Textbox(
            label="Synthesized text",
            info="The input text after preprocessing (if any)."
        )
    ] + [gr.Audio(label=f"Model {i + 1}", type="filepath") for i in range(MODELS_COUNT)] + [gr.HTML()],
    cache_examples=False,
    allow_flagging='never'
)
playground.launch()