sintetica-TTS / app.py
franciscorogevallone's picture
update: return implementation to service sintetica-webpage
b3e99a7
import gradio as gr
import requests
from typing import Union
from pydub import AudioSegment
from io import BytesIO
import os
from utils import CheckLanguageIsSpanish
# HTML header passed to gr.HTML when the interface is built: the Sintética
# logo followed by the application name.
title = """
<div style="display: flex; align-items: center;">
<img src="https://i.postimg.cc/NGVZL3bg/logo-sintetica.png" alt="sintetica_logo" width="40" height="40" style="margin-right: 10px;">
<h1 style="font-size: 24px;">Sintética: Text-to-Speech (TTS)</h1>
</div>
"""
# HTML snippet passed to gr.Markdown after the other components are declared:
# a short credit line plus links (with logos) to the Sintética and MecanTronic
# websites.
description = """
<div style="font-size: 16px;">
Sintética: Sistema text-to-speech (TTS) desarrollado por MecanTronic S.A.
Para más información:
</div>
<div style="display: flex; flex-direction: column;">
<a href="https://sintetica.com.ar/" style="display: flex; align-items: center; margin-bottom: 5px; font-size:16px;">
<img src="https://i.postimg.cc/NGVZL3bg/logo-sintetica.png" alt="sintetica_logo" width="20" height="20" style="margin-right: 10px">
Sintética
</a>
<a href="https://mecantronic.com.ar/" style="display: flex; align-items: center; font-size:16px;">
<img src="https://i.postimg.cc/X7Cm5sD9/logo-MEC.png" alt="mecantronic_logo" width="20" height="20" style="margin-right: 10px;">
MecanTronic
</a>
</div>
"""
# Voice catalogue: display name offered in the "Hablante" dropdown -> speaker
# identifier sent to the inference service in the request's "speaker" field.
# Insertion order matters: the first entry is the dropdown's default value.
speaker_mapping = dict(
    (
        ("Micaela", "arf_00295"),
        ("Florencia", "arf_02121"),
        ("Rocío", "arf_02484"),
        ("Pedro", "arm_03397"),
        ("Pablo", "arm_06136"),
        ("Juan", "arm_05223"),
    )
)
def process(text: str, speaker: str, speed: Union[int, float]):
    """Synthesize ``text`` with the selected voice and return an audio widget.

    Args:
        text: Text to synthesize; it is validated with ``check_errors``.
        speaker: Display name of the voice; must be a key of
            ``speaker_mapping``.
        speed: Playback speed factor; its reciprocal is sent to the service
            as ``length_scale_factor``.

    Returns:
        The value returned by ``play_audio`` for the synthesized audio, or
        ``None`` when ``check_errors`` reports a problem with ``text``.

    Raises:
        KeyError: If ``speaker`` is not a key of ``speaker_mapping``.
        ZeroDivisionError: If ``speed`` is zero.
    """
    # The voice is resolved before the text is validated, so an unknown
    # speaker fails before any validation warning is emitted.
    speaker_id = speaker_mapping[speaker]

    if check_errors(text):
        return None

    payload = {
        "text": text,
        "speaker": speaker_id,
        "pitch_scale_factor": 0.95,
        "length_scale_factor": 1 / speed,
        "output_samplerate": 44100,
    }
    return play_audio(post_request(payload))
def post_request(request: dict, timeout: float = 120.0):
    """POST a synthesis request to the inference service and return its body.

    The service URL is read from the ``URL_INFERENCE`` environment variable.

    Args:
        request: JSON-serialisable payload describing the synthesis job.
        timeout: Seconds to wait for the service before giving up. Without a
            timeout a stalled service would block the calling worker forever.

    Returns:
        A ``BytesIO`` wrapping the raw bytes of the response body.

    Raises:
        RuntimeError: If ``URL_INFERENCE`` is unset or empty.
        requests.exceptions.Timeout: If the service does not answer in time.
        requests.exceptions.HTTPError: If the service answers with a 4xx/5xx
            status.
    """
    url = os.environ.get("URL_INFERENCE")
    if not url:
        # Fail with an actionable message instead of letting requests choke
        # on a ``None`` URL.
        raise RuntimeError(
            "The URL_INFERENCE environment variable is not set; "
            "cannot reach the inference service."
        )

    headers = {'Content-Type': 'application/json'}
    response = requests.post(url, json=request, headers=headers, timeout=timeout)
    # Surface HTTP errors here rather than handing an error page to the audio
    # decoder, where it would fail with a misleading decoding error.
    response.raise_for_status()
    return BytesIO(response.content)
def play_audio(audio_bytes: BytesIO):
    """Decode the service response and wrap it, as WAV data, in a ``gr.Audio``.

    Args:
        audio_bytes: In-memory audio file as returned by the inference
            service.

    Returns:
        A ``gr.Audio`` component constructed from the WAV-encoded bytes.
    """
    audio = AudioSegment.from_file(audio_bytes)
    # export() without a target returns an open file handle; read it inside a
    # ``with`` block so the handle is closed right away instead of lingering
    # until garbage collection.
    with audio.export(format="wav") as wav_file:
        wav_data = wav_file.read()
    return gr.Audio(wav_data)
def check_errors(text):
    """Validate ``text`` before synthesis, warning the user about each problem.

    Both checks always run, so a single call can emit two warnings.

    Args:
        text: The text the user asked to synthesize.

    Returns:
        ``True`` if at least one check failed, ``False`` otherwise.
    """
    too_long = len(text) > 330
    if too_long:
        gr.Warning(f'La longitud del texto ({len(text)} caracteres) sobrepasa el máximo permitido.')

    # NOTE(review): the warning fires when detect_english() returns a falsy
    # value; confirm against utils.CheckLanguageIsSpanish that this polarity
    # is intended.
    language_rejected = not CheckLanguageIsSpanish().detect_english(text)
    if language_rejected:
        gr.Warning('El texto está en Ingles o posee oraciones en dicho idioma.')

    return too_long or language_rejected
# Build the Gradio interface. Components are declared in display order.
# NOTE(review): this section reached review with its indentation stripped;
# the nesting below (inputs inside the tab; buttons, output and footer
# directly under the Blocks root) is a reconstruction. Confirm it against the
# deployed layout.
demo = gr.Blocks()
with demo:
    # Page header (logo + application name).
    gr.HTML(title)
    # Language selector with a single option; its value is not wired to any
    # event in this section.
    language_choices = ["Español"]
    language = gr.Radio(
        choices=language_choices,
        value=language_choices[0],
        label="Idioma",
        info="Próximamente nuevos idiomas disponibles"
    )
    with gr.Tabs():
        with gr.TabItem("Opciones del TTS"):
            # Text to synthesize; the label states the 330-character limit
            # that check_errors enforces.
            input_text = gr.Textbox(
                label="330 caracteres como máximo",
                type="text",
                lines=3,
                placeholder="Ingresa aquí el texto"
            )
            # Voice selector: offers the display names from speaker_mapping
            # and defaults to the first one.
            input_speaker = gr.Dropdown(
                label="Hablante",
                choices=speaker_mapping.keys(),
                type="value",
                value=list(speaker_mapping.keys())[0]
            )
            # Speed factor between 0.5 and 1.5; process() sends its
            # reciprocal as "length_scale_factor".
            input_speed = gr.Slider(
                minimum=0.5,
                maximum=1.5,
                value=1,
                step=0.05,
                label="Velocidad de reproducción",
            )
    with gr.Row():
        input_button = gr.Button("Procesar", variant="primary")
        # Clears only the text box; speaker and speed keep their values.
        clear_button = gr.ClearButton([input_text], value="Limpiar")
    # Fixed-height spacer between the buttons and the audio player.
    gr.HTML('<div style="height: 15px;"></div>')
    output_audio = gr.Audio(
        type="filepath",
        label="Audio sintetizado"
    )
    # Run process() on click and show its result in the audio player.
    input_button.click(
        process,
        inputs=[
            input_text,
            input_speaker,
            input_speed,
        ],
        outputs=[
            output_audio,
        ],
    )
    # Footer with credits and links.
    gr.Markdown(description)
# Launch the Gradio app only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    demo.launch()