# Page chrome captured along with the file (not part of the program):
# last commit "Update app.py" by Staticaliza (56e4c28, verified);
# file size 3.19 kB; scan result: no virus.
from typing import cast
import gradio as gr
from balacoon_tts import TTS
from huggingface_hub import hf_hub_download, list_repo_files
import os
import io
import wave
import base64
# Shared secret a caller must present before any audio is synthesized. It is
# read from the environment; when the variable is unset this is None and every
# request is rejected by the key check in synthesize_audio.
KEY = os.environ.get("KEY")

# Longest input, in characters, that is synthesized; longer text is truncated.
# NOTE: the identifier is misspelled ("lehgth"), but other code in this file
# refers to it by this exact name, so it is intentionally left unchanged.
default_max_lehgth = 250

# Addon file requested from the "balacoon/tts" repository, and the speaker id
# pre-filled in the UI.
default_text_model = "en_us_hifi_jets_cpu.addon"
default_text_speaker = "8051"

# Import-time side effect: resolve the addon through hf_hub_download and build
# the single TTS engine that every request shares.
model_path = hf_hub_download(repo_id = "balacoon/tts", filename = default_text_model)
tts = TTS(model_path)

# Module-level placeholder. synthesize_audio assigns a *local* with the same
# name (there is no `global` statement), so this value is never updated.
base64_data = ""
def audio_to_base64(sample_rate, audio_data):
    """Encode mono 16-bit PCM samples as a base64-encoded WAV file.

    Args:
        sample_rate: Frame rate, in Hz, written to the WAV header.
        audio_data: The samples, either as an object exposing ``tobytes()``
            (for example a NumPy array or ``array.array``) or as a bytes-like
            object. The payload is written unchanged and declared as 16-bit
            mono, so it is expected to already hold 16-bit integer samples.

    Returns:
        The complete WAV file (header plus frames) as a base64 ``str``.
    """
    # Prefer the object's own serializer; fall back to a plain bytes copy so
    # callers may also pass raw PCM bytes directly.
    to_bytes = getattr(audio_data, "tobytes", None)
    frames = to_bytes() if callable(to_bytes) else bytes(audio_data)

    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setnchannels(1)   # mono
        wav_file.setsampwidth(2)   # 2 bytes per sample = 16-bit PCM
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(frames)

    # Read the buffer only after the writer has been closed, so the sizes
    # recorded in the header are final.
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
def synthesize_audio(access_key: str, text_str: str, text_model_str : str = "", text_speaker_str: str = ""):
    """Gradio click handler: verify the access key, then synthesize speech.

    Args:
        access_key: Key supplied by the caller; it must equal the module-level
            ``KEY`` for synthesis to run.
        text_str: Text to speak. Only the first ``default_max_lehgth``
            characters are synthesized.
        text_model_str: Logged only. Synthesis always uses the module-level
            ``tts`` engine, so this value does not select a model.
        text_speaker_str: Speaker id handed to ``tts.synthesize``. When empty,
            the last entry of ``tts.get_speakers()`` is used instead.

    Returns:
        A two-item list ``[audio_update, base64_wav]`` matching the two output
        components this handler is wired to. When the key is rejected or the
        text is empty the list is ``[None, ""]``, so both outputs are cleared
        instead of handing Gradio a single ``None`` for two outputs.
    """
    # Function-scope import: the module's import block does not provide hmac.
    import hmac

    print(f">>> MODEL CALLED: Input: {text_str}, Model: {text_model_str}, Speaker: {text_speaker_str}")

    # Constant-time comparison of the encoded keys. A missing KEY (None) or a
    # non-string submission can never match, so the check fails closed.
    key_ok = (
        isinstance(KEY, str)
        and isinstance(access_key, str)
        and hmac.compare_digest(access_key.encode("utf-8"), KEY.encode("utf-8"))
    )
    if not key_ok:
        # Deliberately does not echo the attempted key: near-miss guesses of
        # the real secret must not end up in the logs.
        print(">>> MODEL FAILED: Invalid access key")
        return [None, ""]

    if not text_str:
        return [None, ""]

    # Slicing is a no-op when the text is already short enough.
    text_str = text_str[:default_max_lehgth]

    engine = cast(TTS, tts)

    # The declared default speaker is "", which cannot name a voice; fall back
    # to the last speaker the loaded model reports.
    if not text_speaker_str:
        speakers = engine.get_speakers()
        if speakers:
            text_speaker_str = speakers[-1]

    samples = engine.synthesize(text_str, text_speaker_str)
    sampling_rate = engine.get_sampling_rate()

    get_audio = gr.Audio.update(value = (sampling_rate, samples))
    base64_data = audio_to_base64(sampling_rate, samples)
    return [get_audio, base64_data]
def main():
    """Build the Gradio Blocks UI for the TTS demo and launch it.

    The page holds a description, inputs for the text, access key, model name
    and speaker id, a "Generate" button, an audio player and a textbox that
    receives the base64-encoded WAV. Clicking the button calls
    ``synthesize_audio`` with the four inputs and routes its two results to
    the audio player and the output textbox.
    """
    # NOTE(review): the row/column nesting below was reconstructed from the
    # order of the statements; confirm it against the intended page layout.
    with gr.Blocks() as demo:
        with gr.Row(variant = "panel"):
            gr.Markdown("This is a basic Text-To-Speech (TTS) demo based on the Balacoon model.\n\n\nTo change the model / speaker, please refer to: https://huggingface.co/spaces/balacoon/tts")

        with gr.Row():
            with gr.Column(variant = "panel"):
                text = gr.Textbox(label = "Text Input", placeholder = "Input ...")

        with gr.Row(variant = "panel"):
            # Masked input: the access key is a secret and should not be
            # displayed in clear text while it is typed.
            access_key = gr.Textbox(label = "Access Key", lines = 1, type = "password")

        with gr.Row():
            with gr.Column(variant = "panel"):
                get_text_model = gr.Textbox(label = "Model Input", placeholder = "Model ...", value = default_text_model)

        with gr.Row(variant = "panel"):
            get_text_speaker = gr.Textbox(label = "Speaker Input", placeholder = "Speaker ...", value = default_text_speaker)

        with gr.Row(variant = "panel"):
            generate = gr.Button("Generate")

        with gr.Row(variant = "panel"):
            audio = gr.Audio()

        with gr.Row(variant = "panel"):
            base_output = gr.Textbox(label = "Model Output", placeholder = "Output ...", value = "")

        # Event wiring is declared while the Blocks context is still open.
        generate.click(synthesize_audio, inputs = [access_key, text, get_text_model, get_text_speaker], outputs = [audio, base_output])

    demo.launch()
# Start the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()