|
from typing import cast |
|
|
|
import gradio as gr |
|
from balacoon_tts import TTS |
|
from huggingface_hub import hf_hub_download, list_repo_files |
|
|
|
import os |
|
import io |
|
import wave |
|
import base64 |
|
|
|
# Shared secret that a caller must supply before any audio is synthesized.
# NOTE(review): the right-hand side of this assignment was missing, which left
# the module unparseable. Reading the value from the "KEY" environment
# variable is an assumption -- confirm the variable name against the
# deployment. If the variable is unset, KEY is None, which no string access
# key compares equal to.
KEY = os.environ.get("KEY")

# Maximum number of input characters synthesized per request.
# The misspelled name is kept as-is because other code in this file reads it.
default_max_lehgth = 250

# Model file requested from the "balacoon/tts" Hugging Face repository, and the
# speaker identifier pre-filled in the UI.
default_text_model = "en_us_hifi_jets_cpu.addon"
default_text_speaker = "8051"

# Fetch the default model and build the TTS engine once, at import time.
model_path = hf_hub_download(repo_id="balacoon/tts", filename=default_text_model)
tts = TTS(model_path)

# Module-level placeholder for base64-encoded audio; initialised empty.
base64_data = ""
|
|
|
def audio_to_base64(sample_rate, audio_data):
    """Pack audio samples into an in-memory WAV file and base64-encode it.

    The WAV header is written as mono with a 2-byte sample width, so the
    bytes produced by ``audio_data.tobytes()`` are stored as 16-bit frames.

    Args:
        sample_rate: Frame rate recorded in the WAV header.
        audio_data: Object exposing ``tobytes()`` that yields the raw frames.

    Returns:
        The complete WAV file (header plus frames) as a base64 text string.
    """
    wav_stream = io.BytesIO()

    with wave.open(wav_stream, "wb") as writer:
        writer.setnchannels(1)  # mono
        writer.setsampwidth(2)  # two bytes per sample
        writer.setframerate(sample_rate)
        writer.writeframes(audio_data.tobytes())

    # The header is finalised when the writer closes, so read the buffer
    # only after leaving the ``with`` block.
    encoded = base64.b64encode(wav_stream.getvalue())
    return encoded.decode("utf-8")
|
|
|
|
|
def synthesize_audio(access_key: str, text_str: str, text_model_str: str = "", text_speaker_str: str = ""):
    """Synthesize speech for ``text_str`` and package it for the Gradio UI.

    Args:
        access_key: Key supplied by the caller; it must equal the module-level
            ``KEY`` or the request is rejected.
        text_str: Text to speak. Anything beyond ``default_max_lehgth``
            characters is cut off before synthesis.
        text_model_str: Model name coming from the UI. It is only logged;
            synthesis always goes through the module-level ``tts`` engine.
        text_speaker_str: Speaker identifier handed to ``tts.synthesize``.

    Returns:
        ``None`` when the access key does not match or ``text_str`` is empty.
        Otherwise a two-item list: the ``gr.Audio.update`` carrying
        ``(sampling_rate, samples)``, and the same audio as produced by
        ``audio_to_base64``.
    """
    # f-strings rather than "+" concatenation: a None argument is then logged
    # instead of raising TypeError before the guards below can run.
    print(f">>> MODEL CALLED: Input: {text_str}, Model: {text_model_str}, Speaker: {text_speaker_str}")

    if access_key != KEY:
        # NOTE(review): this writes the attempted key to stdout in clear text;
        # consider dropping the value from the log line.
        print(f">>> MODEL FAILED: Attempted Key: {access_key}")
        return None

    if not text_str:
        return None

    # Slicing is a no-op for text already within the limit.
    text_str = text_str[:default_max_lehgth]

    samples = tts.synthesize(text_str, text_speaker_str)
    # Query the rate once and reuse it for both outputs.
    sampling_rate = tts.get_sampling_rate()

    audio_update = gr.Audio.update(value=(sampling_rate, samples))
    encoded_wav = audio_to_base64(sampling_rate, samples)

    return [audio_update, encoded_wav]
|
|
|
def main():
    """Build the Gradio Blocks interface for the TTS demo and launch it.

    The page holds an intro note, the text and access-key inputs, the model
    and speaker inputs, a "Generate" button, an audio player and a textbox for
    the base64 output. Clicking the button calls ``synthesize_audio`` with
    ``[access_key, text, model, speaker]`` and routes its two results to the
    audio player and the output textbox.
    """
    # NOTE(review): the Row/Column nesting below was reconstructed from the
    # statement grouping because the original indentation was lost -- confirm
    # the layout renders as intended.
    with gr.Blocks() as demo:
        with gr.Row(variant="panel"):
            gr.Markdown("This is a basic Text-To-Speech (TTS) demo based on the Balacoon model.\n\n\nTo change the model / speaker, please refer to: https://huggingface.co/spaces/balacoon/tts")

        with gr.Row():
            with gr.Column(variant="panel"):
                text = gr.Textbox(label="Text Input", placeholder="Input ...")
            with gr.Row(variant="panel"):
                # Mask the key while typing so the secret is not shown on
                # screen; the value passed to the handler is unchanged.
                access_key = gr.Textbox(label="Access Key", lines=1, type="password")

        with gr.Row():
            with gr.Column(variant="panel"):
                get_text_model = gr.Textbox(label="Model Input", placeholder="Model ...", value=default_text_model)
            with gr.Row(variant="panel"):
                get_text_speaker = gr.Textbox(label="Speaker Input", placeholder="Speaker ...", value=default_text_speaker)

        with gr.Row(variant="panel"):
            generate = gr.Button("Generate")

        with gr.Row(variant="panel"):
            audio = gr.Audio()

        with gr.Row(variant="panel"):
            base_output = gr.Textbox(label="Model Output", placeholder="Output ...", value="")

        # Input order must match the parameter order of synthesize_audio.
        generate.click(
            synthesize_audio,
            inputs=[access_key, text, get_text_model, get_text_speaker],
            outputs=[audio, base_output],
        )

    demo.launch()
|
|
|
# Build and launch the demo UI only when this file is executed as a script.
if __name__ == "__main__":
    main()