"""Gradio demo serving Balacoon text-to-speech behind a shared access key."""

import base64
import hmac
import io
import os
import wave
from typing import cast

import gradio as gr
from balacoon_tts import TTS
from huggingface_hub import hf_hub_download, list_repo_files

# Shared secret required for synthesis requests. It is read from the
# environment so that it never has to be committed with the source.
KEY = os.environ.get("KEY")

# NOTE: the misspelled name is kept so existing references keep working.
default_max_lehgth = 250
default_text_model = "en_us_hifi_jets_cpu.addon"
default_text_speaker = "8051"

# The model is downloaded and loaded once, at import time.
model_path = hf_hub_download(repo_id="balacoon/tts", filename=default_text_model)
tts = TTS(model_path)

base64_data = ""


def audio_to_base64(sample_rate, audio_data):
    """Encode raw audio samples as a base64 string holding a WAV file.

    The WAV header is written as mono with 2 bytes per sample, and the bytes
    of ``audio_data`` are written unchanged.

    Args:
        sample_rate: Frame rate stored in the WAV header.
        audio_data: Object exposing ``tobytes()``; presumably a 16-bit integer
            NumPy array, to match the 2-byte sample width (TODO confirm).

    Returns:
        The complete WAV file, base64-encoded and decoded to ``str``.
    """
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setnchannels(1)
        wav_file.setsampwidth(2)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(audio_data.tobytes())
    return base64.b64encode(buffer.getvalue()).decode("utf-8")


def _is_authorized(access_key):
    """Return True only when ``access_key`` matches the configured ``KEY``.

    Fails closed when no key is configured, and compares in constant time so
    that response timing does not reveal how much of a guess was correct.
    """
    if not KEY or not isinstance(access_key, str):
        return False
    return hmac.compare_digest(access_key.encode("utf-8"), KEY.encode("utf-8"))


def synthesize_audio(access_key: str, text_str: str, text_model_str: str = "", text_speaker_str: str = ""):
    """Synthesize speech for ``text_str`` and return it as audio plus base64.

    Args:
        access_key: Key supplied by the caller; must match ``KEY``.
        text_str: Text to speak; truncated to ``default_max_lehgth`` characters.
        text_model_str: Only logged. The model is loaded once at import time,
            so this value does not select a different model.
        text_speaker_str: Speaker id passed to the model. When empty, the last
            speaker reported by the model is used.

    Returns:
        A two-element list matching the click handler's two outputs:
        ``[(sampling_rate, samples), base64_wav]`` on success, or
        ``[None, ""]`` when the key is rejected or the text is empty.
    """
    print(f">>> MODEL CALLED: Input: {text_str}, Model: {text_model_str}, Speaker: {text_speaker_str}")

    if not _is_authorized(access_key):
        # The attempted key is deliberately not logged: near-miss attempts
        # would expose most of the real secret in the logs.
        print(">>> MODEL FAILED: Invalid access key")
        return [None, ""]

    if not text_str:
        return [None, ""]

    # Slicing is a no-op for text that is already short enough.
    text_str = text_str[:default_max_lehgth]

    # An empty speaker id cannot be synthesized; fall back to an available one.
    speaker = text_speaker_str if text_speaker_str else tts.get_speakers()[-1]

    samples = tts.synthesize(text_str, speaker)
    sampling_rate = tts.get_sampling_rate()

    # The (rate, samples) pair is returned as the audio output value directly.
    return [(sampling_rate, samples), audio_to_base64(sampling_rate, samples)]


def main():
    """Build the demo UI, wire the generate button, and launch the app."""
    with gr.Blocks() as demo:
        with gr.Row(variant="panel"):
            gr.Markdown("This is a basic Text-To-Speech (TTS) demo based on the Balacoon model.\n\n\nTo change the model / speaker, please refer to: https://huggingface.co/spaces/balacoon/tts")

        with gr.Row():
            with gr.Column(variant="panel"):
                text = gr.Textbox(label="Text Input", placeholder="Input ...")

        with gr.Row(variant="panel"):
            # Masked so the key is not displayed while it is being typed.
            access_key = gr.Textbox(label="Access Key", lines=1, type="password")

        with gr.Row():
            with gr.Column(variant="panel"):
                get_text_model = gr.Textbox(label="Model Input", placeholder="Model ...", value=default_text_model)

        with gr.Row(variant="panel"):
            get_text_speaker = gr.Textbox(label="Speaker Input", placeholder="Speaker ...", value=default_text_speaker)

        with gr.Row(variant="panel"):
            generate = gr.Button("Generate")

        with gr.Row(variant="panel"):
            audio = gr.Audio()

        with gr.Row(variant="panel"):
            base_output = gr.Textbox(label="Model Output", placeholder="Output ...", value="")

        generate.click(
            synthesize_audio,
            inputs=[access_key, text, get_text_model, get_text_speaker],
            outputs=[audio, base_output],
        )

    demo.launch()


if __name__ == "__main__":
    main()