| |
| |
| |
| |
|
|
import atexit
import math

import torch
import gradio as gr

from config import (
    AVAILABLE_VOICES,
    DEFAULT_VOICE,
    DEFAULT_MODEL_VARIANT,
    DEFAULT_TEMPERATURE,
    DEFAULT_LSD_DECODE_STEPS,
    DEFAULT_EOS_THRESHOLD,
    DEFAULT_NOISE_CLAMP,
    DEFAULT_FRAMES_AFTER_EOS,
    MAXIMUM_INPUT_LENGTH,
    VOICE_MODE_PRESET,
    VOICE_MODE_CLONE,
    EXAMPLE_PROMPTS,
    ACCELERATOR_ENABLED,
    PYTORCH_COMPUTATION_THREADS,
    PYTORCH_INTEROP_THREADS,
)

# The statements below are deliberately interleaved with imports: each start-up
# step runs before the project modules that follow it are imported, so the
# order must be kept as-is.

# Apply the configured PyTorch thread counts before any project module loads.
torch.set_num_threads(PYTORCH_COMPUTATION_THREADS)
torch.set_num_interop_threads(PYTORCH_INTEROP_THREADS)

from src.core.authentication import authenticate_huggingface

authenticate_huggingface()

if ACCELERATOR_ENABLED:
    # The accelerator client is only imported when the feature is enabled.
    from src.accelerator.client import (
        start_accelerator_daemon,
        stop_accelerator_daemon,
    )

    accelerator_started = start_accelerator_daemon()
    print(
        "Accelerator daemon started successfully"
        if accelerator_started
        else "Accelerator daemon not available, using Python fallback",
        flush=True,
    )
    # NOTE(review): the stop hook is registered whenever the accelerator is
    # enabled, regardless of whether startup reported success. The original
    # indentation was lost, so confirm this is the intended nesting.
    atexit.register(stop_accelerator_daemon)

from src.core.memory import start_background_cleanup_thread

start_background_cleanup_thread()

from src.generation.handler import (
    perform_speech_generation,
    request_generation_stop,
)
from src.ui.state import (
    check_generate_button_state,
    calculate_character_count_display,
    determine_clear_button_visibility,
    update_voice_mode_visibility,
)
from src.ui.handlers import (
    switch_to_generating_state,
    switch_to_idle_state,
    perform_clear_action,
    create_example_handler,
    format_example_button_label,
)
from assets.css.styles import CSS
from assets.static.title import TITLE
from assets.static.header import HEADER
from assets.static.footer import FOOTER
from assets.static.sidebar import SIDEBAR
|
|
# Build the whole interface (layout first, event wiring second) inside a single
# Blocks context. NOTE(review): the container nesting below was reconstructed
# from a source whose indentation had been lost; confirm it against the
# deployed layout.
with gr.Blocks(css=CSS, fill_height=False, fill_width=True) as app:
    # State value handed to the handlers that toggle the generate/stop/clear
    # buttons; starts out as "not generating".
    ui_state = gr.State({"generating": False})

    with gr.Sidebar():
        gr.HTML(SIDEBAR())

    with gr.Column(elem_classes="header-section"):
        gr.HTML(TITLE())
        gr.HTML(HEADER())

    with gr.Row():
        # ---- First column: audio output and all generation settings ----
        with gr.Column():
            audio_output_component = gr.Audio(
                label="Generated Speech Output",
                type="filepath",
                interactive=False,
            )

            with gr.Accordion("Voice Selection", open=True):
                voice_mode_radio = gr.Radio(
                    label="Voice Mode",
                    choices=[VOICE_MODE_PRESET, VOICE_MODE_CLONE],
                    value=VOICE_MODE_PRESET,
                    info="Choose between preset voices or clone a voice from uploaded audio",
                    elem_id="voice-mode",
                )

                # Only one of the two containers starts visible; the radio's
                # change handler updates both of them.
                with gr.Column(visible=True) as preset_voice_container:
                    voice_preset_dropdown = gr.Dropdown(
                        label="Select Preset Voice",
                        choices=AVAILABLE_VOICES,
                        value=DEFAULT_VOICE,
                    )

                with gr.Column(visible=False) as clone_voice_container:
                    voice_clone_audio_input = gr.Audio(
                        label="Upload Audio for Voice Cloning",
                        type="filepath",
                    )

            with gr.Accordion("Model Parameters", open=False):
                with gr.Row():
                    temperature_slider = gr.Slider(
                        label="Temperature",
                        minimum=0.1,
                        maximum=2.0,
                        step=0.05,
                        value=DEFAULT_TEMPERATURE,
                        info="Higher values produce more expressive speech",
                    )
                    lsd_decode_steps_slider = gr.Slider(
                        label="LSD Decode Steps",
                        minimum=1,
                        maximum=20,
                        step=1,
                        value=DEFAULT_LSD_DECODE_STEPS,
                        info="More steps may improve quality but slower",
                    )

                with gr.Row():
                    noise_clamp_slider = gr.Slider(
                        label="Noise Clamp",
                        minimum=0.0,
                        maximum=2.0,
                        step=0.05,
                        value=DEFAULT_NOISE_CLAMP,
                        info="Maximum noise sampling value (0 = disabled)",
                    )
                    eos_threshold_slider = gr.Slider(
                        label="End of Sequence Threshold",
                        minimum=-10.0,
                        maximum=0.0,
                        step=0.25,
                        value=DEFAULT_EOS_THRESHOLD,
                        info="Smaller values cause earlier completion",
                    )

            with gr.Accordion("Advanced Settings", open=False):
                model_variant_textbox = gr.Textbox(
                    label="Model Variant Identifier",
                    value=DEFAULT_MODEL_VARIANT,
                    info="Model signature for generation",
                )

                with gr.Row():
                    enable_custom_frames_checkbox = gr.Checkbox(
                        label="Enable Custom Frames After EOS",
                        value=False,
                        info="Manually control post-EOS frame generation",
                    )
                    frames_after_eos_slider = gr.Slider(
                        label="Frames After EOS",
                        minimum=0,
                        maximum=100,
                        step=1,
                        value=DEFAULT_FRAMES_AFTER_EOS,
                        info="Additional frames after end-of-sequence (80ms per frame)",
                    )

        # ---- Second column: prompt entry, action buttons, examples ----
        with gr.Column(scale=1):
            text_input_component = gr.Textbox(
                label="Prompt",
                placeholder="Enter the text you want to convert to speech...",
                lines=2,
                max_lines=20,
                max_length=MAXIMUM_INPUT_LENGTH,
                autoscroll=True,
            )

            # Created hidden; its content is replaced by the prompt's change
            # handler.
            character_count_display = gr.HTML(
                f"""
<div class="character-count">
<span>0 / {MAXIMUM_INPUT_LENGTH}</span>
</div>
""",
                visible=False,
            )

            # Generate starts disabled, Stop and Clear start hidden; the
            # handlers wired further down update them.
            generate_button = gr.Button(
                "Generate", variant="primary", size="lg", interactive=False
            )
            stop_button = gr.Button(
                "Stop", variant="stop", size="lg", visible=False
            )
            clear_button = gr.Button(
                "Clear", variant="secondary", size="lg", visible=False
            )

            # NOTE(review): placing the example section inside this column is
            # an assumption -- confirm it is not meant to sit below the row.
            gr.HTML(
                """
<div class="example-prompts">
<h3>Example Prompts</h3>
<p>Click any example to generate speech with its assigned voice</p>
</div>
"""
            )

            # One button per example prompt, laid out `examples_per_row` to a
            # row, collected in the same order as EXAMPLE_PROMPTS.
            example_buttons_list = []
            num_examples = len(EXAMPLE_PROMPTS)
            examples_per_row = 2
            for row_start in range(0, num_examples, examples_per_row):
                with gr.Row():
                    row_examples = EXAMPLE_PROMPTS[row_start:row_start + examples_per_row]
                    for example in row_examples:
                        example_buttons_list.append(
                            gr.Button(
                                format_example_button_label(
                                    example["text"], example["voice"]
                                ),
                                size="sm",
                                variant="secondary",
                            )
                        )

    gr.HTML(FOOTER())

    # ------------------------------------------------------------------
    # Event wiring
    # ------------------------------------------------------------------

    # Components passed, in this order, to perform_speech_generation.
    generation_inputs = [
        text_input_component,
        voice_mode_radio,
        voice_preset_dropdown,
        voice_clone_audio_input,
        model_variant_textbox,
        lsd_decode_steps_slider,
        temperature_slider,
        noise_clamp_slider,
        eos_threshold_slider,
        frames_after_eos_slider,
        enable_custom_frames_checkbox,
    ]

    # Component lists shared by several listeners below.
    button_state_outputs = [generate_button, stop_button, clear_button, ui_state]
    prompt_and_state = [text_input_component, ui_state]

    def _begin_generation(trigger_button):
        """Register the click step that switches the UI to its generating state."""
        return trigger_button.click(
            fn=switch_to_generating_state,
            inputs=[ui_state],
            outputs=button_state_outputs,
        )

    def _finish_generation(previous_step):
        """Chain speech generation, then the switch back to idle, after a step."""
        generated = previous_step.then(
            fn=perform_speech_generation,
            inputs=generation_inputs,
            outputs=[audio_output_component],
        )
        return generated.then(
            fn=switch_to_idle_state,
            inputs=prompt_and_state,
            outputs=button_state_outputs,
        )

    voice_mode_radio.change(
        fn=update_voice_mode_visibility,
        inputs=[voice_mode_radio],
        outputs=[preset_voice_container, clone_voice_container],
    )

    # Three independent listeners on prompt edits, registered in this order:
    # character counter, generate-button state, clear-button visibility.
    for prompt_handler, handler_inputs, handler_target in (
        (calculate_character_count_display, [text_input_component], character_count_display),
        (check_generate_button_state, prompt_and_state, generate_button),
        (determine_clear_button_visibility, prompt_and_state, clear_button),
    ):
        text_input_component.change(
            fn=prompt_handler,
            inputs=handler_inputs,
            outputs=[handler_target],
        )

    # Generate: generating state -> speech generation -> idle state.
    _finish_generation(_begin_generation(generate_button))

    stop_button.click(
        fn=request_generation_stop,
        outputs=[stop_button],
    )

    clear_button.click(
        fn=perform_clear_action,
        outputs=[
            text_input_component,
            audio_output_component,
            clear_button,
            voice_mode_radio,
            voice_preset_dropdown,
            voice_clone_audio_input,
        ],
    )

    # Example buttons run the same pipeline as Generate, with one extra step
    # that first writes the example's text and voice into the form.
    for example_button, example in zip(example_buttons_list, EXAMPLE_PROMPTS):
        form_prefilled = _begin_generation(example_button).then(
            fn=create_example_handler(example["text"], example["voice"]),
            outputs=[
                text_input_component,
                voice_mode_radio,
                voice_preset_dropdown,
            ],
        )
        _finish_generation(form_prefilled)
|
|
# Start the server: "0.0.0.0" binds to all network interfaces, and
# max_file_size caps uploads at 1 MB (per the Gradio launch() documentation).
app.launch(server_name="0.0.0.0", max_file_size="1mb")