Spaces:
Running
Running
| import gradio as gr | |
| from gradio_client import Client | |
| import os | |
| import random | |
| import numpy as np | |
| import scipy.io.wavfile as wavfile | |
| # try: | |
| # client = Client(os.environ['src']) | |
| # except: | |
| # client = Client("http://localhost:7861/") | |
# Custom stylesheet handed to gr.Blocks(css=...). It covers, in order: placeholder
# text colour, inline <code> styling, checkbox label colour, the accordion header
# colour, the preset buttons inside #voice-preset-container (normal and hover),
# and a fixed full-page background image drawn on body::before.
# NOTE(review): the #advanced-accordion rules only apply if some component sets
# elem_id="advanced-accordion"; no component in this file appears to — confirm.
css = """
.gradio-container input::placeholder,
.gradio-container textarea::placeholder {
color: #333333 !important;
}
code {
background-color: #ffde9f;
padding: 2px 4px;
border-radius: 3px;
}
.gr-checkbox label span,
.gr-check-radio label span,
[data-testid="checkbox"] label span,
.checkbox-container span {
color: #ECF2F7 !important;
}
#advanced-accordion > button,
#advanced-accordion > button span,
#advanced-accordion > div > button,
#advanced-accordion > div > button span,
#advanced-accordion .label-wrap,
#advanced-accordion .label-wrap span,
#advanced-accordion > .open,
#advanced-accordion > .open span {
color: #FFD700 !important;
}
#voice-preset-container .gallery button,
#voice-preset-container .gr-examples button,
#voice-preset-container .examples button,
#voice-preset-container button.sample {
background-color: #c8b8d4 !important;
border: 1px solid #b8a8c4 !important;
color: #1a1a1a !important;
font-weight: 500 !important;
margin: 4px !important;
padding: 10px 14px !important;
border-radius: 6px !important;
transition: background-color 0.2s ease !important;
}
#voice-preset-container .gallery button:hover,
#voice-preset-container .gr-examples button:hover,
#voice-preset-container .examples button:hover,
#voice-preset-container button.sample:hover {
background-color: #baadc9 !important;
border-color: #a89ab8 !important;
}
body {
background: none !important;
}
body::before {
content: "";
position: fixed;
top: 0;
left: 0;
width: 100%;
height: 100%;
z-index: -1;
pointer-events: none;
background: url('https://i.postimg.cc/1smD6GPf/gradio-theme-rin2.png') center center / cover no-repeat;
}
"""
# Ordered (label, instruction text) presets. The trailing comment on each entry
# points at the pre-generated sample that was rendered from that instruction.
VOICE_PRESET_LIST = [
    (
        "甘えた女の子 / ゆっくり",
        "かわいくて高い声の女の子が、甘えながらゆっくりのんびりしゃべってる感じの音声がほしい。",
    ),  # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/onnanoko_amai.wav
    (
        "激怒する女性 / 感情爆発",
        "低くて激しい声の女性が、感情を抑えきれずに怒りを爆発させながら、早口でまくしたてるような声で読んでほしい。",
    ),  # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/angry.wav
    (
        "落ち着いた男性 / 呆れ気味",
        "落ち着いた低めの声の男性が、相手の言動に少し呆れつつも感情を表に出さず、静かで平坦なトーンで淡々と話してるような声で読んでほしい。",
    ),  # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/guy_cool.wav
    (
        "Calm man / mildly exasperated (EN)",
        "Read this in the voice of a calm, low-pitched man who sounds mildly exasperated but keeps his emotions in check, speaking in a flat, even tone without much expression.",
    ),  # Nothing
    (
        "冷たい女性 / 憎しみ (1)",
        "低くて冷たい声の女性が、怒りを内に秘めながら憎しみのこもった口調で、淡々と早めに話してるような声で読んでほしい。",
    ),  # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/woman_cold_frustrated_2.wav
    (
        "冷たい女性 / 憎しみ (2)",
        "低くて冷たい声の女性が、怒りを内に秘めながら憎しみのこもった口調で、淡々と早めに話してるような声で読んでほしい。",
    ),  # same text different result --> https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/woman_cold_frustrated.wav
]

# Label -> instruction text lookup, in the same order as the list above.
VOICE_EXAMPLES = dict(VOICE_PRESET_LIST)
# Preset label -> relative path of its pre-rendered WAV
# (these files must ship in the Space repo under samples/).
PREGENERATED_AUDIO = dict(
    [
        ("甘えた女の子 / ゆっくり", "samples/onnanoko_amai.wav"),
        ("激怒する女性 / 感情爆発", "samples/angry.wav"),
        ("落ち着いた男性 / 呆れ気味", "samples/guy_cool.wav"),
        ("冷たい女性 / 憎しみ (1)", "samples/woman_cold_frustrated_2.wav"),
        ("冷たい女性 / 憎しみ (2)", "samples/woman_cold_frustrated.wav"),
    ]
)
def load_pregenerated_to_main(label):
    """Resolve an example label into values for the main tab's widgets.

    Used as the click handler of the Examples tab. Looks up the instruction
    text and the pre-generated WAV registered for ``label``.

    Returns:
        A 3-tuple ``(instruction_update, audio, status)``:
        an update carrying the instruction text (empty string when the label
        is unknown), ``(sample_rate, samples)`` or ``None`` when no file is
        available, and a human-readable status line.
    """
    instruction_update = gr.update(value=VOICE_EXAMPLES.get(label, ""))
    wav_path = PREGENERATED_AUDIO.get(label)

    # Guard: unknown label, or the sample file is not present on disk.
    if not wav_path or not os.path.exists(wav_path):
        return (
            instruction_update,
            None,
            f"Status: No pre-generated audio found for: {label}",
        )

    sample_rate, samples = wavfile.read(wav_path)

    # A 2-D array whose first axis has 1-2 entries and is shorter than the
    # second is treated as channels-first and flipped to (samples, channels).
    channels_first = (
        isinstance(samples, np.ndarray)
        and samples.ndim == 2
        and samples.shape[0] in (1, 2)
        and samples.shape[0] < samples.shape[1]
    )
    if channels_first:
        samples = samples.T

    return (
        instruction_update,  # voice_desc_input
        (sample_rate, samples),  # audio_output (MAIN TAB)
        f"Status: Loaded pre-generated sample: {label}",
    )
def run_generation_pipeline_client(
    raw_text,
    voice_description,
    cfg_text,
    cfg_style,
    min_temp,
    max_temp,
    top_k,
    min_p,
    dry_multiplier,
    seed,
):
    """Send a generation request to the remote backend and adapt its reply.

    All ten arguments are forwarded positionally and unchanged, followed by a
    trailing empty string, to the backend's ``/run_generation_pipeline``
    endpoint.

    Returns:
        ``(audio, status)`` where ``audio`` is ``(sample_rate, data)`` or
        ``None`` and ``status`` is a status line for the UI. This function
        never raises: every failure is reported through ``status``.
    """
    # The module-level ``client`` is not guaranteed to exist. Look it up
    # explicitly so a missing backend yields a clear status instead of a
    # NameError being reported as a "Connection error".
    backend = globals().get("client")
    if backend is None:
        return None, "Status: Generation backend is not available (client is not configured)"

    try:
        result = backend.predict(
            raw_text,
            voice_description,
            cfg_text,
            cfg_style,
            min_temp,
            max_temp,
            top_k,
            min_p,
            dry_multiplier,
            seed,
            "",
            api_name="/run_generation_pipeline",
        )
        if result is None:
            return None, "Status: No response from server"

        if not (isinstance(result, (list, tuple)) and len(result) == 2):
            return None, "Status: Unexpected response format from server"

        audio_result, status_msg = result
        if audio_result is None:
            return None, status_msg

        if isinstance(audio_result, str) and os.path.exists(audio_result):
            # The backend handed back a path to a WAV file.
            sr, data = wavfile.read(audio_result)
        elif isinstance(audio_result, (list, tuple)) and len(audio_result) >= 2:
            # The backend handed back (sample_rate, samples) directly.
            sr = audio_result[0]
            data = np.array(audio_result[1]) if isinstance(audio_result[1], list) else audio_result[1]
        else:
            return None, status_msg

        # A 2-D array whose first axis has 1-2 entries and is shorter than
        # the second is treated as channels-first and transposed.
        if isinstance(data, np.ndarray) and data.ndim == 2 and data.shape[0] in (1, 2) and data.shape[0] < data.shape[1]:
            data = data.T
        return (sr, data), status_msg
    except Exception as e:
        # UI boundary: surface any failure as a status line rather than crash.
        return None, f"Status: Connection error: {e}"
# Build the Gradio UI: a notice banner followed by three tabs
# ("Speech Generation", "Examples", "Info").
# NOTE(review): the nesting depth of the layout containers below was
# reconstructed from a copy of this file that had lost its indentation —
# confirm the Row/Column placement against the deployed app.
with gr.Blocks(theme="Respair/Shiki@10.1.0", css=css) as demo:
    # Banner shown above the tabs.
    gr.Markdown(
        """
<div style="text-align: left;">
Demo is closed until further notice; thank you for using it. Feel free to check the pre-generated samples at the <code>Examples</code> tab. <br>
</div>
"""
    )
    with gr.Tabs():
        # --- Tab 1: text + instruction inputs, sampling controls, output player ---
        with gr.TabItem("Speech Generation"):
            with gr.Row():
                with gr.Column(scale=2):
                    text_input = gr.Textbox(
                        label="Text",
                        lines=5,
                        max_length=125,
                        value="準備もできましたけど、いきなり本題に入ると分かりにくいかもしれないので、まずは今日やることを短く整理して、手順を一つずつ確認しながら進めていきますね。途中で気になるところがあったら、その都度止めて大丈夫です。",
                    )
                    with gr.Column(elem_id="voice-desc-wrap"):
                        voice_desc_input = gr.Textbox(
                            label="Instruction",
                            value="低くて激しい声の女性が、感情を抑えきれずに怒りを爆発させながら、早口でまくしたてるような声で読んでほしい。",
                            lines=2,
                        )
                    with gr.Row(equal_height=False):
                        # Collapsed-by-default panel holding every tunable parameter.
                        # NOTE(review): the stylesheet targets #advanced-accordion, but
                        # this Accordion sets no elem_id — confirm whether
                        # elem_id="advanced-accordion" was intended.
                        with gr.Accordion("----------------------------------⭐ 🛠️ ⭐", open=False):
                            seed_slider = gr.Slider(
                                label="Seed (-1 for random)", minimum=-1, maximum=2700000000, value=2700000000, step=1
                            )
                            gr.Markdown('<h3 style="color: #FFD700;">Style / CFG Parameters</h3>')
                            cfg_text_slider = gr.Slider(
                                label="CFG Text", minimum=0.5, maximum=3.0, value=1.15, step=0.05,
                            )
                            cfg_style_slider = gr.Slider(
                                label="CFG Style",
                                minimum=0.5, maximum=3.0, value=1.2, step=0.1,
                            )
                            gr.Markdown('<h3 style="color: #FFD700;">Sampling Parameters</h3>')
                            min_temp_slider = gr.Slider(
                                label="Min Temperature (adaptive)", minimum=0.0, maximum=2.0, value=0.25, step=0.05,
                            )
                            max_temp_slider = gr.Slider(
                                label="Max Temperature (adaptive)", minimum=0.0, maximum=2.0, value=1.0, step=0.05,
                            )
                            top_k_slider = gr.Slider(
                                label="Top K (0 = off)", minimum=0, maximum=200, value=0, step=5,
                            )
                            min_p_slider = gr.Slider(
                                label="Min P (0 = off)", minimum=0.0, maximum=1.0, value=0.0, step=0.01,
                            )
                            gr.Markdown('<h3 style="color: #FFD700;">Repetition Control</h3>')
                            dry_multiplier_slider = gr.Slider(
                                label="DRY Multiplier (0 = off)", minimum=0.0, maximum=5.0, value=0.8, step=0.1,
                            )
                            # gr.Markdown('<h3 style="color: #FFD700;">Other</h3>')
                        with gr.Column(scale=1):
                            generate_button = gr.Button("🎤 Generate", variant="primary", size="lg")
                with gr.Column(scale=1):
                    # Read-only widgets that both the generator and the example
                    # loaders write into.
                    status_output = gr.Textbox(label="Status", interactive=False)
                    audio_output = gr.Audio(
                        label="Generated Speech",
                        interactive=False
                    )
            # random_desc_button.click(
            # fn=lambda: random.choice(VOICE_PRESET_LIST)[1],
            # inputs=[],
            # outputs=[voice_desc_input],
            # )
            # Inputs are listed in the same order as the parameters of
            # run_generation_pipeline_client.
            generate_button.click(
                fn=run_generation_pipeline_client,
                inputs=[
                    text_input,
                    voice_desc_input,
                    cfg_text_slider,
                    cfg_style_slider,
                    min_temp_slider,
                    max_temp_slider,
                    top_k_slider,
                    min_p_slider,
                    dry_multiplier_slider,
                    seed_slider,
                ],
                outputs=[audio_output, status_output],
                concurrency_limit=4,
            )
        # --- Tab 2: clickable presets that load pre-generated audio into tab 1 ---
        with gr.TabItem("Examples"):
            gr.HTML("""
<div style="background-color: rgba(255, 255, 255, 0.025); padding: 20px; border-radius: 12px; backdrop-filter: blur(10px); box-shadow: 0 4px 6px rgba(0,0,0,0.5); margin-top: 8px;">
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.6; font-size: 14px; text-align: center; margin: 0;">
クリックするとメインタブの音声プレイヤーにプリジェネ音声がロードされます。 / Click a preset to load the pre-generated audio into the main tab player.
</p>
</div>
""")
            with gr.Row():
                with gr.Column(scale=1, elem_id="voice-preset-container"):
                    gr.HTML("""
<div style="background-color: rgba(255, 255, 255, 0.55); padding: 8px 12px; border-radius: 8px; backdrop-filter: blur(10px); box-shadow: 0 2px 4px rgba(0,0,0,0.08); text-align: center; max-width: 220px; margin: 0 auto 12px auto;">
<h3 style="color: #000000; margin: 0; font-size: 16px;">Examples</h3>
</div>
""")
                    # Hidden textbox used only as the Examples input: it carries the
                    # selected label to load_pregenerated_to_main.
                    example_label_holder = gr.Textbox(visible=False)
                    # One example row per label that has a pre-generated file.
                    gr.Examples(
                        examples=[[label] for label in PREGENERATED_AUDIO.keys()],
                        inputs=[example_label_holder],
                        outputs=[voice_desc_input, audio_output, status_output],  # <-- MAIN TAB outputs
                        fn=load_pregenerated_to_main,
                        label="Click to load a pre-generated sample",
                        cache_examples=False,
                        run_on_click=True,
                        examples_per_page=10,
                    )
        # --- Tab 3: static bilingual description of the model ---
        with gr.TabItem("Info"):
            gr.HTML('<h1 style="text-align: center;">🌸 Takane - Voice Design 🎨 </h1>')
            gr.HTML("""
<div style="background-color: rgba(255, 255, 255, 0.525); padding: 30px; border-radius: 12px; backdrop-filter: blur(5px); max-width: 100%; box-shadow: 0 4px 6px rgba(0,0,0,0.5);">
<div style="display: flex; gap: 24px; flex-wrap: wrap; justify-content: center;">
<div style="flex: 1; min-width: 280px;">
<h3 style="color: #000000; margin: 0 0 12px 0; font-size: 20px; text-align: center;">日本語</h3>
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 16px; margin: 0; text-align: center;">
本モデルのバックボーンは
<a href="https://huggingface.co/spaces/Respair/Takane" target="_blank" rel="noopener noreferrer"
style="color: #b45309; text-decoration: none; font-weight: 600;">
Takane
</a>
を改良したもので、ネイティブ 44.1kHz コーデックを備えた完全自回帰のエンコーダ・デコーダ型 Transformer です。<br><br>
<strong>CFG Style</strong> を上げると指示への追従が強くなりますが、上げすぎると過剰な条件付け(over-conditioning)が起きて音質が劣化する場合があります。
</p>
</div>
<div style="flex: 1; min-width: 280px;">
<h3 style="color: #000000; margin: 0 0 12px 0; font-size: 20px; text-align: center;">English</h3>
<p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 16px; margin: 0; text-align: center;">
The backbone is a modified version of
<a href="https://huggingface.co/spaces/Respair/Takane" target="_blank" rel="noopener noreferrer"
style="color: #b45309; text-decoration: none; font-weight: 600;">
Takane
</a>,
a fully autoregressive encoder-decoder transformer with a native 44.1khz codec.<br><br>
Raise <strong>CFG Style</strong> if you want stronger adherence; pushing it too high can cause over-conditioning and degrade quality. <br><br>
<code>This model is only in Japanese</code>, if you enjoy anime, this is yours to play with.
</p>
</div>
</div>
</div>
""")

    def load_default():
        """Return (instruction update, audio, status update) for the default preset.

        Wired to ``demo.load`` below. The audio element is
        ``(sample_rate, data)`` when the preset's WAV exists on disk,
        otherwise ``None``.
        """
        label = "激怒する女性 / 感情爆発"
        desc = VOICE_EXAMPLES.get(label, "")
        path = PREGENERATED_AUDIO.get(label)
        if path and os.path.exists(path):
            sr, data = wavfile.read(path)
            # Same orientation fix as the other loaders: a 2-D array whose short
            # first axis has 1-2 entries is transposed.
            if isinstance(data, np.ndarray) and data.ndim == 2 and data.shape[0] in (1, 2) and data.shape[0] < data.shape[1]:
                data = data.T
            return gr.update(value=desc), (sr, data), gr.update(value=f"Status: Loaded default sample: {label}")
        return gr.update(value=desc), None, gr.update(value=f"Status: Default sample missing: {label}")

    # Fill the main tab with the default preset via the app's load event.
    demo.load(
        fn=load_default,
        inputs=None,
        outputs=[voice_desc_input, audio_output, status_output],
    )

# Start the app with queuing enabled when run as a script.
if __name__ == "__main__":
    demo.queue(api_open=False, max_size=15).launch()