# Voice_Design / app.py — Gradio front-end for the Takane voice-design demo.
import gradio as gr
from gradio_client import Client
import os
import random
import numpy as np
import scipy.io.wavfile as wavfile
# try:
# client = Client(os.environ['src'])
# except:
# client = Client("http://localhost:7861/")
# Custom CSS injected into gr.Blocks: placeholder contrast, inline-code
# highlighting, checkbox/accordion label colors, preset-button styling in the
# Examples tab, and a fixed full-page background image (body::before layer).
css = """
.gradio-container input::placeholder,
.gradio-container textarea::placeholder {
color: #333333 !important;
}
code {
background-color: #ffde9f;
padding: 2px 4px;
border-radius: 3px;
}
.gr-checkbox label span,
.gr-check-radio label span,
[data-testid="checkbox"] label span,
.checkbox-container span {
color: #ECF2F7 !important;
}
#advanced-accordion > button,
#advanced-accordion > button span,
#advanced-accordion > div > button,
#advanced-accordion > div > button span,
#advanced-accordion .label-wrap,
#advanced-accordion .label-wrap span,
#advanced-accordion > .open,
#advanced-accordion > .open span {
color: #FFD700 !important;
}
#voice-preset-container .gallery button,
#voice-preset-container .gr-examples button,
#voice-preset-container .examples button,
#voice-preset-container button.sample {
background-color: #c8b8d4 !important;
border: 1px solid #b8a8c4 !important;
color: #1a1a1a !important;
font-weight: 500 !important;
margin: 4px !important;
padding: 10px 14px !important;
border-radius: 6px !important;
transition: background-color 0.2s ease !important;
}
#voice-preset-container .gallery button:hover,
#voice-preset-container .gr-examples button:hover,
#voice-preset-container .examples button:hover,
#voice-preset-container button.sample:hover {
background-color: #baadc9 !important;
border-color: #a89ab8 !important;
}
body {
background: none !important;
}
body::before {
content: "";
position: fixed;
top: 0;
left: 0;
width: 100%;
height: 100%;
z-index: -1;
pointer-events: none;
background: url('https://i.postimg.cc/1smD6GPf/gradio-theme-rin2.png') center center / cover no-repeat;
}
"""
# Preset label -> natural-language voice/style instruction shown in the
# Instruction textbox.  The trailing comments link the pre-generated sample
# each instruction was rendered with.
VOICE_EXAMPLES = {
    "甘えた女の子 / ゆっくり": "かわいくて高い声の女の子が、甘えながらゆっくりのんびりしゃべってる感じの音声がほしい。",  # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/onnanoko_amai.wav
    "激怒する女性 / 感情爆発": "低くて激しい声の女性が、感情を抑えきれずに怒りを爆発させながら、早口でまくしたてるような声で読んでほしい。",  # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/angry.wav
    "落ち着いた男性 / 呆れ気味": "落ち着いた低めの声の男性が、相手の言動に少し呆れつつも感情を表に出さず、静かで平坦なトーンで淡々と話してるような声で読んでほしい。",  # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/guy_cool.wav
    "Calm man / mildly exasperated (EN)": "Read this in the voice of a calm, low-pitched man who sounds mildly exasperated but keeps his emotions in check, speaking in a flat, even tone without much expression.",  # Nothing
    "冷たい女性 / 憎しみ (1)": "低くて冷たい声の女性が、怒りを内に秘めながら憎しみのこもった口調で、淡々と早めに話してるような声で読んでほしい。",  # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/woman_cold_frustrated_2.wav
    "冷たい女性 / 憎しみ (2)": "低くて冷たい声の女性が、怒りを内に秘めながら憎しみのこもった口調で、淡々と早めに話してるような声で読んでほしい。",  # same text different result --> https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/woman_cold_frustrated.wav
}
# (label, instruction) pairs; used by the currently-commented-out
# random-description button wiring further down.
VOICE_PRESET_LIST = list(VOICE_EXAMPLES.items())
# label -> local file path (ship these in your Space repo under samples/)
PREGENERATED_AUDIO = {
    "甘えた女の子 / ゆっくり": "samples/onnanoko_amai.wav",
    "激怒する女性 / 感情爆発": "samples/angry.wav",
    "落ち着いた男性 / 呆れ気味": "samples/guy_cool.wav",
    "冷たい女性 / 憎しみ (1)": "samples/woman_cold_frustrated_2.wav",
    "冷たい女性 / 憎しみ (2)": "samples/woman_cold_frustrated.wav",
}
def load_pregenerated_to_main(label):
    """Examples-tab click handler.

    Copies the preset's instruction text into the Instruction box and loads
    the matching pre-generated WAV into the MAIN tab's audio player.

    Returns:
        tuple: (Instruction textbox update, (sr, samples) or None, status str).
    """
    description = VOICE_EXAMPLES.get(label, "")
    sample_path = PREGENERATED_AUDIO.get(label)

    # Guard clause: unknown label or sample file missing on disk.
    if not (sample_path and os.path.exists(sample_path)):
        return (
            gr.update(value=description),
            None,
            f"Status: No pre-generated audio found for: {label}"
        )

    rate, samples = wavfile.read(sample_path)
    # Looks like a (channels, frames) layout — flip it so the Audio component
    # receives (frames, channels).  TODO confirm samples are always <=2 ch.
    channel_first = (
        isinstance(samples, np.ndarray)
        and samples.ndim == 2
        and samples.shape[0] in (1, 2)
        and samples.shape[0] < samples.shape[1]
    )
    if channel_first:
        samples = samples.T

    return (
        gr.update(value=description),
        (rate, samples),
        f"Status: Loaded pre-generated sample: {label}"
    )
def run_generation_pipeline_client(
    raw_text,
    voice_description,
    cfg_text,
    cfg_style,
    min_temp,
    max_temp,
    top_k,
    min_p,
    dry_multiplier,
    seed,
):
    """Forward one generation request to the backend Space via gradio_client.

    Parameters mirror the UI controls: text to speak, style instruction,
    CFG scales, sampling knobs (temperatures, top-k, min-p), DRY repetition
    multiplier, and the RNG seed (-1 for random).

    Returns:
        tuple: ((sample_rate, np.ndarray) on success else None, status string).
    """
    # The module-level `client` is only created when a backend is configured
    # (its construction is currently commented out at the top of the file).
    # Without this guard every call raised NameError, which the broad except
    # below turned into the confusing status
    # "Connection error: name 'client' is not defined".
    try:
        backend = client
    except NameError:
        return None, "Status: Demo is currently closed (no backend client configured)"
    try:
        result = backend.predict(
            raw_text,
            voice_description,
            cfg_text,
            cfg_style,
            min_temp,
            max_temp,
            top_k,
            min_p,
            dry_multiplier,
            seed,
            "",  # extra positional slot expected by the remote API, unused here
            api_name="/run_generation_pipeline"
        )
        if result is None:
            return None, "Status: No response from server"
        if isinstance(result, (list, tuple)) and len(result) == 2:
            audio_result, status_msg = result
            if audio_result is not None:
                if isinstance(audio_result, str) and os.path.exists(audio_result):
                    # Server returned a path to a WAV file on local disk.
                    sr, data = wavfile.read(audio_result)
                elif isinstance(audio_result, (list, tuple)) and len(audio_result) >= 2:
                    # Server returned (sample_rate, samples) directly.
                    sr = audio_result[0]
                    data = np.array(audio_result[1]) if isinstance(audio_result[1], list) else audio_result[1]
                else:
                    # Unrecognized audio payload; surface the server's status.
                    return None, status_msg
                # Flip apparent channel-first 2-D arrays to (frames, channels)
                # for the gr.Audio component.
                if isinstance(data, np.ndarray) and data.ndim == 2 and data.shape[0] in (1, 2) and data.shape[0] < data.shape[1]:
                    data = data.T
                return (sr, data), status_msg
            return None, status_msg
        return None, "Status: Unexpected response format from server"
    except Exception as e:
        # Boundary handler: report any transport/decode failure in the UI.
        return None, f"Status: Connection error: {str(e)}"
# UI layout: three tabs (generation, pre-generated examples, model info) plus
# a page-load hook that pre-fills the default sample into the main tab.
with gr.Blocks(theme="Respair/Shiki@10.1.0", css=css) as demo:
    gr.Markdown(
        """
        <div style="text-align: left;">
        Demo is closed until further notice; thank you for using it. Feel free to check the pre-generated samples at the <code>Examples</code> tab. <br>
        </div>
        """
    )
    with gr.Tabs():
        with gr.TabItem("Speech Generation"):
            with gr.Row():
                with gr.Column(scale=2):
                    # Text to synthesize (hard limit of 125 characters).
                    text_input = gr.Textbox(
                        label="Text",
                        lines=5,
                        max_length=125,
                        value="準備もできましたけど、いきなり本題に入ると分かりにくいかもしれないので、まずは今日やることを短く整理して、手順を一つずつ確認しながら進めていきますね。途中で気になるところがあったら、その都度止めて大丈夫です。",
                    )
                    with gr.Column(elem_id="voice-desc-wrap"):
                        # Natural-language voice/style instruction for the model.
                        voice_desc_input = gr.Textbox(
                            label="Instruction",
                            value="低くて激しい声の女性が、感情を抑えきれずに怒りを爆発させながら、早口でまくしたてるような声で読んでほしい。",
                            lines=2,
                        )
                    with gr.Row(equal_height=False):
                        # Advanced generation controls, collapsed by default.
                        with gr.Accordion("----------------------------------⭐ 🛠️ ⭐", open=False):
                            seed_slider = gr.Slider(
                                label="Seed (-1 for random)", minimum=-1, maximum=2700000000, value=2700000000, step=1
                            )
                            gr.Markdown('<h3 style="color: #FFD700;">Style / CFG Parameters</h3>')
                            cfg_text_slider = gr.Slider(
                                label="CFG Text", minimum=0.5, maximum=3.0, value=1.15, step=0.05,
                            )
                            cfg_style_slider = gr.Slider(
                                label="CFG Style",
                                minimum=0.5, maximum=3.0, value=1.2, step=0.1,
                            )
                            gr.Markdown('<h3 style="color: #FFD700;">Sampling Parameters</h3>')
                            min_temp_slider = gr.Slider(
                                label="Min Temperature (adaptive)", minimum=0.0, maximum=2.0, value=0.25, step=0.05,
                            )
                            max_temp_slider = gr.Slider(
                                label="Max Temperature (adaptive)", minimum=0.0, maximum=2.0, value=1.0, step=0.05,
                            )
                            top_k_slider = gr.Slider(
                                label="Top K (0 = off)", minimum=0, maximum=200, value=0, step=5,
                            )
                            min_p_slider = gr.Slider(
                                label="Min P (0 = off)", minimum=0.0, maximum=1.0, value=0.0, step=0.01,
                            )
                            gr.Markdown('<h3 style="color: #FFD700;">Repetition Control</h3>')
                            dry_multiplier_slider = gr.Slider(
                                label="DRY Multiplier (0 = off)", minimum=0.0, maximum=5.0, value=0.8, step=0.1,
                            )
                            # gr.Markdown('<h3 style="color: #FFD700;">Other</h3>')
                with gr.Column(scale=1):
                    generate_button = gr.Button("🎤 Generate", variant="primary", size="lg")
                with gr.Column(scale=1):
                    status_output = gr.Textbox(label="Status", interactive=False)
                    # Shared player: filled by generation AND by the Examples tab.
                    audio_output = gr.Audio(
                        label="Generated Speech",
                        interactive=False
                    )
            # random_desc_button.click(
            #     fn=lambda: random.choice(VOICE_PRESET_LIST)[1],
            #     inputs=[],
            #     outputs=[voice_desc_input],
            # )
            generate_button.click(
                fn=run_generation_pipeline_client,
                inputs=[
                    text_input,
                    voice_desc_input,
                    cfg_text_slider,
                    cfg_style_slider,
                    min_temp_slider,
                    max_temp_slider,
                    top_k_slider,
                    min_p_slider,
                    dry_multiplier_slider,
                    seed_slider,
                ],
                outputs=[audio_output, status_output],
                concurrency_limit=4,
            )
        with gr.TabItem("Examples"):
            gr.HTML("""
            <div style="background-color: rgba(255, 255, 255, 0.025); padding: 20px; border-radius: 12px; backdrop-filter: blur(10px); box-shadow: 0 4px 6px rgba(0,0,0,0.5); margin-top: 8px;">
            <p style="color: #1a1a1a; font-weight: 500; line-height: 1.6; font-size: 14px; text-align: center; margin: 0;">
            クリックするとメインタブの音声プレイヤーにプリジェネ音声がロードされます。 / Click a preset to load the pre-generated audio into the main tab player.
            </p>
            </div>
            """)
            with gr.Row():
                with gr.Column(scale=1, elem_id="voice-preset-container"):
                    gr.HTML("""
                    <div style="background-color: rgba(255, 255, 255, 0.55); padding: 8px 12px; border-radius: 8px; backdrop-filter: blur(10px); box-shadow: 0 2px 4px rgba(0,0,0,0.08); text-align: center; max-width: 220px; margin: 0 auto 12px auto;">
                    <h3 style="color: #000000; margin: 0; font-size: 16px;">Examples</h3>
                    </div>
                    """)
                    # Hidden textbox that receives the clicked preset label and
                    # feeds it to load_pregenerated_to_main via run_on_click.
                    example_label_holder = gr.Textbox(visible=False)
                    gr.Examples(
                        examples=[[label] for label in PREGENERATED_AUDIO.keys()],
                        inputs=[example_label_holder],
                        outputs=[voice_desc_input, audio_output, status_output],  # <-- MAIN TAB outputs
                        fn=load_pregenerated_to_main,
                        label="Click to load a pre-generated sample",
                        cache_examples=False,
                        run_on_click=True,
                        examples_per_page=10,
                    )
        with gr.TabItem("Info"):
            gr.HTML('<h1 style="text-align: center;">🌸 Takane - Voice Design 🎨 </h1>')
            gr.HTML("""
            <div style="background-color: rgba(255, 255, 255, 0.525); padding: 30px; border-radius: 12px; backdrop-filter: blur(5px); max-width: 100%; box-shadow: 0 4px 6px rgba(0,0,0,0.5);">
            <div style="display: flex; gap: 24px; flex-wrap: wrap; justify-content: center;">
            <div style="flex: 1; min-width: 280px;">
            <h3 style="color: #000000; margin: 0 0 12px 0; font-size: 20px; text-align: center;">日本語</h3>
            <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 16px; margin: 0; text-align: center;">
            本モデルのバックボーンは
            <a href="https://huggingface.co/spaces/Respair/Takane" target="_blank" rel="noopener noreferrer"
            style="color: #b45309; text-decoration: none; font-weight: 600;">
            Takane
            </a>
            を改良したもので、ネイティブ 44.1kHz コーデックを備えた完全自回帰のエンコーダ・デコーダ型 Transformer です。<br><br>
            <strong>CFG Style</strong> を上げると指示への追従が強くなりますが、上げすぎると過剰な条件付け(over-conditioning)が起きて音質が劣化する場合があります。
            </p>
            </div>
            <div style="flex: 1; min-width: 280px;">
            <h3 style="color: #000000; margin: 0 0 12px 0; font-size: 20px; text-align: center;">English</h3>
            <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 16px; margin: 0; text-align: center;">
            The backbone is a modified version of
            <a href="https://huggingface.co/spaces/Respair/Takane" target="_blank" rel="noopener noreferrer"
            style="color: #b45309; text-decoration: none; font-weight: 600;">
            Takane
            </a>,
            a fully autoregressive encoder-decoder transformer with a native 44.1khz codec.<br><br>
            Raise <strong>CFG Style</strong> if you want stronger adherence; pushing it too high can cause over-conditioning and degrade quality. <br><br>
            <code>This model is only in Japanese</code>, if you enjoy anime, this is yours to play with.
            </p>
            </div>
            </div>
            </div>
            """)

    def load_default():
        # Page-load hook: pre-fill the Instruction box and audio player with
        # one fixed preset.  Mirrors the logic of load_pregenerated_to_main.
        label = "激怒する女性 / 感情爆発"
        desc = VOICE_EXAMPLES.get(label, "")
        path = PREGENERATED_AUDIO.get(label)
        if path and os.path.exists(path):
            sr, data = wavfile.read(path)
            # Flip apparent channel-first 2-D arrays to (frames, channels).
            if isinstance(data, np.ndarray) and data.ndim == 2 and data.shape[0] in (1, 2) and data.shape[0] < data.shape[1]:
                data = data.T
            return gr.update(value=desc), (sr, data), gr.update(value=f"Status: Loaded default sample: {label}")
        return gr.update(value=desc), None, gr.update(value=f"Status: Default sample missing: {label}")

    demo.load(
        fn=load_default,
        inputs=None,
        outputs=[voice_desc_input, audio_output, status_output],
    )
if __name__ == "__main__":
    # Queue caps pending jobs at 15; api_open=False hides the endpoints from
    # the public API page.
    demo.queue(api_open=False, max_size=15).launch()