# Voice_Design / app.py — Gradio front-end for the Takane voice-design demo.
import gradio as gr
from gradio_client import Client
import os
import random
import numpy as np
import scipy.io.wavfile as wavfile
# try:
# client = Client(os.environ['src'])
# except:
# client = Client("http://localhost:7861/")
# Custom CSS injected into gr.Blocks: placeholder contrast, inline-code
# highlighting, checkbox/accordion label colors, preset-button styling in the
# Examples tab, and a fixed full-page background image (body::before layer).
css = """
.gradio-container input::placeholder,
.gradio-container textarea::placeholder {
color: #333333 !important;
}
code {
background-color: #ffde9f;
padding: 2px 4px;
border-radius: 3px;
}
.gr-checkbox label span,
.gr-check-radio label span,
[data-testid="checkbox"] label span,
.checkbox-container span {
color: #ECF2F7 !important;
}
#advanced-accordion > button,
#advanced-accordion > button span,
#advanced-accordion > div > button,
#advanced-accordion > div > button span,
#advanced-accordion .label-wrap,
#advanced-accordion .label-wrap span,
#advanced-accordion > .open,
#advanced-accordion > .open span {
color: #FFD700 !important;
}
#voice-preset-container .gallery button,
#voice-preset-container .gr-examples button,
#voice-preset-container .examples button,
#voice-preset-container button.sample {
background-color: #c8b8d4 !important;
border: 1px solid #b8a8c4 !important;
color: #1a1a1a !important;
font-weight: 500 !important;
margin: 4px !important;
padding: 10px 14px !important;
border-radius: 6px !important;
transition: background-color 0.2s ease !important;
}
#voice-preset-container .gallery button:hover,
#voice-preset-container .gr-examples button:hover,
#voice-preset-container .examples button:hover,
#voice-preset-container button.sample:hover {
background-color: #baadc9 !important;
border-color: #a89ab8 !important;
}
body {
background: none !important;
}
body::before {
content: "";
position: fixed;
top: 0;
left: 0;
width: 100%;
height: 100%;
z-index: -1;
pointer-events: none;
background: url('https://i.postimg.cc/1smD6GPf/gradio-theme-rin2.png') center center / cover no-repeat;
}
"""
# Preset label -> natural-language voice/style instruction shown in the
# Instruction textbox.  The trailing comments link the pre-generated sample
# each instruction was rendered with.
VOICE_EXAMPLES = {
    "甘えた女の子 / ゆっくり": "かわいくて高い声の女の子が、甘えながらゆっくりのんびりしゃべってる感じの音声がほしい。",  # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/onnanoko_amai.wav
    "激怒する女性 / 感情爆発": "低くて激しい声の女性が、感情を抑えきれずに怒りを爆発させながら、早口でまくしたてるような声で読んでほしい。",  # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/angry.wav
    "落ち着いた男性 / 呆れ気味": "落ち着いた低めの声の男性が、相手の言動に少し呆れつつも感情を表に出さず、静かで平坦なトーンで淡々と話してるような声で読んでほしい。",  # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/guy_cool.wav
    "Calm man / mildly exasperated (EN)": "Read this in the voice of a calm, low-pitched man who sounds mildly exasperated but keeps his emotions in check, speaking in a flat, even tone without much expression.",  # Nothing
    "冷たい女性 / 憎しみ (1)": "低くて冷たい声の女性が、怒りを内に秘めながら憎しみのこもった口調で、淡々と早めに話してるような声で読んでほしい。",  # https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/woman_cold_frustrated_2.wav
    "冷たい女性 / 憎しみ (2)": "低くて冷たい声の女性が、怒りを内に秘めながら憎しみのこもった口調で、淡々と早めに話してるような声で読んでほしい。",  # same text different result --> https://huggingface.co/spaces/Respair/Voice_Design/blob/main/samples/woman_cold_frustrated.wav
}
# (label, instruction) pairs; used by the currently-commented-out
# random-description button wiring further down.
VOICE_PRESET_LIST = list(VOICE_EXAMPLES.items())
# label -> local file path (ship these in your Space repo under samples/)
PREGENERATED_AUDIO = {
    "甘えた女の子 / ゆっくり": "samples/onnanoko_amai.wav",
    "激怒する女性 / 感情爆発": "samples/angry.wav",
    "落ち着いた男性 / 呆れ気味": "samples/guy_cool.wav",
    "冷たい女性 / 憎しみ (1)": "samples/woman_cold_frustrated_2.wav",
    "冷たい女性 / 憎しみ (2)": "samples/woman_cold_frustrated.wav",
}
def load_pregenerated_to_main(label):
    """Examples-tab click handler.

    Copies the preset's instruction text into the Instruction box and loads
    the matching pre-generated WAV into the MAIN tab's audio player.

    Returns:
        tuple: (Instruction textbox update, (sr, samples) or None, status str).
    """
    description = VOICE_EXAMPLES.get(label, "")
    sample_path = PREGENERATED_AUDIO.get(label)

    # Guard clause: unknown label or sample file missing on disk.
    if not (sample_path and os.path.exists(sample_path)):
        return (
            gr.update(value=description),
            None,
            f"Status: No pre-generated audio found for: {label}"
        )

    rate, samples = wavfile.read(sample_path)
    # Looks like a (channels, frames) layout — flip it so the Audio component
    # receives (frames, channels).  TODO confirm samples are always <=2 ch.
    channel_first = (
        isinstance(samples, np.ndarray)
        and samples.ndim == 2
        and samples.shape[0] in (1, 2)
        and samples.shape[0] < samples.shape[1]
    )
    if channel_first:
        samples = samples.T

    return (
        gr.update(value=description),
        (rate, samples),
        f"Status: Loaded pre-generated sample: {label}"
    )
def run_generation_pipeline_client(
    raw_text,
    voice_description,
    cfg_text,
    cfg_style,
    min_temp,
    max_temp,
    top_k,
    min_p,
    dry_multiplier,
    seed,
):
    """Forward one generation request to the backend Space via gradio_client.

    Parameters mirror the UI controls: text to speak, style instruction,
    CFG scales, sampling knobs (temperatures, top-k, min-p), DRY repetition
    multiplier, and the RNG seed (-1 for random).

    Returns:
        tuple: ((sample_rate, np.ndarray) on success else None, status string).
    """
    # The module-level `client` is only created when a backend is configured
    # (its construction is currently commented out at the top of the file).
    # Without this guard every call raised NameError, which the broad except
    # below turned into the confusing status
    # "Connection error: name 'client' is not defined".
    try:
        backend = client
    except NameError:
        return None, "Status: Demo is currently closed (no backend client configured)"
    try:
        result = backend.predict(
            raw_text,
            voice_description,
            cfg_text,
            cfg_style,
            min_temp,
            max_temp,
            top_k,
            min_p,
            dry_multiplier,
            seed,
            "",  # extra positional slot expected by the remote API, unused here
            api_name="/run_generation_pipeline"
        )
        if result is None:
            return None, "Status: No response from server"
        if isinstance(result, (list, tuple)) and len(result) == 2:
            audio_result, status_msg = result
            if audio_result is not None:
                if isinstance(audio_result, str) and os.path.exists(audio_result):
                    # Server returned a path to a WAV file on local disk.
                    sr, data = wavfile.read(audio_result)
                elif isinstance(audio_result, (list, tuple)) and len(audio_result) >= 2:
                    # Server returned (sample_rate, samples) directly.
                    sr = audio_result[0]
                    data = np.array(audio_result[1]) if isinstance(audio_result[1], list) else audio_result[1]
                else:
                    # Unrecognized audio payload; surface the server's status.
                    return None, status_msg
                # Flip apparent channel-first 2-D arrays to (frames, channels)
                # for the gr.Audio component.
                if isinstance(data, np.ndarray) and data.ndim == 2 and data.shape[0] in (1, 2) and data.shape[0] < data.shape[1]:
                    data = data.T
                return (sr, data), status_msg
            return None, status_msg
        return None, "Status: Unexpected response format from server"
    except Exception as e:
        # Boundary handler: report any transport/decode failure in the UI.
        return None, f"Status: Connection error: {str(e)}"
# UI layout: three tabs (generation, pre-generated examples, model info) plus
# a page-load hook that pre-fills the default sample into the main tab.
with gr.Blocks(theme="Respair/Shiki@10.1.0", css=css) as demo:
    gr.Markdown(
        """
        <div style="text-align: left;">
        Demo is closed until further notice; thank you for using it. Feel free to check the pre-generated samples at the <code>Examples</code> tab. <br>
        </div>
        """
    )
    with gr.Tabs():
        with gr.TabItem("Speech Generation"):
            with gr.Row():
                with gr.Column(scale=2):
                    # Text to synthesize (hard limit of 125 characters).
                    text_input = gr.Textbox(
                        label="Text",
                        lines=5,
                        max_length=125,
                        value="準備もできましたけど、いきなり本題に入ると分かりにくいかもしれないので、まずは今日やることを短く整理して、手順を一つずつ確認しながら進めていきますね。途中で気になるところがあったら、その都度止めて大丈夫です。",
                    )
                    with gr.Column(elem_id="voice-desc-wrap"):
                        # Natural-language voice/style instruction for the model.
                        voice_desc_input = gr.Textbox(
                            label="Instruction",
                            value="低くて激しい声の女性が、感情を抑えきれずに怒りを爆発させながら、早口でまくしたてるような声で読んでほしい。",
                            lines=2,
                        )
                    with gr.Row(equal_height=False):
                        # Advanced generation controls, collapsed by default.
                        with gr.Accordion("----------------------------------⭐ 🛠️ ⭐", open=False):
                            seed_slider = gr.Slider(
                                label="Seed (-1 for random)", minimum=-1, maximum=2700000000, value=2700000000, step=1
                            )
                            gr.Markdown('<h3 style="color: #FFD700;">Style / CFG Parameters</h3>')
                            cfg_text_slider = gr.Slider(
                                label="CFG Text", minimum=0.5, maximum=3.0, value=1.15, step=0.05,
                            )
                            cfg_style_slider = gr.Slider(
                                label="CFG Style",
                                minimum=0.5, maximum=3.0, value=1.2, step=0.1,
                            )
                            gr.Markdown('<h3 style="color: #FFD700;">Sampling Parameters</h3>')
                            min_temp_slider = gr.Slider(
                                label="Min Temperature (adaptive)", minimum=0.0, maximum=2.0, value=0.25, step=0.05,
                            )
                            max_temp_slider = gr.Slider(
                                label="Max Temperature (adaptive)", minimum=0.0, maximum=2.0, value=1.0, step=0.05,
                            )
                            top_k_slider = gr.Slider(
                                label="Top K (0 = off)", minimum=0, maximum=200, value=0, step=5,
                            )
                            min_p_slider = gr.Slider(
                                label="Min P (0 = off)", minimum=0.0, maximum=1.0, value=0.0, step=0.01,
                            )
                            gr.Markdown('<h3 style="color: #FFD700;">Repetition Control</h3>')
                            dry_multiplier_slider = gr.Slider(
                                label="DRY Multiplier (0 = off)", minimum=0.0, maximum=5.0, value=0.8, step=0.1,
                            )
                            # gr.Markdown('<h3 style="color: #FFD700;">Other</h3>')
                with gr.Column(scale=1):
                    generate_button = gr.Button("🎤 Generate", variant="primary", size="lg")
                with gr.Column(scale=1):
                    status_output = gr.Textbox(label="Status", interactive=False)
                    # Shared player: filled by generation AND by the Examples tab.
                    audio_output = gr.Audio(
                        label="Generated Speech",
                        interactive=False
                    )
            # random_desc_button.click(
            #     fn=lambda: random.choice(VOICE_PRESET_LIST)[1],
            #     inputs=[],
            #     outputs=[voice_desc_input],
            # )
            generate_button.click(
                fn=run_generation_pipeline_client,
                inputs=[
                    text_input,
                    voice_desc_input,
                    cfg_text_slider,
                    cfg_style_slider,
                    min_temp_slider,
                    max_temp_slider,
                    top_k_slider,
                    min_p_slider,
                    dry_multiplier_slider,
                    seed_slider,
                ],
                outputs=[audio_output, status_output],
                concurrency_limit=4,
            )
        with gr.TabItem("Examples"):
            gr.HTML("""
            <div style="background-color: rgba(255, 255, 255, 0.025); padding: 20px; border-radius: 12px; backdrop-filter: blur(10px); box-shadow: 0 4px 6px rgba(0,0,0,0.5); margin-top: 8px;">
            <p style="color: #1a1a1a; font-weight: 500; line-height: 1.6; font-size: 14px; text-align: center; margin: 0;">
            クリックするとメインタブの音声プレイヤーにプリジェネ音声がロードされます。 / Click a preset to load the pre-generated audio into the main tab player.
            </p>
            </div>
            """)
            with gr.Row():
                with gr.Column(scale=1, elem_id="voice-preset-container"):
                    gr.HTML("""
                    <div style="background-color: rgba(255, 255, 255, 0.55); padding: 8px 12px; border-radius: 8px; backdrop-filter: blur(10px); box-shadow: 0 2px 4px rgba(0,0,0,0.08); text-align: center; max-width: 220px; margin: 0 auto 12px auto;">
                    <h3 style="color: #000000; margin: 0; font-size: 16px;">Examples</h3>
                    </div>
                    """)
                    # Hidden textbox that receives the clicked preset label and
                    # feeds it to load_pregenerated_to_main via run_on_click.
                    example_label_holder = gr.Textbox(visible=False)
                    gr.Examples(
                        examples=[[label] for label in PREGENERATED_AUDIO.keys()],
                        inputs=[example_label_holder],
                        outputs=[voice_desc_input, audio_output, status_output],  # <-- MAIN TAB outputs
                        fn=load_pregenerated_to_main,
                        label="Click to load a pre-generated sample",
                        cache_examples=False,
                        run_on_click=True,
                        examples_per_page=10,
                    )
        with gr.TabItem("Info"):
            gr.HTML('<h1 style="text-align: center;">🌸 Takane - Voice Design 🎨 </h1>')
            gr.HTML("""
            <div style="background-color: rgba(255, 255, 255, 0.525); padding: 30px; border-radius: 12px; backdrop-filter: blur(5px); max-width: 100%; box-shadow: 0 4px 6px rgba(0,0,0,0.5);">
            <div style="display: flex; gap: 24px; flex-wrap: wrap; justify-content: center;">
            <div style="flex: 1; min-width: 280px;">
            <h3 style="color: #000000; margin: 0 0 12px 0; font-size: 20px; text-align: center;">日本語</h3>
            <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 16px; margin: 0; text-align: center;">
            本モデルのバックボーンは
            <a href="https://huggingface.co/spaces/Respair/Takane" target="_blank" rel="noopener noreferrer"
            style="color: #b45309; text-decoration: none; font-weight: 600;">
            Takane
            </a>
            を改良したもので、ネイティブ 44.1kHz コーデックを備えた完全自回帰のエンコーダ・デコーダ型 Transformer です。<br><br>
            <strong>CFG Style</strong> を上げると指示への追従が強くなりますが、上げすぎると過剰な条件付け(over-conditioning)が起きて音質が劣化する場合があります。
            </p>
            </div>
            <div style="flex: 1; min-width: 280px;">
            <h3 style="color: #000000; margin: 0 0 12px 0; font-size: 20px; text-align: center;">English</h3>
            <p style="color: #1a1a1a; font-weight: 500; line-height: 1.8; font-size: 16px; margin: 0; text-align: center;">
            The backbone is a modified version of
            <a href="https://huggingface.co/spaces/Respair/Takane" target="_blank" rel="noopener noreferrer"
            style="color: #b45309; text-decoration: none; font-weight: 600;">
            Takane
            </a>,
            a fully autoregressive encoder-decoder transformer with a native 44.1khz codec.<br><br>
            Raise <strong>CFG Style</strong> if you want stronger adherence; pushing it too high can cause over-conditioning and degrade quality. <br><br>
            <code>This model is only in Japanese</code>, if you enjoy anime, this is yours to play with.
            </p>
            </div>
            </div>
            </div>
            """)

    def load_default():
        # Page-load hook: pre-fill the Instruction box and audio player with
        # one fixed preset.  Mirrors the logic of load_pregenerated_to_main.
        label = "激怒する女性 / 感情爆発"
        desc = VOICE_EXAMPLES.get(label, "")
        path = PREGENERATED_AUDIO.get(label)
        if path and os.path.exists(path):
            sr, data = wavfile.read(path)
            # Flip apparent channel-first 2-D arrays to (frames, channels).
            if isinstance(data, np.ndarray) and data.ndim == 2 and data.shape[0] in (1, 2) and data.shape[0] < data.shape[1]:
                data = data.T
            return gr.update(value=desc), (sr, data), gr.update(value=f"Status: Loaded default sample: {label}")
        return gr.update(value=desc), None, gr.update(value=f"Status: Default sample missing: {label}")

    demo.load(
        fn=load_default,
        inputs=None,
        outputs=[voice_desc_input, audio_output, status_output],
    )
if __name__ == "__main__":
    # Queue caps pending jobs at 15; api_open=False hides the endpoints from
    # the public API page.
    demo.queue(api_open=False, max_size=15).launch()