# Install dependencies in application code, as we don't have access to a GPU at build time
# Thanks to https://huggingface.co/Steveeeeeeen for their code to handle this!
import os
import shlex
import subprocess
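# flash-attn skips its CUDA build here (FLASH_ATTENTION_SKIP_CUDA_BUILD), and mamba-ssm / causal-conv1d
# are installed from prebuilt wheels matching the Space runtime (CUDA 12, torch 2.4, Python 3.10).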
subprocess.run(shlex.split("pip install flash-attn --no-build-isolation"), env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"}, check=True)
subprocess.run(shlex.split("pip install https://github.com/state-spaces/mamba/releases/download/v2.2.4/mamba_ssm-2.2.4+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"), check=True)
subprocess.run(shlex.split("pip install https://github.com/Dao-AILab/causal-conv1d/releases/download/v1.5.0.post8/causal_conv1d-1.5.0.post8+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl"), check=True)
import spaces
import gradio as gr
import numpy as np
from typing import Tuple, Dict, Any, Optional
from taproot import Task
# Configuration
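# Hugging Face Spaces sets SYSTEM=spaces in the environment; this flag gates ZeroGPU-specific
# behavior and the input character limit below.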
is_hf_spaces = os.getenv("SYSTEM", "") == "spaces"
max_characters = 2000
header_markdown = """
# Zonos v0.1
State-of-the-art text-to-speech model. [[model]](https://huggingface.co/collections/Zyphra/zonos-v01-67ac661c85e1898670823b4f) [[blog]](https://www.zyphra.com/post/beta-release-of-zonos-v0-1) [[Zyphra Audio (hosted service)]](https://maia.zyphra.com/sign-in?redirect_url=https%3A%2F%2Fmaia.zyphra.com%2Faudio)
## Unleashed
Use this space to generate long-form speech up to roughly two minutes in length. To generate speech of unlimited length, clone this space and run it locally.
### Tips
- If you are generating more than one chunk of audio, you should supply speaker conditioning. Otherwise, each chunk will have a slightly different voice.
- When providing prefix audio, include the text of the prefix audio in your speech text to ensure a smooth transition.
- The cleaner the speaker audio, the better the speaker conditioning will be. However, speaker audio is only sampled at 16kHz, so there is no need to supply high-sample-rate recordings. Prefix audio, by contrast, should be high quality, as it is sampled at the full 44.1kHz.
- The appropriate ranges of Speaking Rate and Pitch STD are highly dependent on the speaker audio. Start with the defaults and adjust as needed.
- The emotion sliders do not behave entirely intuitively and require some experimentation to achieve the desired effect.
""".strip()
# Create pipelines, downloading required files as necessary
speech_enhancement = Task.get("speech-enhancement", model="deep-filter-net-v3", available_only=False)
speech_enhancement.download_required_files(text_callback=print)
hybrid_task = Task.get("speech-synthesis", model="zonos-hybrid", available_only=False)
hybrid_task.download_required_files(text_callback=print)
hybrid_pipe = hybrid_task()
hybrid_pipe.load(allow_optional=True)
transformer_task = Task.get("speech-synthesis", model="zonos-transformer", available_only=False)
transformer_task.download_required_files(text_callback=print)
transformer_pipe = transformer_task()
if is_hf_spaces:
# When using ZeroGPU, all pipelines must be loaded up front; they cannot be loaded/unloaded per request
transformer_pipe.load(allow_optional=True)
# Global state
pipelines = {
"Zonos Transformer v0.1": transformer_pipe,
"Zonos Hybrid v0.1": hybrid_pipe,
}
pipeline_names = list(pipelines.keys())
supported_language_codes = hybrid_pipe.supported_languages # Same for both pipes
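# When running locally, a pipeline can also be invoked directly without the Gradio UI, e.g.
# (illustrative sketch; the keyword arguments mirror the generate_audio call below):
#   wav = hybrid_pipe(text="Hello from Zonos.", language="en-us", seed=42, output_format="float")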
# Model toggle
def update_ui(pipeline_choice: str) -> Tuple[Dict[str, Any], ...]:
"""
Dynamically show/hide UI elements based on the model's conditioners.
"""
if not is_hf_spaces:
# When not running on ZeroGPU, pipelines can be loaded/unloaded on demand
for pipeline_name, pipeline in pipelines.items():
if pipeline_name == pipeline_choice:
pipeline.load()
else:
pipeline.unload()
pipe = pipelines[pipeline_choice]
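# Each model variant exposes a different set of prefix conditioners; show only the controls it supports.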
cond_names = [c.name for c in pipe.pretrained.model.prefix_conditioner.conditioners]
vqscore_update = gr.update(visible=("vqscore_8" in cond_names))
emotion_update = gr.update(visible=("emotion" in cond_names))
fmax_update = gr.update(visible=("fmax" in cond_names))
pitch_update = gr.update(visible=("pitch_std" in cond_names))
speaking_rate_update = gr.update(visible=("speaking_rate" in cond_names))
dnsmos_update = gr.update(visible=("dnsmos_ovrl" in cond_names))
speaker_noised_update = gr.update(visible=("speaker_noised" in cond_names))
return (
vqscore_update,
emotion_update,
fmax_update,
pitch_update,
speaking_rate_update,
dnsmos_update,
speaker_noised_update,
)
# Invocation method
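# The @spaces.GPU decorator requests a ZeroGPU worker for up to 180 seconds per call; generation must complete within that window.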
@spaces.GPU(duration=180)
def generate_audio(
pipeline_choice: str,
text: str,
language: str,
speaker_audio: Optional[str],
prefix_audio: Optional[str],
e1: float,
e2: float,
e3: float,
e4: float,
e5: float,
e6: float,
e7: float,
e8: float,
vq_single: float,
fmax: float,
pitch_std: float,
speaking_rate: float,
dnsmos_ovrl: float,
speaker_noised: bool,
cfg_scale: float,
min_p: float,
seed: int,
max_chunk_length: int,
cross_fade_duration: float,
punctuation_pause_duration: float,
target_rms: float,
randomize_seed: bool,
skip_dnsmos: bool,
skip_vqscore: bool,
skip_fmax: bool,
skip_pitch: bool,
skip_speaking_rate: bool,
skip_emotion: bool,
skip_speaker: bool,
speaker_pitch_shift: float,
speaker_equalize: bool,
speaker_enhance: bool,
prefix_equalize: bool,
prefix_enhance: bool,
enhance: bool,
progress=gr.Progress(),
) -> Tuple[Tuple[int, np.ndarray[Any, Any]], int]:
"""
Generates audio based on the provided UI parameters.
"""
selected_pipeline = pipelines[pipeline_choice]
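# Draw a fresh 32-bit seed when requested so the value actually used can be reported back to the UI.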
if randomize_seed:
seed = np.random.randint(0, 2**32)
def on_progress(step: int, total: int) -> None:
progress((step, total))
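# Forward the pipeline's per-step progress to the Gradio progress bar.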
selected_pipeline.on_progress(on_progress)
try:
wav_out = selected_pipeline(
text=text,
enhance=enhance,
language=language,
reference_audio=speaker_audio,
reference_audio_pitch_shift=speaker_pitch_shift,
equalize_reference_audio=speaker_equalize,
enhance_reference_audio=speaker_enhance,
prefix_audio=prefix_audio,
equalize_prefix_audio=prefix_equalize,
enhance_prefix_audio=prefix_enhance,
seed=seed,
max_chunk_length=max_chunk_length,
cross_fade_duration=cross_fade_duration,
punctuation_pause_duration=punctuation_pause_duration,
target_rms=target_rms,
cfg_scale=cfg_scale,
min_p=min_p,
fmax=fmax,
pitch_std=pitch_std,
emotion_happiness=e1,
emotion_sadness=e2,
emotion_disgust=e3,
emotion_fear=e4,
emotion_surprise=e5,
emotion_anger=e6,
emotion_other=e7,
emotion_neutral=e8,
speaking_rate=speaking_rate,
vq_score=vq_single,
speaker_noised=speaker_noised,
dnsmos=dnsmos_ovrl,
skip_speaker=skip_speaker,
skip_dnsmos=skip_dnsmos,
skip_vq_score=skip_vqscore,
skip_fmax=skip_fmax,
skip_pitch=skip_pitch,
skip_speaking_rate=skip_speaking_rate,
skip_emotion=skip_emotion,
output_format="float",
)
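# DeepFilterNet enhancement outputs 48kHz audio; unenhanced Zonos output is 44.1kHz.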
return (
(
48000 if enhance else 44100,
wav_out.squeeze().numpy()
),
seed
)
finally:
selected_pipeline.off_progress()
# Interface
if __name__ == "__main__":
with gr.Blocks() as demo:
with gr.Row():
with gr.Column(scale=3):
gr.Markdown(header_markdown)
gr.Image(
value="https://raw.githubusercontent.com/Zyphra/Zonos/refs/heads/main/assets/ZonosHeader.png",
container=False,
interactive=False,
show_label=False,
show_share_button=False,
show_fullscreen_button=False,
show_download_button=False,
)
with gr.Row(equal_height=True):
pipeline_choice = gr.Dropdown(
choices=pipeline_names,
value=pipeline_names[0],
label="Zonos Model Variant",
)
language = gr.Dropdown(
choices=supported_language_codes,
value="en-us",
label="Language",
)
enhanced_checkbox = gr.Checkbox(
value=True,
label="Enhance Output with DeepFilterNet"
)
with gr.Row():
if not is_hf_spaces:
limit_text = "Unlimited"
else:
limit_text = f"Up to {max_characters}"
text = gr.Textbox(
label=f"Speech Text ({limit_text} Characters)",
value="Zonos is a state-of-the-art text-to-speech model that generates expressive and natural-sounding audio with robust customization options.",
lines=4,
max_lines=20,
max_length=max_characters if is_hf_spaces else None,
)
with gr.Row():
generate_button = gr.Button("Generate Audio")
with gr.Row():
output_audio = gr.Audio(label="Generated Audio", type="numpy", autoplay=True)
with gr.Row():
gr.Markdown("## Long-Form Parameters")
with gr.Column(variant="panel"):
with gr.Row(equal_height=True):
max_chunk_length = gr.Slider(
1, 300, 150, 1, label="Max Chunk Length (Characters)",
info="The maximum number of characters to generate in a single chunk. Zonos itself has a much higher limit than this, but consistency breaks down as you go past ~200 characters or so."
)
target_rms = gr.Slider(
0.0, 1.0, 0.10, 0.01, label="Target RMS",
info="The target RMS (root-mean-square) amplitude for the generated audio. Each chunk will have its loudness normalized to this value to ensure consistent volume levels."
)
with gr.Row(equal_height=True):
punctuation_pause_duration = gr.Slider(
0, 1, 0.10, 0.01, label="Punctuation Pause Duration (Seconds)",
info="Pause duration to add after a chunk that ends with punctuation. Full-stop punctuation (periods) will have the entire length, while shorter pauses will use half of this duration."
)
cross_fade_duration = gr.Slider(
0, 1, 0.15, 0.01, label="Chunk Cross-Fade Duration (Seconds)",
info="The duration of the cross-fade between chunks. This helps to smooth out transitions between chunks. In general, this should be set to a value greater than the pause duration."
)
with gr.Row():
gr.Markdown("## Generation Parameters")
with gr.Row(variant="panel", equal_height=True):
with gr.Column():
prefix_audio = gr.Audio(
label="Optional Prefix Audio (continue from this audio)",
type="filepath",
)
prefix_equalize_checkbox = gr.Checkbox(label="Equalize Prefix Audio", value=True)
prefix_enhance_checkbox = gr.Checkbox(label="Enhance Prefix Audio with DeepFilterNet", value=True)
with gr.Column(scale=3):
cfg_scale_slider = gr.Slider(1.0, 5.0, 2.0, 0.1, label="CFG Scale")
min_p_slider = gr.Slider(0.0, 1.0, 0.15, 0.01, label="Min P")
seed_number = gr.Number(label="Seed", value=6475309, precision=0)
randomize_seed_toggle = gr.Checkbox(label="Randomize Seed", value=True)
with gr.Row():
gr.Markdown(
"## Conditioning Parameters\nAll of these types of conditioning are optional and can be disabled."
)
with gr.Row(variant="panel", equal_height=True) as speaker_row:
with gr.Column():
speaker_uncond = gr.Checkbox(label="Skip Speaker")
speaker_noised_checkbox = gr.Checkbox(
label="Speaker Noised",
value=False,
interactive=False,
info="'Speaker Noised' is a conditioning value that the model understands, not a processing step. Check this box if your input audio is noisy."
)
speaker_equalize_checkbox = gr.Checkbox(label="Equalize Speaker Audio", value=True)
speaker_enhance_checkbox = gr.Checkbox(label="Enhance Speaker Audio with DeepFilterNet", value=True)
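# Enhancing the speaker audio also denoises it, so lock 'Speaker Noised' off while enhancement is enabled.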
def on_enhanced_change(use_enhance: bool) -> Dict[str, Any]:
update_dict = {"interactive": not use_enhance}
if use_enhance:
update_dict["value"] = False
return gr.update(**update_dict)
speaker_enhance_checkbox.change(
fn=on_enhanced_change,
inputs=[speaker_enhance_checkbox],
outputs=[speaker_noised_checkbox]
)
speaker_pitch_shift = gr.Slider(
-1200, 1200, -44.99, 0.01, label="Speaker Pitch Shift (Cents)",
info="A pitch shift to apply to speaker audio before extracting embeddings. A slight down-shift of ~45 cents tends to produce a more accurate voice cloning."
)
speaker_audio = gr.Audio(
label="Optional Speaker Audio (for cloning)",
type="filepath",
scale=3,
)
with gr.Row(variant="panel", equal_height=True) as emotion_row:
emotion_uncond = gr.Checkbox(label="Skip Emotion")
with gr.Column(scale=3):
with gr.Row():
emotion1 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Happiness")
emotion2 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Sadness")
emotion3 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Disgust")
emotion4 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Fear")
with gr.Row():
emotion5 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Surprise")
emotion6 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Anger")
emotion7 = gr.Slider(0.0, 1.0, 0.025, 0.001, label="Other")
emotion8 = gr.Slider(0.0, 1.0, 0.307, 0.001, label="Neutral")
with gr.Row(variant="panel", equal_height=True) as dnsmos_row:
dnsmos_uncond = gr.Checkbox(label="Skip DNSMOS")
dnsmos_slider = gr.Slider(
1.0,
5.0,
value=4.0,
step=0.1,
label="Deep Noise Suppression Mean Opinion Score [arXiv 2010.15258]",
scale=3,
)
with gr.Row(variant="panel", equal_height=True) as vq_score_row:
vq_uncond = gr.Checkbox(label="Skip VQScore")
vq_single_slider = gr.Slider(
0.5, 0.8, 0.78, 0.01, label="VQScore [arXiv 2402.16321]", scale=3
)
with gr.Row(variant="panel", equal_height=True) as fmax_row:
fmax_uncond = gr.Checkbox(label="Skip Fmax")
fmax_slider = gr.Slider(
0, 22050, value=22050, step=1, label="Fmax (Hz)", scale=3
)
with gr.Row(variant="panel", equal_height=True) as pitch_row:
pitch_uncond = gr.Checkbox(label="Skip Pitch")
pitch_std_slider = gr.Slider(
0.0, 300.0, value=20.0, step=1, label="Pitch Standard Deviation", scale=3
)
with gr.Row(variant="panel", equal_height=True) as speaking_rate_row:
speaking_rate_uncond = gr.Checkbox(label="Skip Speaking Rate")
speaking_rate_slider = gr.Slider(
5.0, 30.0, value=15.0, step=0.5, label="Speaking Rate", scale=3
)
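# Show or hide conditioning controls whenever the selected model variant changes.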
pipeline_choice.change(
fn=update_ui,
inputs=[pipeline_choice],
outputs=[
vq_score_row,
emotion_row,
fmax_row,
pitch_row,
speaking_rate_row,
dnsmos_row,
speaker_noised_checkbox,
],
)
# Trigger UI update on load
demo.load(
fn=update_ui,
inputs=[pipeline_choice],
outputs=[
vq_score_row,
emotion_row,
fmax_row,
pitch_row,
speaking_rate_row,
dnsmos_row,
speaker_noised_checkbox,
],
)
# Generate audio on button click
generate_button.click(
fn=generate_audio,
inputs=[
pipeline_choice,
text,
language,
speaker_audio,
prefix_audio,
emotion1,
emotion2,
emotion3,
emotion4,
emotion5,
emotion6,
emotion7,
emotion8,
vq_single_slider,
fmax_slider,
pitch_std_slider,
speaking_rate_slider,
dnsmos_slider,
speaker_noised_checkbox,
cfg_scale_slider,
min_p_slider,
seed_number,
max_chunk_length,
cross_fade_duration,
punctuation_pause_duration,
target_rms,
randomize_seed_toggle,
dnsmos_uncond,
vq_uncond,
fmax_uncond,
pitch_uncond,
speaking_rate_uncond,
emotion_uncond,
speaker_uncond,
speaker_pitch_shift,
speaker_equalize_checkbox,
speaker_enhance_checkbox,
prefix_equalize_checkbox,
prefix_enhance_checkbox,
enhanced_checkbox,
],
outputs=[output_audio, seed_number],
)
demo.launch()