Spaces:

reach-vb
/

Step-Audio-2-mini

Paused

App Files Files Community

Step-Audio-2-mini / app.py

reach-vb

Create app.py (#1)

d17cd30 verified 9 months ago

raw

history blame

5.61 kB

	import os
	import tempfile
	import traceback
	from pathlib import Path

	import gradio as gr
	import spaces # required for ZeroGPU

	# ---- Your model libs (ensure these are available in the repo or pip) ----
	from stepaudio2 import StepAudio2
	from token2wav import Token2wav

	# ------------------------- constants -------------------------
	MODEL_PATH = "Step-Audio-2-mini"
	PROMPT_WAV = "assets/default_female.wav"
	CACHE_DIR = "/tmp/stepaudio2"

	# Ensure Gradio uses a writable temp dir on Spaces
	os.environ["GRADIO_TEMP_DIR"] = CACHE_DIR
	Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)

	# ------------------------- helpers -------------------------
	def save_tmp_audio(audio_bytes: bytes, cache_dir: str) -> str:
	Path(cache_dir).mkdir(parents=True, exist_ok=True)
	with tempfile.NamedTemporaryFile(dir=cache_dir, delete=False, suffix=".wav") as f:
	f.write(audio_bytes)
	return f.name

	def add_message(chatbot, history, mic, text):
	if not mic and not text:
	return chatbot, history, "Input is empty"

	if text:
	chatbot.append({"role": "user", "content": text})
	history.append({"role": "human", "content": text})
	elif mic and Path(mic).exists():
	chatbot.append({"role": "user", "content": {"path": mic}})
	history.append({"role": "human", "content": [{"type": "audio", "audio": mic}]})
	return chatbot, history, None

	def reset_state(system_prompt):
	return [], [{"role": "system", "content": system_prompt}]

	# ------------------------- globals -------------------------
	AUDIO_MODEL = StepAudio2(MODEL_PATH) # load on CPU
	TOKEN2WAV = Token2wav(f"{MODEL_PATH}/token2wav") # load on CPU

	@spaces.GPU(duration=120) # GPU only during this call; no-ops outside ZeroGPU
	def gpu_predict(chatbot, history):
	global AUDIO_MODEL, TOKEN2WAV
	try:
	# Move to CUDA only when GPU is attached
	try:
	if hasattr(AUDIO_MODEL, "to"):
	AUDIO_MODEL.to("cuda")
	if hasattr(TOKEN2WAV, "to"):
	TOKEN2WAV.to("cuda")
	except Exception:
	pass

	history.append({"role": "assistant", "content": [{"type": "text", "text": "<tts_start>"}], "eot": False})

	tokens, text, audio_tokens = AUDIO_MODEL(
	history,
	max_new_tokens=4096,
	temperature=0.7,
	repetition_penalty=1.05,
	do_sample=True,
	)

	audio_bytes = TOKEN2WAV(audio_tokens, PROMPT_WAV)
	audio_path = save_tmp_audio(audio_bytes, CACHE_DIR)

	chatbot.append({"role": "assistant", "content": {"path": audio_path}})
	history[-1]["content"].append({"type": "token", "token": tokens})
	history[-1]["eot"] = True

	except Exception:
	print(traceback.format_exc())
	gr.Warning("Some error happened, please try again.")
	return chatbot, history

	def build_demo():
	with gr.Blocks(delete_cache=(86400, 86400)) as demo:
	gr.Markdown("<center><font size=8>Step Audio 2 Demo</center>")

	with gr.Row():
	system_prompt = gr.Textbox(
	label="System Prompt",
	value=(
	"你的名字叫做小跃，是由阶跃星辰公司训练出来的语音大模型。\n"
	"你情感细腻，观察能力强，擅长分析用户的内容，并作出善解人意的回复，"
	"说话的过程中时刻注意用户的感受，富有同理心，提供多样的情绪价值。\n"
	"今天是2025年8月29日，星期五\n"
	"请用默认女声与用户交流。"
	),
	lines=2,
	)

	chatbot = gr.Chatbot(elem_id="chatbot", min_height=800, type="messages")
	history = gr.State([{"role": "system", "content": system_prompt.value}])

	mic = gr.Audio(type="filepath", label="🎙️ Microphone input (optional)")
	text = gr.Textbox(placeholder="Enter message ...", label="💬 Text input")

	with gr.Row():
	clean_btn = gr.Button("🧹 Clear History (清除历史)")
	regen_btn = gr.Button("🤔️ Regenerate (重试)")
	submit_btn = gr.Button("🚀 Submit")

	def on_submit(chatbot, history, mic, text):
	chatbot, history, error = add_message(chatbot, history, mic, text)
	if error:
	gr.Warning(error)
	return chatbot, history, None, None
	chatbot, history = gpu_predict(chatbot, history)
	return chatbot, history, None, None

	submit_btn.click(
	fn=on_submit,
	inputs=[chatbot, history, mic, text],
	outputs=[chatbot, history, mic, text],
	concurrency_limit=4,
	concurrency_id="gpu_queue",
	)

	clean_btn.click(
	fn=reset_state,
	inputs=[system_prompt],
	outputs=[chatbot, history],
	)

	def regenerate(chatbot, history):
	while chatbot and chatbot[-1]["role"] == "assistant":
	chatbot.pop()
	while history and history[-1]["role"] == "assistant":
	history.pop()
	return gpu_predict(chatbot, history)

	regen_btn.click(
	regenerate,
	[chatbot, history],
	[chatbot, history],
	concurrency_id="gpu_queue",
	)
	return demo

	# Spaces runs this file; just build and launch with defaults (no ports/names).
	if __name__ == "__main__":
	demo = build_demo()
	demo.queue().launch() # no args — Spaces handles host/port