Spaces:

Amfeat
/

ioai

Sleeping

App Files Files Community

ioai / app.py

Amfeat

Add Qwen2.5-0.5B dialogue replies for recognized speech.

add507c 3 days ago

Raw

History Blame Contribute Delete

5.23 kB

	"""IOAI — Interactive Omnivocal Audio Interpreter (CPU / free tier)."""

	from __future__ import annotations

	import logging
	import os
	import tempfile

	# Workaround: gradio_client bool schema bug in get_api_info (Gradio 5.9.x)
	import gradio_client.utils as client_utils

	# Keep references to the unpatched gradio_client helpers so the wrappers
	# defined in this module can delegate to them for every non-boolean schema.
	_orig_get_type, _orig_json_schema_to_python_type = (
	    client_utils.get_type,
	    client_utils._json_schema_to_python_type,
	)


	def _patched_get_type(schema):
	if isinstance(schema, bool):
	return "bool"
	return _orig_get_type(schema)


	def _patched_json_schema_to_python_type(schema, defs=None):
	if isinstance(schema, bool):
	return "Any" if schema else "Never"
	return _orig_json_schema_to_python_type(schema, defs)


	# Install the boolean-tolerant wrappers in place of the library functions.
	client_utils.get_type, client_utils._json_schema_to_python_type = (
	    _patched_get_type,
	    _patched_json_schema_to_python_type,
	)

	import gradio as gr

	from ioai.pipeline import SessionState, process_audio, regenerate_audio

	# Root logging configuration: INFO level, each record prefixed with the
	# timestamp, level name and logger name.
	logging.basicConfig(
	    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
	    level=logging.INFO,
	)
	# Logger used by the callbacks in this module.
	logger = logging.getLogger("ioai.app")

	# Directory passed to process_audio / regenerate_audio. IOAI_WORK_DIR wins
	# when it is set to a non-empty value; the temporary directory is created
	# only as the fallback, so configuring the variable no longer leaves an
	# unused "ioai_*" directory behind on every start.
	WORK_DIR = os.getenv("IOAI_WORK_DIR") or tempfile.mkdtemp(prefix="ioai_")
	# A configured directory may not exist yet (mkdtemp always creates its own).
	os.makedirs(WORK_DIR, exist_ok=True)


	def run(audio: str | None):
	    """Process an incoming recording and build the values for the UI outputs.

	    Args:
	        audio: File path of the recorded or uploaded audio, or ``None``/empty
	            when nothing was provided.

	    Returns:
	        A 5-tuple ``(transcription, reply, response_audio_path, details,
	        session)`` taken from the ``process_audio`` result; ``details`` is a
	        text block combining the language, acoustic summary and reasoning.

	    Raises:
	        gr.Error: If no audio was given, or if ``process_audio`` fails. A
	            ``ValueError`` message is shown as-is; any other exception is
	            logged with its traceback and reported with a generic prefix.
	    """
	    if not audio:
	        raise gr.Error("Запишите или загрузите звук (минимум 0.5 с).")

	    try:
	        result = process_audio(audio, WORK_DIR)
	    except ValueError as exc:
	        raise gr.Error(str(exc)) from exc
	    except Exception as exc:
	        # UI boundary: keep the traceback in the log, show a short message.
	        logger.exception("Pipeline failed")
	        raise gr.Error(f"Ошибка обработки: {exc}") from exc

	    details = (
	        f"Язык: {result.language}\n\n"
	        f"Акустический профиль: {result.acoustic_summary}\n\n"
	        f"Основание: {result.reasoning}"
	    )
	    return (
	        result.transcription,
	        result.reply,
	        result.response_audio_path,
	        details,
	        result.session,
	    )


	def regen(session: SessionState | None, transcription: str, reply: str, mode: str):
	    """Regenerate the response audio from the current text fields.

	    Args:
	        session: Session state returned by a previous ``run`` call, or ``None``
	            when no analysis has been run yet.
	        transcription: Current content of the transcription textbox.
	        reply: Current content of the reply textbox.
	        mode: Regeneration mode selected in the UI (``"speech"`` or
	            ``"grain"``).

	    Returns:
	        Whatever ``regenerate_audio`` returns; it is fed to the audio output
	        component.

	    Raises:
	        gr.Error: If ``regenerate_audio`` fails. A ``ValueError`` message is
	            shown as-is; any other exception is logged with its traceback and
	            reported with a generic prefix.
	    """
	    try:
	        path = regenerate_audio(session, transcription, reply, WORK_DIR, mode=mode)
	    except ValueError as exc:
	        raise gr.Error(str(exc)) from exc
	    except Exception as exc:
	        # UI boundary: keep the traceback in the log, show a short message.
	        logger.exception("Regeneration failed")
	        raise gr.Error(f"Ошибка перегенерации: {exc}") from exc
	    return path


	# Markdown text rendered at the top of the page via gr.Markdown(DESCRIPTION).
	DESCRIPTION = """
	# IOAI — Interactive Omnivocal Audio Interpreter

	Любой звук читается как речь. Голос, инструмент, шум, дыхание, тишина —
	система извлекает сообщение, транскрибирует его и отвечает тем же акустическим веществом.

	### Free tier пайплайн (CPU)
	1. Perceive — Whisper tiny + спектральный анализ
	2. Interpret — Qwen2.5-0.5B отвечает по смыслу распознанной речи; иначе шаблоны
	3. Synthesize — речь (espeak + тембр) при диалоговом ответе, иначе гранулы

	> Первый ответ на вопрос вроде «как дела?» теперь осмысленный и озвучен. Шаблоны остаются для шума и тишины.
	"""

	# Gradio UI definition: description, per-session state, input and output
	# widgets, and the two button callbacks.
	# NOTE(review): the nesting indentation of the `with` blocks is not visible
	# in this copy of the file — confirm the Row/Column layout against the
	# original before relying on it.
	with gr.Blocks(title="IOAI") as demo:
	gr.Markdown(DESCRIPTION)
	# Holds the session object returned by run() so regen() can reuse it.
	session_state = gr.State(value=None)

	with gr.Row():
	with gr.Column():
	audio_in = gr.Audio(label="Входящий звук", type="filepath")
	btn = gr.Button("Слушать и ответить", variant="primary")
	with gr.Column():
	# Both textboxes are editable, so regen() receives their current content.
	transcription = gr.Textbox(
	label="Транскрипция сообщения",
	lines=4,
	interactive=True,
	placeholder="Можно редактировать после анализа…",
	)
	reply = gr.Textbox(
	label="Текстовый ответ",
	lines=3,
	interactive=True,
	placeholder="Напишите «привет» или свой ответ…",
	)
	# Radio choices are (label, value) pairs; the value is passed to regen() as `mode`.
	regen_mode = gr.Radio(
	choices=[
	("Речь — произносит текст", "speech"),
	("Зерна — поэтичный тембр", "grain"),
	],
	value="speech",
	label="Режим перегенерации",
	)
	btn_regen = gr.Button("Перегенерировать звук", variant="secondary")
	audio_out = gr.Audio(label="Звуковой ответ (тот же тембр)", type="filepath", interactive=False)
	details = gr.Markdown()

	# Primary action: run() fills the transcription, reply, audio, details and
	# session state from the input recording.
	btn.click(
	run,
	inputs=[audio_in],
	outputs=[transcription, reply, audio_out, details, session_state],
	)
	# Secondary action: regen() rebuilds only the audio output from the stored
	# session, the current text fields and the selected mode.
	btn_regen.click(
	regen,
	inputs=[session_state, transcription, reply, regen_mode],
	outputs=[audio_out],
	)


	if __name__ == "__main__":
	    # Script entry point: enable the request queue (max_size=4) and serve on
	    # all interfaces, port 7860, with ssr_mode disabled.
	    demo.queue(max_size=4).launch(
	        server_name="0.0.0.0",
	        server_port=7860,
	        ssr_mode=False,
	    )