Spaces:

Amfeat
/

ioai

Sleeping

File size: 5,227 Bytes

"""IOAI — Interactive Omnivocal Audio Interpreter (CPU / free tier)."""

from __future__ import annotations

import logging
import os
import tempfile

# Workaround: gradio_client bool schema bug in get_api_info (Gradio 5.9.x)
import gradio_client.utils as client_utils

# Keep references to the unpatched gradio_client helpers so that the wrapper
# functions defined below can delegate to them for every non-boolean schema.
_orig_get_type = client_utils.get_type
_orig_json_schema_to_python_type = client_utils._json_schema_to_python_type


def _patched_get_type(schema):
    if isinstance(schema, bool):
        return "bool"
    return _orig_get_type(schema)


def _patched_json_schema_to_python_type(schema, defs=None):
    if isinstance(schema, bool):
        return "Any" if schema else "Never"
    return _orig_json_schema_to_python_type(schema, defs)


# Install the bool-tolerant wrappers on the gradio_client.utils module.
# NOTE(review): this is done before the `import gradio` that follows —
# presumably so gradio picks up the patched helpers; confirm before reordering.
client_utils.get_type = _patched_get_type
client_utils._json_schema_to_python_type = _patched_json_schema_to_python_type

import gradio as gr

from ioai.pipeline import SessionState, process_audio, regenerate_audio

# Root logging setup: INFO level, each record rendered as
# "<time> [<level>] <logger name>: <message>".
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Logger used by the UI callbacks in this module.
logger = logging.getLogger("ioai.app")

# Directory passed to process_audio() / regenerate_audio() by the callbacks.
# IOAI_WORK_DIR wins when it is set to a non-empty value; a fresh temporary
# directory is created only when it is not. Supplying mkdtemp() as the
# os.getenv() default would evaluate it eagerly and leave a stray empty
# "ioai_*" directory behind on every start even when the variable is set.
WORK_DIR = os.getenv("IOAI_WORK_DIR") or tempfile.mkdtemp(prefix="ioai_")


def run(audio: str | None):
    """Callback for the primary button: process a clip and fill the UI outputs.

    Args:
        audio: File path of the recorded or uploaded clip; falsy when the
            user supplied nothing.

    Returns:
        A 5-tuple ``(transcription, reply, response_audio_path, details,
        session)`` taken from the ``process_audio`` result, where ``details``
        is a Markdown summary of the language, acoustic profile and reasoning.

    Raises:
        gr.Error: If no audio was provided, or if ``process_audio`` fails.
            A ``ValueError`` is surfaced with its own message; any other
            exception is logged with its traceback and surfaced with a
            generic prefix.
    """
    if not audio:
        raise gr.Error("Запишите или загрузите звук (минимум 0.5 с).")

    try:
        outcome = process_audio(audio, WORK_DIR)
    except ValueError as exc:
        # ValueError messages are shown to the user as they are.
        raise gr.Error(str(exc)) from exc
    except Exception as exc:
        logger.exception("Pipeline failed")
        raise gr.Error(f"Ошибка обработки: {exc}") from exc

    # Markdown paragraphs are separated by a blank line.
    summary_md = "\n\n".join(
        (
            f"**Язык:** {outcome.language}",
            f"**Акустический профиль:** {outcome.acoustic_summary}",
            f"**Основание:** {outcome.reasoning}",
        )
    )
    transcript_text = outcome.transcription
    reply_text = outcome.reply
    audio_path = outcome.response_audio_path
    return transcript_text, reply_text, audio_path, summary_md, outcome.session


def regen(session: SessionState | None, transcription: str, reply: str, mode: str):
    """Callback for the secondary button: re-synthesize the audio reply.

    Args:
        session: Session object stored by a previous ``run`` call, or ``None``.
        transcription: Current contents of the transcription textbox.
        reply: Current contents of the reply textbox.
        mode: Regeneration mode selected in the UI, forwarded as ``mode=``.

    Returns:
        Whatever ``regenerate_audio`` returns; it is bound to the audio
        output component.

    Raises:
        gr.Error: If ``regenerate_audio`` fails. A ``ValueError`` is surfaced
            with its own message; any other exception is logged with its
            traceback and surfaced with a generic prefix.
    """
    try:
        return regenerate_audio(session, transcription, reply, WORK_DIR, mode=mode)
    except ValueError as exc:
        # ValueError messages are shown to the user as they are.
        raise gr.Error(str(exc)) from exc
    except Exception as exc:
        logger.exception("Regeneration failed")
        raise gr.Error(f"Ошибка перегенерации: {exc}") from exc


# Markdown shown at the top of the page (rendered through gr.Markdown in the
# Blocks layout). The text is user-facing and intentionally kept in Russian.
DESCRIPTION = """
# IOAI — Interactive Omnivocal Audio Interpreter

**Любой звук читается как речь.** Голос, инструмент, шум, дыхание, тишина —
система извлекает *сообщение*, транскрибирует его и отвечает **тем же акустическим веществом**.

### Free tier пайплайн (CPU)
1. **Perceive** — Whisper tiny + спектральный анализ
2. **Interpret** — Qwen2.5-0.5B отвечает по смыслу распознанной речи; иначе шаблоны
3. **Synthesize** — речь (espeak + тембр) при диалоговом ответе, иначе гранулы

> Первый ответ на вопрос вроде «как дела?» теперь осмысленный и озвучен. Шаблоны остаются для шума и тишины.
"""

# UI definition: a two-column layout with the audio input on the left and the
# editable results, regeneration controls and audio output on the right.
with gr.Blocks(title="IOAI") as demo:
    gr.Markdown(DESCRIPTION)
    # Holds the session object returned by run() so regen() can receive it later.
    session_state = gr.State(value=None)

    with gr.Row():
        # Left column: input clip (delivered to run() as a file path) and the main button.
        with gr.Column():
            audio_in = gr.Audio(label="Входящий звук", type="filepath")
            btn = gr.Button("Слушать и ответить", variant="primary")
        # Right column: results of run(); both textboxes are editable and are
        # read back as inputs to regen().
        with gr.Column():
            transcription = gr.Textbox(
                label="Транскрипция сообщения",
                lines=4,
                interactive=True,
                placeholder="Можно редактировать после анализа…",
            )
            reply = gr.Textbox(
                label="Текстовый ответ",
                lines=3,
                interactive=True,
                placeholder="Напишите «привет» или свой ответ…",
            )
            # The selected value ("speech" or "grain") is passed to regen() as `mode`.
            regen_mode = gr.Radio(
                choices=[
                    ("Речь — произносит текст", "speech"),
                    ("Зерна — поэтичный тембр", "grain"),
                ],
                value="speech",
                label="Режим перегенерации",
            )
            btn_regen = gr.Button("Перегенерировать звук", variant="secondary")
            audio_out = gr.Audio(label="Звуковой ответ (тот же тембр)", type="filepath", interactive=False)
            details = gr.Markdown()

    # The output order must match the 5-tuple returned by run().
    btn.click(
        run,
        inputs=[audio_in],
        outputs=[transcription, reply, audio_out, details, session_state],
    )
    # The input order must match regen()'s positional parameters.
    btn_regen.click(
        regen,
        inputs=[session_state, transcription, reply, regen_mode],
        outputs=[audio_out],
    )


if __name__ == "__main__":
    # Script entry point: enable the request queue (max_size=4), then start
    # the server bound to 0.0.0.0:7860 with ssr_mode turned off.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "ssr_mode": False,
    }
    queued_app = demo.queue(max_size=4)
    queued_app.launch(**launch_options)