Spaces:

viktor-hu
/

parakeet-asr-mcp-server

Running

File size: 7,886 Bytes

import gradio as gr
import httpx
import asyncio
from typing import Dict, Any
import os
from config import CONFIG

# API endpoint configuration from config
API_BASE_URL = CONFIG["api"]["base_url"]
API_TIMEOUT = CONFIG["api"]["timeout"]

if API_BASE_URL is None:
    raise ValueError("API_BASE_URL is not set")

async def transcribe_audio(audio_file: str, output_format: str = "text") -> Dict[str, Any]:
    """
    Transcribe the audio file to text or SRT subtitles.
    
    Args:
        audio_file (str): Path to the audio file to transcribe
        output_format (str): Output format - "text" for plain text, "srt" for SRT subtitles
        
    Returns:
        Dict containing the transcription result
    """
    
    try:
        # Determine the endpoint based on output format
        if output_format == "srt":
            endpoint = f"{API_BASE_URL}{CONFIG['api']['endpoints']['transcribe_srt']}"
        else:
            endpoint = f"{API_BASE_URL}{CONFIG['api']['endpoints']['transcribe']}"

        # Prepare the file for upload
        async with httpx.AsyncClient(timeout=API_TIMEOUT) as client:
            with open(audio_file, "rb") as f:
                files = {"file": (os.path.basename(audio_file), f, "audio/wav")}
                
                response = await client.post(endpoint, files=files)
                response.raise_for_status()
                
                if output_format == "srt":
                    # For SRT format, return the raw text content
                    return {
                        "success": True,
                        "transcription": response.text,
                        "format": "srt"
                    }
                else:
                    # For JSON format, parse the response
                    result = response.json()
                    
                    # Handle both old format (direct text) and new format (segments array)
                    transcription_text = ""
                    if "success" in result and result["success"] is True and "segments" in result and result["segments"]:
                        # New format with segments array - include timestamps
                        formatted_segments = []
                        for segment in result["segments"]:
                            text = segment.get("text", "")
                            formatted_segments.append(f"{text}")
                        transcription_text = "\n".join(formatted_segments)
                    else:
                        transcription_text = "No transcription text found in response"
                    
                    return {
                        "success": True,
                        "transcription": transcription_text,
                        "format": "text",
                        "metadata": result
                    }
                    
    except httpx.TimeoutException:
        return {
            "success": False,
            "error": "Request timed out. The audio file might be too long or the server is busy.",
            "format": output_format
        }
    except httpx.HTTPStatusError as e:
        return {
            "success": False,
            "error": f"HTTP error {e.response.status_code}: {e.response.text}",
            "format": output_format
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"Unexpected error: {str(e)}",
            "format": output_format
        }

def transcribe_audio_sync(audio_file: str, output_format: str = "text") -> Dict[str, Any]:
    """
    Synchronous wrapper for the async transcribe function.
    """
    return asyncio.run(transcribe_audio(audio_file, output_format))

def transcribe_to_text(audio_file: str) -> str:
    """
    Transcribe the audio file to plain text.
    
    Args:
        audio_file (str): The URL to the audio file.
        
    Returns:
        str: Transcribed text
    """
    if not audio_file:
        return "Please provide an audio file."
    
    result = transcribe_audio_sync(audio_file, "text")
    
    if result["success"]:
        return result["transcription"]
    else:
        return f"Error: {result['error']}"

def transcribe_to_srt(audio_file: str) -> str:
    """
    Transcribe the audio file to SRT subtitle format.
    
    Args:
        audio_file (str): The URL to the audio file.
        
    Returns:
        str: SRT formatted subtitles
    """
    
    if not audio_file:
        return "Please provide an audio file."
    
    result = transcribe_audio_sync(audio_file, "srt")
    
    if result["success"]:
        return result["transcription"]
    else:
        return f"Error: {result['error']}"

# Create the Gradio interface
with gr.Blocks(title="Parakeet ASR MCP Server") as demo:
    gr.Markdown("""
    # 🎙️ Parakeet ASR MCP Server
    
    A Model Context Protocol (MCP) server built with Gradio interfaces with a speech-to-text API, serving the model [Parakeet TDT 0.6B V2](https://huggingface.co/nvidia/parakeet-tdt-0.6b-v2) open-sourced by NVIDIA and hosted on Novita AI (https://novita.ai/templates-library/105929).

    This server is free to use and provides high-quality English transcription capabilities, supporting both plain text and SRT subtitle formats.
    
    ## MCP Server URL
    ```
    https://viktor-hu-parakeet-asr-mcp-server.hf.space/gradio_api/mcp/sse
    ```

    ## Available MCP Tools
    - `transcribe_to_text`: Transcribe the audio file to plain text.
    - `transcribe_to_srt`: Transcribe the audio file to SRT subtitle format.
    
    ## Integration
    
    To add this MCP to clients that support SSE (e.g. Cursor, Windsurf, Cline), simply add the following configuration to your MCP config:
    
    ```
    {
        "mcpServers": {
            "parakeet-asr": {
                "url": "https://viktor-hu-parakeet-asr-mcp-server.hf.space/gradio_api/mcp/sse"
            }
        }
    }
    ```
    """)
    
    with gr.Tab("Transcribe to text"):
        with gr.Row():
            with gr.Column():
                audio_input_text = gr.Audio(
                    label="Upload Audio File",
                    type="filepath",
                    sources=["upload", "microphone"]
                )
                transcribe_text_btn = gr.Button("Transcribe to Text", variant="primary")
            
            with gr.Column():
                text_output = gr.Textbox(
                    label="Transcription Result",
                    lines=10,
                    placeholder="Transcribed text will appear here..."
                )
        
        transcribe_text_btn.click(
            fn=transcribe_to_text,
            inputs=[audio_input_text],
            outputs=[text_output]
        )
    
    with gr.Tab("Transcribe to SRT Subtitles"):
        with gr.Row():
            with gr.Column():
                audio_input_srt = gr.Audio(
                    label="Upload Audio File",
                    type="filepath",
                    sources=["upload", "microphone"]
                )
                transcribe_srt_btn = gr.Button("Transcribe to SRT", variant="primary")
            
            with gr.Column():
                srt_output = gr.Textbox(
                    label="SRT Subtitles",
                    lines=15,
                    placeholder="SRT formatted subtitles will appear here..."
                )
        
        transcribe_srt_btn.click(
            fn=transcribe_to_srt,
            inputs=[audio_input_srt],
            outputs=[srt_output]
        )

if __name__ == "__main__":
    # Launch with MCP server enabled
    try:
        demo.launch(
            mcp_server=True,
            share=False,
            server_name=CONFIG["server"]["host"],
            server_port=CONFIG["server"]["port"],
        )
    except Exception as e:
        print(f"Error launching server: {e}")