Spaces:

TristanBehrens
/

speech

Sleeping

File size: 4,633 Bytes

import os
import gradio as gr
from transformers import pipeline
import numpy as np
import time
from typing import Tuple
import logging
import torch


# Create a logger.
# basicConfig sets the root logger's level to INFO; the module-level `logger`
# below is the one the rest of this file writes its messages to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Fail fast at startup if any required configuration is missing.
# A variable that is unset and one that is set to the empty string are both
# treated as missing: an empty PASSWORD would otherwise be matched by an empty
# password field.
required_variables = ["HF_TOKEN", "PASSWORD", "MODEL_NAME"]
for required_variable in required_variables:
    # Test the value itself instead of comparing against a "NO" placeholder
    # default, so a variable legitimately set to "NO" is not rejected.
    if not os.environ.get(required_variable):
        message = (
            f"Environment variable {required_variable} is not set. "
            "Please set it before running the application."
        )
        logger.error(message)
        raise ValueError(message)


# Build the speech-recognition pipeline once, at import time; the model name
# comes from the MODEL_NAME environment variable and the pipeline runs on
# CUDA when torch reports it as available, otherwise on the CPU.
model_name = os.environ["MODEL_NAME"]
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
logger.info(f"Loading model {model_name} with device {device}...")
transcriber = pipeline("automatic-speech-recognition", model=model_name, device=device)
logger.info("Model loaded successfully.")


def main():
    """Start the app: build the Gradio interface and launch it."""
    create_interface().launch()

    
def create_interface():
    """Assemble the Gradio UI for the Whisper transcription service.

    Top to bottom the layout is: a title, a password field, an audio input
    (microphone or upload), a "Transcribe" button, the transcription output
    and a read-only status line. ``transcribe_audio`` is wired to both the
    button's click event and the audio input's change event.

    Returns:
        The assembled ``gr.Blocks`` object. It is not launched here.
    """
    with gr.Blocks() as interface:

        # Title.
        gr.Markdown("# Whisper Speech Transcription")

        # Row holding the password field.
        with gr.Row(), gr.Column(scale=2):
            password_box = gr.Textbox(
                type="password",
                label="Enter Password",
                placeholder="Enter the password to access the transcription service",
            )

        # Row holding the audio input; samples are delivered as numpy data.
        with gr.Row(), gr.Column(scale=2):
            audio_box = gr.Audio(
                label="Record or Upload Audio",
                sources=["microphone", "upload"],
                type="numpy",
            )

        # Row holding the button that starts a transcription.
        with gr.Row():
            run_button = gr.Button("Transcribe", variant="primary")

        # Row holding the transcription result.
        with gr.Row():
            transcript_box = gr.Textbox(
                label="Transcription Output",
                placeholder="Transcription will appear here...",
                lines=5,
            )

        # Read-only status line (timing information).
        status_box = gr.Textbox(
            label="Status",
            placeholder="Transcription status will appear here...",
            lines=1,
            interactive=False,
        )

        # Register the same handler for the button click first and then for
        # audio changes; each registration gets its own input/output lists.
        for register_event in (run_button.click, audio_box.change):
            register_event(
                fn=transcribe_audio,
                inputs=[audio_box, password_box],
                outputs=[transcript_box, status_box],
            )

    return interface


def transcribe_audio(audio: Tuple[int, np.ndarray], password: str = None) -> Tuple[str, str]:
    """Transcribe a recording after checking the access password.

    Args:
        audio: ``(sampling_rate, samples)`` pair, or ``None`` when no audio
            was provided. Sample arrays with more than one dimension are
            averaged over axis 1 to obtain a single channel.
        password: Password entered by the user; compared against the
            ``PASSWORD`` environment variable.

    Returns:
        A ``(text, status)`` pair. On success ``text`` is the transcription
        and ``status`` reports how long it took. When the password is wrong
        or there is no audio, ``text`` is the error message and ``status``
        is the empty string.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import hmac

    # Fail closed: reject when PASSWORD is not configured or no string was
    # supplied, and compare in constant time so response timing does not leak
    # how much of the password matched. "surrogatepass" keeps encode() from
    # raising on lone surrogates in either value.
    expected_password = os.environ.get("PASSWORD")
    password_ok = (
        expected_password is not None
        and isinstance(password, str)
        and hmac.compare_digest(
            password.encode("utf-8", "surrogatepass"),
            expected_password.encode("utf-8", "surrogatepass"),
        )
    )
    if not password_ok:
        return "Incorrect password. Please try again.", ""

    # If there is no audio, return an error message.
    no_audio_result = ("No audio detected. Please record some audio.", "")
    if audio is None:
        return no_audio_result

    # Start measuring the time; perf_counter is monotonic, so the elapsed
    # value cannot be skewed by system clock adjustments.
    start_time = time.perf_counter()

    # Unpack the audio.
    sr, y = audio
    y = np.asarray(y)

    # A zero-length recording would make np.max() below raise; treat it the
    # same as missing audio.
    if y.size == 0:
        return no_audio_result

    # Convert to mono if there is more than one channel.
    if y.ndim > 1:
        logger.debug("Converting %s channels to mono", y.shape[1])
        y = y.mean(axis=1)

    # Peak-normalize; astype() returns a new array, so the in-place division
    # never touches the caller's data.
    y = y.astype(np.float32)
    max_abs = np.max(np.abs(y))
    if max_abs > 0:  # Avoid division by zero on all-silent input.
        y /= max_abs

    audio_time = len(y) / sr
    logger.info(
        "Processing audio: %sHz, %s samples (~%.2fs)", sr, len(y), audio_time
    )

    # Run transcription.
    result = transcriber(
        {"sampling_rate": sr, "raw": y},
        chunk_length_s=30,
        stride_length_s=[6, 0],
    )
    logger.info("Transcription completed.")

    # Report how long the whole request took relative to the audio length.
    elapsed_time = time.perf_counter() - start_time
    status_string = f"Transcription took {elapsed_time:.2f}s for {audio_time:.2f}s of audio with model {model_name}."
    return result["text"], status_string


# Entrypoint: launch only when this file is executed as a script, so that
# importing the module does not also start the Gradio server.
if __name__ == "__main__":
    main()