Spaces:

simonraj
/

Audio

Sleeping

File size: 2,637 Bytes

f0a39fa
86e368d
 
f0a39fa
86e368d
 
f0a39fa
86e368d
 
 
f0a39fa
86e368d
 
 
f0a39fa
86e368d
 
 
f0a39fa
86e368d
 
 
 
 
f0a39fa
86e368d
 
 
 
 
f0a39fa
86e368d
 
 
 
 
f0a39fa
86e368d
 
 
f0a39fa
86e368d
 
 
f0a39fa
86e368d
 
 
 
 
 
 
 
 
 
 
f0a39fa
86e368d
 
 
 
 
 
 
 
 
 
 
 
f0a39fa
86e368d
 
 
 
f0a39fa
86e368d

import gradio as gr
from transformers import pipeline
import numpy as np

# Build the automatic-speech-recognition pipeline once, at import time, from the
# "openai/whisper-base.en" checkpoint; get_transcript() calls it on demand.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

# Module-level state written by capture_audio() and read by get_transcript():
#   audio_data     - the audio accumulated so far (None until the first chunk arrives)
#   streaming_rate - sampling rate of the most recent chunk (None until the first chunk arrives)
# NOTE(review): these are plain module globals, so presumably every session served
# by this process shares them - confirm that single-user use is intended.
audio_data = None
streaming_rate = None

def capture_audio(stream, new_chunk):
    """
    Append a streamed audio chunk to the running stream and publish it globally.

    The chunk is converted to float32, downmixed to mono if it has more than one
    dimension, and peak-normalized on its own before being appended. The result
    is also stored in the module-level ``audio_data`` (and the chunk's sampling
    rate in ``streaming_rate``) so that ``get_transcript`` can read it.

    Args:
        stream (numpy.ndarray): The accumulated audio data up to this point,
            or None if nothing has been captured yet.
        new_chunk (tuple): A tuple containing the sampling rate and the new
            audio data chunk. None is tolerated and leaves the stream unchanged.

    Returns:
        numpy.ndarray: The updated stream with the new chunk appended.
    """
    global audio_data
    global streaming_rate

    # Nothing was delivered for this tick: keep whatever has been accumulated.
    if new_chunk is None:
        return stream

    sr, y = new_chunk
    streaming_rate = sr
    y = np.asarray(y).astype(np.float32)

    # Multi-channel input arrives as (samples, channels); average the channels so
    # the stream stays a 1-D mono signal that can be concatenated and transcribed.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # Peak-normalize the chunk. A silent (all-zero) or empty chunk has no usable
    # peak: dividing by it would inject NaNs into the stream (or raise on an
    # empty array), so such chunks are appended unscaled.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak

    # Concatenate new audio chunk to the existing stream or start a new one
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y

    # Update the global variable with the new audio data
    audio_data = stream
    return stream

def get_transcript():
    """
    Run speech recognition over everything captured so far.

    Reads the module-level ``audio_data`` and ``streaming_rate`` and passes them
    to the module-level ``transcriber`` pipeline.

    Returns:
        str: The recognized text, or an empty string when either the audio or
        its sampling rate has not been set yet.
    """
    global audio_data, streaming_rate

    # Guard clause: without both the samples and their rate there is nothing to transcribe
    nothing_captured = audio_data is None or streaming_rate is None
    if nothing_captured:
        return ""

    # Hand the raw samples and their sampling rate to the pipeline and keep only the text
    payload = {"sampling_rate": streaming_rate, "raw": audio_data}
    result = transcriber(payload)
    return result["text"]

# Build the Gradio UI with Blocks: one row containing one column that holds the
# microphone input, the transcript textbox and the refresh button.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            # State component carrying the accumulated stream between capture_audio
            # calls (it is both an input and the output of the stream handler below)
            state = gr.State()
            # Streaming microphone input, delivered to the handler in "numpy" format
            audio = gr.Audio(sources=["microphone"], streaming=True, type="numpy")
            # Textbox that displays the text returned by get_transcript
            transcript_box = gr.Textbox(label="Transcript")
            # Button that triggers transcription of the audio captured so far
            rfrsh_btn = gr.Button("Refresh")

            # On each streamed chunk, call capture_audio(state, audio) and store
            # its return value back into the state
            audio.stream(fn=capture_audio, inputs=[state, audio], outputs=[state])
            # On click, call get_transcript (no inputs; it reads the module-level
            # audio) and show the returned text in the textbox
            rfrsh_btn.click(fn=get_transcript, outputs=[transcript_box])

# Start serving the interface; this runs at import time since it is not guarded
demo.launch()