File size: 2,637 Bytes
f0a39fa
86e368d
 
f0a39fa
86e368d
 
f0a39fa
86e368d
 
 
f0a39fa
86e368d
 
 
f0a39fa
86e368d
 
 
f0a39fa
86e368d
 
 
 
 
f0a39fa
86e368d
 
 
 
 
f0a39fa
86e368d
 
 
 
 
f0a39fa
86e368d
 
 
f0a39fa
86e368d
 
 
f0a39fa
86e368d
 
 
 
 
 
 
 
 
 
 
f0a39fa
86e368d
 
 
 
 
 
 
 
 
 
 
 
f0a39fa
86e368d
 
 
 
f0a39fa
86e368d
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import gradio as gr
from transformers import pipeline
import numpy as np

# Build the speech-to-text pipeline once at import time, backed by the
# "openai/whisper-base.en" checkpoint, so every request reuses the same model.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base.en",
)

# Module-level hand-off between the streaming callback (writer) and the
# transcription callback (reader): the audio gathered so far and its sample rate.
# Both stay None until the first chunk has been captured.
audio_data = streaming_rate = None

def capture_audio(stream, new_chunk):
    """
    Append one streamed microphone chunk to the accumulated audio.

    The chunk is down-mixed to a single channel if needed, converted to
    float32 and peak-normalized on its own, then appended to ``stream``.
    The updated buffer and the sampling rate are also published in the
    module-level ``audio_data`` / ``streaming_rate`` variables so that the
    transcription callback can read them.

    Args:
        stream (numpy.ndarray | None): The accumulated audio data up to this
            point, or None when no chunk has been captured yet.
        new_chunk (tuple | None): A ``(sampling_rate, samples)`` tuple. The
            samples may be 1-D (mono) or 2-D ``(samples, channels)``. A None
            chunk or a chunk with zero samples is ignored.

    Returns:
        numpy.ndarray | None: The updated stream with the new chunk appended,
        or ``stream`` unchanged when the chunk carried no samples.
    """
    global audio_data
    global streaming_rate

    # Nothing to add: leave both the per-call stream and the globals untouched.
    if new_chunk is None:
        return stream

    sr, y = new_chunk
    if y.size == 0:
        # np.max() raises ValueError on an empty array, so bail out early.
        return stream

    # Down-mix (samples, channels) input to mono; the accumulated buffer is
    # kept 1-D so that chunks always concatenate along the time axis.
    if y.ndim > 1:
        y = y.mean(axis=1)

    # astype() returns a fresh array, so the in-place scaling below never
    # modifies the caller's buffer.
    y = y.astype(np.float32)

    # Peak-normalize, but skip all-silent chunks: dividing by a zero peak
    # would fill the stream with NaN values.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # Concatenate new audio chunk to the existing stream or start a new one
    stream = y if stream is None else np.concatenate([stream, y])

    # Publish the latest state for get_transcript().
    # NOTE: these are module-level globals, shared by every caller.
    streaming_rate = sr
    audio_data = stream
    return stream

def get_transcript():
    """
    Run speech recognition over everything captured so far.

    Reads the module-level ``audio_data`` and ``streaming_rate`` values and
    passes them to the module-level ``transcriber`` pipeline.

    Returns:
        str: The ``"text"`` field of the pipeline result, or an empty string
        when no audio (or no sampling rate) has been recorded yet.
    """
    global audio_data, streaming_rate

    # Guard clause: nothing has been captured yet, so there is nothing to transcribe.
    if audio_data is None or streaming_rate is None:
        return ""

    # The pipeline receives the raw samples together with their sampling rate.
    payload = {"sampling_rate": streaming_rate, "raw": audio_data}
    result = transcriber(payload)
    return result["text"]

# Assemble the UI with the Blocks API: a single column inside a single row.
with gr.Blocks() as demo:
    with gr.Row(), gr.Column():
        # State component passed into, and written back by, the stream handler
        state = gr.State()

        # Microphone input delivered to the handler as streamed numpy chunks
        audio = gr.Audio(
            sources=["microphone"],
            streaming=True,
            type="numpy",
        )

        # Read-only destination for the recognized text
        transcript_box = gr.Textbox(label="Transcript")

        # Manual trigger: transcription only runs when this button is clicked
        rfrsh_btn = gr.Button("Refresh")

        # Each streamed chunk goes through capture_audio, which receives the
        # current state and returns the updated one
        audio.stream(
            fn=capture_audio,
            inputs=[state, audio],
            outputs=[state],
        )

        # Clicking the button fills the textbox with get_transcript()'s result
        rfrsh_btn.click(
            fn=get_transcript,
            outputs=[transcript_box],
        )

# Start serving the app
demo.launch()