import gradio as gr
import whisper
import torch
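
# Note: openai-whisper decodes audio by shelling out to ffmpeg, so the ffmpeg
# binary must be installed and available on PATH.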

# --- MODEL INITIALIZATION ---

# Check for GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the Whisper model.
# "base" is a good starting point. For higher accuracy, use "medium" or "large",
# but they are slower and need considerably more memory (especially on GPU).
print("Loading Whisper model...")
model = whisper.load_model("base", device=device)
print("Whisper model loaded successfully.")


# --- TRANSCRIPTION FUNCTION ---

def transcribe_audio(microphone_input, file_input):
    """
    Transcribes audio from either a microphone recording or an uploaded file.

    Args:
        microphone_input (tuple or None): Audio data from the microphone.
        file_input (str or None): Path to the uploaded audio file.

    Returns:
        str: The transcribed text.
    """
    # Prefer the microphone recording when both inputs are provided.
    if microphone_input is not None:
        audio_source = microphone_input
    elif file_input is not None:
        audio_source = file_input
    else:
        return "No audio source provided. Please record or upload an audio file."

    # Perform the transcription
    try:
        # transcribe() returns a dict; the full transcription is under "text".
        # fp16 is only supported on GPU, so pass it explicitly to avoid the
        # "FP16 is not supported on CPU; using FP32 instead" warning on CPU.
        result = model.transcribe(audio_source, fp16=(device == "cuda"))
        # Whisper's output typically starts with a leading space; strip it.
        return result["text"].strip()
    except Exception as e:
        return f"An error occurred during transcription: {e}"

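# Quick smoke test for the function above (a sketch; assumes an audio file named
# sample.wav exists next to this script):
#
#   print(transcribe_audio(None, "sample.wav"))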

# --- GRADIO INTERFACE ---

# Use gr.Blocks for more complex layouts and custom styling
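# Note: css expects assets/style.css to exist relative to the working directory;
# recent Gradio releases also accept stylesheet paths via the css_paths argument.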
with gr.Blocks(css="assets/style.css", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ Voice Recognition")
    gr.Markdown(
        "This application uses OpenAI's Whisper model to transcribe speech to text. "
        "You can either record audio directly from your microphone or upload an audio file."
    )

    with gr.Row(elem_classes="audio-container"):
        with gr.Column():
            # Microphone input
            mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record from Microphone")
            
            # File upload input
            file_upload = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")

    # Transcribe Button
    transcribe_button = gr.Button("Transcribe Audio")

    # Transcription Output
    output_text = gr.Textbox(
        lines=10,
        label="Transcription Result",
        placeholder="Your transcribed text will appear here...",
        elem_id="transcription_output"
    )

    # Define the action for the button click
    transcribe_button.click(
        fn=transcribe_audio,
        inputs=[mic_input, file_upload],
        outputs=output_text
    )
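
    # Optional sketch (left commented out): gr.Audio exposes a stop_recording
    # event, so transcription could also fire automatically when recording ends:
    # mic_input.stop_recording(
    #     fn=transcribe_audio,
    #     inputs=[mic_input, file_upload],
    #     outputs=output_text,
    # )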

# Launch the application
if __name__ == "__main__":
    demo.launch(debug=True)
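    # launch() accepts further options when needed, e.g. server_name="0.0.0.0"
    # to serve on the local network, or share=True for a temporary public link:
    # demo.launch(debug=True, share=True)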