import gradio as gr
import whisper
import torch

# --- MODEL INITIALIZATION ---

# Check for GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the Whisper model.
# "base" is a good starting point. For higher accuracy, you can use "medium" or "large",
# but those models require more memory and compute.
print("Loading Whisper model...")
model = whisper.load_model("base", device=device)
print("Whisper model loaded successfully.")


# --- TRANSCRIPTION FUNCTION ---

def transcribe_audio(microphone_input, file_input):
    """
    Transcribes audio from either a microphone recording or an uploaded file.

    Args:
        microphone_input (str or None): Path to the recorded microphone audio.
        file_input (str or None): Path to the uploaded audio file.

    Returns:
        str: The transcribed text, or an error message.
    """
    # Determine the input source; the microphone recording takes priority if both are provided.
    if microphone_input is not None:
        audio_source = microphone_input
    elif file_input is not None:
        audio_source = file_input
    else:
        return "No audio source provided. Please record or upload an audio file."

    # Perform the transcription
    try:
        # transcribe() returns a dictionary; the full transcript is under the "text" key.
        result = model.transcribe(audio_source)
        transcription = result["text"]
        return transcription
    except Exception as e:
        return f"An error occurred during transcription: {e}"


# --- GRADIO INTERFACE ---

# Use gr.Blocks for more complex layouts and custom styling.
with gr.Blocks(css="assets/style.css", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎙️ Voice Recognition")
    gr.Markdown(
        "This application uses OpenAI's Whisper model to transcribe speech to text. "
        "You can either record audio directly from your microphone or upload an audio file."
    )

    with gr.Row(elem_classes="audio-container"):
        with gr.Column():
            # Microphone input
            mic_input = gr.Audio(sources=["microphone"], type="filepath", label="Record from Microphone")
            # File upload input
            file_upload = gr.Audio(sources=["upload"], type="filepath", label="Upload Audio File")

    # Transcribe button
    transcribe_button = gr.Button("Transcribe Audio")

    # Transcription output
    output_text = gr.Textbox(
        lines=10,
        label="Transcription Result",
        placeholder="Your transcribed text will appear here...",
        elem_id="transcription_output",
    )

    # Run the transcription when the button is clicked.
    transcribe_button.click(
        fn=transcribe_audio,
        inputs=[mic_input, file_upload],
        outputs=output_text,
    )

# Launch the application
if __name__ == "__main__":
    demo.launch(debug=True)
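
# A minimal sketch of how this script might be run, assuming it is saved as
# app.py (the filename is an assumption, not stated above) and that the
# assets/style.css file referenced in gr.Blocks exists alongside it:
#
#     pip install gradio openai-whisper torch
#     python app.py
#
# The `whisper` import corresponds to the openai-whisper package on PyPI, which
# also needs ffmpeg installed on the system to decode audio files.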