"""Gradio demo that transcribes Sanskrit speech with a fine-tuned Whisper model."""

import os

import gradio as gr
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

MODEL_NAME = "Bidwill/whisper-small-sanskrit"
BATCH_SIZE = 8
TARGET_SAMPLING_RATE = 16000  # Whisper uses 16kHz by default

# Initialize the Whisper pipeline.
# chunk_length_s=30 makes the pipeline split long recordings into 30 s
# chunks so inputs longer than Whisper's context window still transcribe.
pipe = pipeline(
    model=MODEL_NAME,
    chunk_length_s=30,
)


def transcribe(inputs):
    """Transcribe the audio file at path *inputs* and return the text.

    Parameters
    ----------
    inputs : str | None
        Filesystem path to an audio file (Gradio `type="filepath"`).

    Returns
    -------
    str
        The transcription produced by the Whisper pipeline.

    Raises
    ------
    gr.Error
        If no audio file was submitted (``inputs`` is None).
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    # Read raw bytes; ffmpeg_read decodes any ffmpeg-supported format and
    # resamples to the 16 kHz rate the model expects.
    with open(inputs, "rb") as audio_file:
        audio_bytes = audio_file.read()
    audio_data = ffmpeg_read(audio_bytes, sampling_rate=TARGET_SAMPLING_RATE)

    # Run the decoded waveform through Whisper.
    text = pipe(audio_data, batch_size=BATCH_SIZE)["text"]
    return text


def create_demo():
    """Build and return the two-tab Gradio Blocks UI (microphone / upload)."""
    with gr.Blocks() as demo:
        # Tab for microphone transcription
        with gr.Tab("Transcribe Microphone"):
            mic_input = gr.Audio(sources="microphone", type="filepath", label="Microphone Audio")
            mic_output = gr.Textbox(label="Transcription")
            mic_button = gr.Button("Transcribe")
            mic_button.click(transcribe, inputs=mic_input, outputs=mic_output)

        # Tab for audio file transcription
        with gr.Tab("Transcribe Audio File"):
            file_input = gr.Audio(sources="upload", type="filepath", label="Upload Audio File")
            file_output = gr.Textbox(label="Transcription")
            file_button = gr.Button("Transcribe")
            file_button.click(transcribe, inputs=file_input, outputs=file_output)

    return demo


# Keep `demo` at module level so hosts that import this module (e.g. HF
# Spaces) can find it, but only launch a server when run as a script.
demo = create_demo()

if __name__ == "__main__":
    demo.launch(share=True)