import gradio as gr
import torch
from transformers import pipeline
import os

# NOTE(review): HF_TOKEN is read but never passed to `pipeline(...)`. If the
# model repo is gated/private, pass `token=api_key` to the pipeline call;
# otherwise this assignment is dead code — confirm intent.
api_key = os.getenv("HF_TOKEN")

# Initialize the Whisper ASR pipeline once at import time so every request
# reuses the loaded model. Runs on GPU when available, else CPU.
whisper_asr_trained = pipeline(
    "automatic-speech-recognition",
    model="shijunju/whisper_turbo_1202",
    chunk_length_s=30,  # do not make it over 30
    device="cuda" if torch.cuda.is_available() else "cpu",
)


def transcribe_speech(filepath):
    """Transcribe an audio file and persist the text to disk.

    Args:
        filepath: Path to the uploaded audio file (MP3/WAV), or None when
            the user clicked Transcribe without uploading anything.

    Returns:
        A (text, path) tuple: the transcription (or an error message) and
        the path of the saved .txt file (None on failure).
    """
    if filepath is None:
        return "No audio found, please upload a file.", None

    try:
        # return_timestamps=True yields per-chunk timestamps alongside text.
        output = whisper_asr_trained(filepath, return_timestamps=True)

        # Format the result. The pipeline normally returns a dict; guard the
        # non-dict case explicitly so we never call .get on something else
        # (the original code crashed with AttributeError there).
        if isinstance(output, dict) and "chunks" in output:
            transcribed_text = "\n\n".join(
                str(chunk["timestamp"]) + "\n" + chunk["text"]
                for chunk in output["chunks"]
            )
        elif isinstance(output, dict):
            transcribed_text = output.get("text", str(output))
        else:
            transcribed_text = str(output)

        # Save the transcription so the UI can offer it as a download.
        # Explicit UTF-8: the locale default codec (e.g. cp1252 on Windows)
        # can fail on non-ASCII transcript characters.
        text_file_path = "transcription.txt"
        with open(text_file_path, "w", encoding="utf-8") as text_file:
            text_file.write(transcribed_text)

        return transcribed_text, text_file_path
    except Exception as e:
        # Top-level UI boundary: surface the error to the user instead of
        # crashing the Gradio worker.
        return f"Error during transcription: {str(e)}", None


# Build the Gradio interface.
with gr.Blocks() as demo:
    gr.Markdown("## Finetuned Whisper Turbo Speech to Text (CPU, each run takes about 2 min)")
    with gr.Row():
        audio_input = gr.Audio(type="filepath", label="Upload MP3 or WAV File")
        transcribe_button = gr.Button("Transcribe Audio")
    with gr.Row():
        output_textbox = gr.Textbox(label="Transcribed Text", lines=10)
        download_link = gr.File(label="Download Transcription")

    # transcribe_speech already returns (text, file_path) in the right
    # order, so it can be wired directly — the pass-through wrapper the
    # original defined added nothing.
    transcribe_button.click(
        fn=transcribe_speech,
        inputs=audio_input,
        outputs=[output_textbox, download_link],
    )

# Launch the Gradio app
demo.launch()