"""Gradio app that transcribes uploaded audio/video files with Whisper."""

import os
import tempfile
from subprocess import Popen, PIPE

import torch
import gradio as gr
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
from pydub import AudioSegment

# Constants
MODEL_NAME = "openai/whisper-large-v3-turbo"
BATCH_SIZE = 8
device = 0 if torch.cuda.is_available() else "cpu"

# Whisper pipeline (built once at import time and shared by every request)
whisper_pipeline = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)


# Convert MP4 to MP3
def convert_mp4_to_mp3(mp4_path, mp3_path):
    """Extract the audio track of an MP4 file and write it out as MP3.

    Args:
        mp4_path: Path of the MP4 file to read.
        mp3_path: Path the MP3 audio is exported to.

    Raises:
        RuntimeError: If reading or exporting fails; the underlying
            exception is attached as ``__cause__``.
    """
    try:
        audio = AudioSegment.from_file(mp4_path, format="mp4")
        audio.export(mp3_path, format="mp3")
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise RuntimeError(f"Error converting MP4 to MP3: {e}") from e


# Transcribe audio
def transcribe_audio(audio_path):
    """Transcribe the audio file at ``audio_path`` with the Whisper pipeline.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The transcribed text, or a string starting with
        ``"Error during transcription: "`` if any step fails. Errors are
        returned rather than raised so they show up in the UI text box.
    """
    try:
        # ffmpeg_read expects the raw file content as bytes.
        with open(audio_path, "rb") as audio_file:
            audio_data = audio_file.read()

        # Decode at the sampling rate the model's feature extractor expects.
        sampling_rate = whisper_pipeline.feature_extractor.sampling_rate
        inputs = {
            "array": ffmpeg_read(audio_data, sampling_rate),
            "sampling_rate": sampling_rate,
        }

        # Run the transcription pipeline
        result = whisper_pipeline(
            inputs, batch_size=BATCH_SIZE, return_timestamps=False
        )
        return result["text"]
    except Exception as e:
        return f"Error during transcription: {e}"


# Gradio Interface Function
def transcribe_file(file):
    """Transcribe an uploaded file, converting MP4 input to MP3 first.

    Args:
        file: The value delivered by the ``gr.File`` input. Either an object
            exposing the upload's path as ``.name`` or a plain path string;
            ``None`` when nothing was uploaded.

    Returns:
        The transcription, or a human-readable error message.
    """
    if file is None:
        return "Error: no file was uploaded."

    # Accept both file-like objects with a ``.name`` path and bare paths.
    source_path = getattr(file, "name", file)

    # Anything that is not an MP4 is handed to the transcriber unchanged.
    if not str(source_path).lower().endswith(".mp4"):
        return transcribe_audio(source_path)

    # Reserve a temp path and close the handle right away so the exporter
    # can write to it (an open handle blocks this on some platforms).
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        temp_mp3_path = tmp.name

    try:
        try:
            convert_mp4_to_mp3(source_path, temp_mp3_path)
        except Exception as e:
            return f"Error during MP4 to MP3 conversion: {e}"
        return transcribe_audio(temp_mp3_path)
    finally:
        # Clean up the temporary file on success and on failure alike.
        os.remove(temp_mp3_path)


# Gradio interface setup
def launch_gradio():
    """Build the Gradio UI and launch it with a public share link."""
    with gr.Blocks() as demo:
        gr.Markdown("# Audio Transcription with Whisper Model")
        gr.Interface(
            fn=transcribe_file,
            inputs=gr.File(label="Upload Audio/Video File (MP4 or MP3)"),
            outputs=gr.Textbox(label="Transcribed Text"),
        )
    demo.launch(share=True)


# Run the Gradio app
if __name__ == "__main__":
    launch_gradio()