import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import gradio as gr
from moviepy.editor import VideoFileClip

# Use the GPU with half precision when available, otherwise fall back to CPU / float32.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "distil-whisper/distil-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype, use_safetensors=True
)
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)

# Chunked long-form transcription pipeline.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=25,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
)


def extract_audio_from_video(video_path, audio_output_path):
    """Extracts the audio track from a video and saves it to an MP3 file."""
    try:
        video_clip = VideoFileClip(video_path)
        audio_clip = video_clip.audio
        audio_clip.write_audiofile(audio_output_path)
        video_clip.close()
        print(f"Audio extracted successfully and saved to: {audio_output_path}")
        return audio_output_path
    except Exception as e:
        print(f"Error extracting audio: {e}")
        return None


def speech_to_text(input_path):
    """Transcribes an audio file, or a video file after extracting its audio track."""
    try:
        if input_path.lower().endswith((".mp4", ".avi", ".mov")):
            audio_file_path = extract_audio_from_video(input_path, "temp_audio.mp3")
            if audio_file_path is None:
                return "Error: could not extract audio from the video."
            result = pipe(audio_file_path)
        else:
            result = pipe(input_path)
        # The ASR pipeline returns a dict; the transcription is under the "text" key.
        return result["text"]
    except Exception as e:
        return f"Error: {str(e)}"


# type="filepath" makes Gradio pass the uploaded file's path as a string.
iface = gr.Interface(
    fn=speech_to_text,
    inputs=gr.File(type="filepath"),
    outputs="text",
    title="Audio/Video-to-Text",
)

if __name__ == "__main__":
    iface.launch(debug=True)
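
# --- Optional sanity check (a minimal sketch, not part of the app itself) ---
# The pipeline can also be exercised directly, without the Gradio UI. "sample.mp3"
# is a hypothetical local file; return_timestamps=True is supported for Whisper-style
# models and adds segment-level timestamps under the "chunks" key.
#
# result = pipe("sample.mp3", return_timestamps=True)
# print(result["text"])    # full transcription
# print(result["chunks"])  # [{"timestamp": (start, end), "text": "..."}, ...]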