"""Gradio demo that transcribes Hungarian speech with a fine-tuned Whisper model.

The input is either a video URL (the video is downloaded and its audio track
extracted) or a microphone recording supplied by Gradio as a file path.
"""

import os
import tempfile

from transformers import pipeline
import gradio as gr
import requests
# Star import kept from the original file; only VideoFileClip is used below.
from moviepy.editor import *

# Per-socket-operation timeout (connect / read) for the video download, so a
# stalled server cannot block a request forever.
DOWNLOAD_TIMEOUT_SECONDS = 30
# Size of the chunks streamed to disk while downloading the video.
DOWNLOAD_CHUNK_BYTES = 1 << 20

pipe = pipeline(model="esnagy/whisper-small-hu")


def transcribe_audio(audio_file):
    """Run the speech-recognition pipeline on an audio file.

    Args:
        audio_file: Path of the audio file handed to ``pipe``.

    Returns:
        The value stored under the ``"text"`` key of the pipeline result.
    """
    print("Transcribing audio: ", audio_file)
    return pipe(audio_file)["text"]


def transcribe_video(video_url):
    """Download a video, extract its audio track and transcribe it.

    All intermediate files live in a private temporary directory, so
    concurrent requests cannot overwrite each other's files and everything is
    removed even when a step fails.

    Args:
        video_url: URL of the video to download.

    Returns:
        The transcription of the video's audio track.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
        gr.Error: If the downloaded video has no audio track.
    """
    with tempfile.TemporaryDirectory() as work_dir:
        video_filename = os.path.join(work_dir, "temp_video.mp4")
        audio_file = os.path.join(work_dir, "temp_audio.wav")

        # Stream the body to disk instead of buffering the whole video in
        # memory, and refuse to continue on 4xx/5xx responses so an HTML error
        # page is never passed to moviepy as if it were a video.
        with requests.get(
            video_url, stream=True, timeout=DOWNLOAD_TIMEOUT_SECONDS
        ) as response:
            response.raise_for_status()
            with open(video_filename, "wb") as f:
                for chunk in response.iter_content(chunk_size=DOWNLOAD_CHUNK_BYTES):
                    f.write(chunk)

        # Load the video using moviepy
        video = VideoFileClip(video_filename)
        try:
            audio = video.audio
            if audio is None:
                raise gr.Error("The video has no audio track to transcribe.")
            audio.write_audiofile(audio_file, codec="pcm_s16le")
        finally:
            # Release the reader handles before the temporary directory is
            # deleted.
            video.close()

        return transcribe_audio(audio_file)


def transcribe(video_url="", audio=None):
    """Transcribe a video URL if one is given, otherwise the recorded audio.

    Args:
        video_url: URL of a video; an empty, blank or ``None`` value means
            "use the audio recording instead".
        audio: Path of the recorded audio file, or ``None`` when nothing was
            recorded.

    Returns:
        The transcribed text.

    Raises:
        gr.Error: If neither a video URL nor an audio recording is provided.
    """
    print("[transcribe] Transcribing...")
    print("[transcribe] video_url: ", video_url)
    print("[transcribe] audio: ", audio)

    url = (video_url or "").strip()
    if url:
        return transcribe_video(url)
    if audio is None:
        raise gr.Error("Please enter a video URL or record audio with the microphone.")
    return transcribe_audio(audio)


iface = gr.Interface(
    lambda video_url, audio: transcribe(video_url, audio),
    inputs=[
        gr.Textbox(label="Enter video URL", placeholder="Or leave empty to use microphone"),
        gr.Audio(sources=["microphone"], type="filepath"),
    ],
    outputs="text",
    title="Whisper Small Hungarian",
    description="Realtime demo for Hungarian speech recognition using a fine-tuned Whisper small model. Enter a video URL or record your voice to transcribe.\nExample video URL: https://github.com/pwang697/Scalable-Machine-Learning-Lab_2/raw/test/vasar-hu.mp4",
)

iface.launch()