"""Gradio app that transcribes the audio of a YouTube video with OpenAI's Whisper."""

import gradio as gr
import whisper
from pytube import YouTube

# Module-level model state shared by the Gradio callbacks below.
loaded_model = whisper.load_model("base")
current_size = 'base'


def inference(link):
    """Transcribe the audio of the YouTube video at ``link``.

    Downloads the first audio-only stream to ``audio.mp4`` and runs the
    currently loaded Whisper model on it.

    Args:
        link: URL of the YouTube video.

    Returns:
        The ``'text'`` entry of the Whisper transcription result.
    """
    yt = YouTube(link)
    path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp4")
    # NOTE: a DecodingOptions(without_timestamps=True) object used to be built
    # here but was never passed to transcribe(); the dead local was removed
    # without changing what transcribe() receives.
    results = loaded_model.transcribe(path)
    return results['text']


def change_model(size):
    """Swap the loaded Whisper model for the one named ``size``.

    Does nothing when ``size`` is already the active model size.

    Args:
        size: Whisper model name selected in the dropdown.
    """
    # Without this declaration the assignments below would create locals:
    # the comparison would raise UnboundLocalError and inference() would keep
    # using the old module-level model.
    global loaded_model, current_size
    if size == current_size:
        return
    loaded_model = whisper.load_model(size)
    current_size = size


def populate_metadata(link):
    """Look up the thumbnail URL and title for the video at ``link``.

    Args:
        link: URL of the YouTube video; may be empty while the user edits
            the textbox.

    Returns:
        A ``(thumbnail_url, title)`` tuple, or ``(None, None)`` when no link
        has been entered.
    """
    # This runs on every change of the textbox, including when it is cleared,
    # so an empty value must not be handed to YouTube().
    if not link or not link.strip():
        return None, None
    yt = YouTube(link)
    return yt.thumbnail_url, yt.title


title = "Youtube Whisperer"
description = "Speech to text transcription of Youtube videos using OpenAI's Whisper"
block = gr.Blocks()

with block:
    gr.HTML(
        """

Youtube Whisperer

Speech to text transcription of Youtube videos using OpenAI's Whisper

"""
    )
    with gr.Group():
        with gr.Box():
            sz = gr.Dropdown(
                label="Model Size",
                choices=['base', 'small', 'medium', 'large'],
                value='base',
            )
            link = gr.Textbox(label="YouTube Link")
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                # Named title_label so the module-level `title` string above
                # is not overwritten by the widget.
                title_label = gr.Label(label="Video Title", placeholder="Title")
                img = gr.Image(label="Thumbnail")
            text = gr.Textbox(
                label="Transcription",
                placeholder="Transcription Output",
                lines=5,
            )
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                btn = gr.Button("Transcribe")

            # Events
            btn.click(inference, inputs=[link], outputs=[text])
            link.change(populate_metadata, inputs=[link], outputs=[img, title_label])
            sz.change(change_model, inputs=[sz], outputs=[])

block.launch(debug=True)