"""Gradio app that transcribes the audio of a YouTube video with Whisper."""

import gradio as gr
import whisper
from pytube import YouTube

# Model used by `inference`; rebound by `change_model` when the dropdown changes.
loaded_model = whisper.load_model("base")


def inference(link):
    """Download the audio of a YouTube video and return its transcription.

    Args:
        link: URL of the YouTube video, passed to ``pytube.YouTube``.

    Returns:
        The ``'text'`` entry of the result of ``loaded_model.transcribe``.
    """
    yt = YouTube(link)
    # Take the first audio-only stream and save it as "audio.mp4".
    path = yt.streams.filter(only_audio=True)[0].download(filename="audio.mp4")
    results = loaded_model.transcribe(path)
    return results['text']


def change_model(size):
    """Load the Whisper model named ``size`` and make it the active model.

    Args:
        size: Model name passed to ``whisper.load_model``.
    """
    # Without this declaration the assignment below would only create a local
    # variable, and `inference` would keep using the previously loaded model.
    global loaded_model
    loaded_model = whisper.load_model(size)


title = "Youtube Whisperer"
description = "Speech to text transcription of Youtube videos using OpenAI's Whisper"

block = gr.Blocks()

with block:
    # The literal is kept at column 0 so its runtime value is unchanged.
    gr.HTML(
        """

Youtube Whisperer

Speech to text transcription of Youtube videos using OpenAI's Whisper

"""
    )
    # NOTE(review): the original indentation was lost; the nesting below is
    # reconstructed from statement order -- confirm against the intended layout.
    with gr.Group():
        with gr.Box():
            sz = gr.Dropdown(
                label="Model Size",
                choices=['base', 'small', 'medium', 'large'],
                value='base',
            )
            # Reload the model whenever a different size is selected.
            sz.change(change_model, inputs=[sz], outputs=[])
            link = gr.Textbox(label="YouTube Link")
            text = gr.Textbox(
                label="Transcription",
                placeholder="Transcription Output",
                lines=5,
            )
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                btn = gr.Button("Transcribe")
            btn.click(inference, inputs=[link], outputs=[text])

block.launch()