import gradio as gr
import whisper
from pytube import YouTube
from typing import List, Tuple
from transformers import pipeline


def transcribe(url: str, model_size: str) -> Tuple[str, str]:
    # Download the audio-only stream of the video.
    yt_client = YouTube(url=url)
    audio_file = yt_client.streams.filter(only_audio=True)[0].download(filename="file.mp4")

    # Load the Whisper model of the requested size.
    model = whisper.load_model(model_size)

    # Load the audio into the model.
    audio = whisper.load_audio(audio_file)

    # Get results: a timestamped transcript plus a summary of the full text.
    result = model.transcribe(audio)
    return format_result(result), summarize(result["text"])


def summarize(text: str) -> str:
    # Summarize the transcript with a pretrained BART model.
    summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
    out = summarizer(text, max_length=150, min_length=30, do_sample=False)[0]["summary_text"]
    return out


def format_result(result: dict) -> str:
    # Format each transcribed segment as "from <start> to <end> <text>".
    out = []
    for item in result["segments"]:
        out.append(f"from {item['start']:6.2f} to {item['end']:6.2f} {item['text']}")
    return "\n".join(out)


def get_model_sizes() -> List[str]:
    """
    :rtype: list
    :return: List of possible sizes of the Whisper model.
    """
    return list(whisper._MODELS.keys())


title = "YouTube transcribe + summarization"
desc = "Transcribe YouTube videos using OpenAI Whisper."

with gr.Blocks() as demo:
    gr.HTML(title)
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                f"""
                {desc}
                """
            )
            with gr.Row():
                model_size = gr.Dropdown(
                    label="Model size", choices=get_model_sizes(), value="tiny"
                )
                url = gr.Textbox(label="YouTube URL")
            with gr.Row():
                text = gr.Textbox(label="Transcription", lines=10)
            with gr.Row():
                summarization = gr.Textbox(label="Summarization", lines=5)
            with gr.Row().style(equal_height=True):
                submit_button = gr.Button("Submit")

    submit_button.click(
        transcribe,
        inputs=[url, model_size],
        outputs=[text, summarization],
    )

demo.launch()