"""Gradio demo: Swedish speech recognition with a fine-tuned Whisper model.

Audio can be supplied as a microphone recording, an uploaded file, or a
YouTube URL whose audio track is downloaded before transcription.
"""

import os

import gradio as gr
from pytube import YouTube
from transformers import pipeline

# Get model from my model repo
pipe = pipeline(model="Akseluhr/whisper-small-sv-SE-auhr-v2")


def get_audio(url: str) -> str:
    """Download the audio track of a YouTube video and return its file path.

    Args:
        url: Address of the YouTube video.

    Returns:
        Path of the downloaded file in the current directory, renamed so
        that it carries a ``.mp3`` extension.

    Raises:
        ValueError: If no audio-only stream is returned for the video.
    """
    yt = YouTube(url)
    # Pick the first audio-only stream of the video.
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        raise ValueError(f"No audio-only stream found for {url!r}")

    # Write the stream to disk.
    out_file = stream.download(output_path=".")

    # Only the extension is changed here; the audio data is not re-encoded.
    base, _ext = os.path.splitext(out_file)
    new_file = base + '.mp3'
    # os.replace overwrites an existing target on every platform, so asking
    # for the same video twice does not fail on Windows.
    os.replace(out_file, new_file)
    return new_file


def transcribe(rec=None, file=None, url="") -> str:
    """Transcribe the first audio source that was provided.

    Sources are checked in priority order: microphone recording, uploaded
    file, then YouTube URL.

    Args:
        rec: File path of a microphone recording, or ``None``.
        file: File path of an uploaded audio file, or ``None``.
        url: YouTube video URL; empty, whitespace-only or ``None`` means
            that no URL was given.

    Returns:
        The transcribed text, or a short hint when no input was provided.
    """
    # Compare by value/truthiness: `url is not ""` tests object identity,
    # which is unreliable for strings and mishandles a `None` URL.
    url = url.strip() if url else ""

    if rec is not None:
        audio = rec
    elif file is not None:
        audio = file
    elif url:
        audio = get_audio(url)
    else:
        return "Provide a recording or a file."

    return pipe(audio)["text"]


iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", optional=True),
        gr.Audio(source="upload", type="filepath", optional=True),
        gr.Textbox(placeholder='Enter the Youtube video URL', label='URL', optional=True),
    ],
    outputs="text",
    title="Whisper Small Swedish",
    description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper model.",
)

iface.launch()