from transformers import pipeline
import gradio as gr
from pytube import YouTube
import os

# Identifier of the fine-tuned checkpoint in the author's model repo.
_MODEL_ID = "Akseluhr/whisper-small-sv-SE-auhr-v2"

# Built once at import time and shared by every transcription request.
pipe = pipeline(model=_MODEL_ID)

def get_audio(url):
    """Download the audio track of a YouTube video and return its local path.

    The first audio-only stream reported by pytube is written to the current
    working directory and the resulting file is renamed to end in ``.mp3``.
    The rename only changes the extension; no transcoding is performed.

    Args:
        url: URL of the YouTube video.

    Returns:
        Path of the downloaded file with its extension replaced by ``.mp3``.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = YouTube(url)
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        # ``first()`` yields None for an empty query; fail with a clear
        # message instead of an AttributeError on ``.download``.
        raise ValueError(f"No audio-only stream found for {url!r}")
    print(stream)

    downloaded_path = stream.download(output_path=".")
    base, _ext = os.path.splitext(downloaded_path)
    mp3_path = base + '.mp3'
    # os.replace overwrites an existing target on every platform, whereas
    # os.rename raises FileExistsError on Windows when the same video has
    # already been fetched once.
    os.replace(downloaded_path, mp3_path)
    return mp3_path

def transcribe(rec=None, file=None, url=""):
    """Transcribe audio from a recording, an uploaded file, or a YouTube URL.

    The sources are checked in order of priority: ``rec``, then ``file``,
    then ``url``. Only the first one that is provided is used.

    Args:
        rec: Audio from the microphone input, or None when absent.
        file: Audio from the upload input, or None when absent.
        url: YouTube video URL; empty, whitespace-only, or None means absent.

    Returns:
        The ``"text"`` field of the pipeline output, or a short prompt
        message when no input was supplied at all.
    """
    if rec is not None:
        audio = rec
    elif file is not None:
        audio = file
    else:
        # Compare by value, not identity: ``url is not ""`` depends on string
        # interning and lets None or blank input through to the downloader.
        url = url.strip() if url else ""
        if not url:
            return "Provide a recording or a file."
        audio = get_audio(url)

    return pipe(audio)["text"]


# The three alternative inputs, in the same order as transcribe's parameters:
# a microphone recording, an uploaded audio file, and a YouTube URL.
_mic_input = gr.Audio(source="microphone", type="filepath", optional=True)
_upload_input = gr.Audio(source="upload", type="filepath", optional=True)
_url_input = gr.Textbox(
    placeholder='Enter the Youtube video URL',
    label='URL',
    optional=True,
)

# Wire the inputs to transcribe and show its return value as plain text.
iface = gr.Interface(
    fn=transcribe,
    inputs=[_mic_input, _upload_input, _url_input],
    outputs="text",
    title="Whisper Small Swedish",
    description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper model.",
)

# Start serving the demo.
iface.launch()