"""Gradio app that transcribes the audio track of a YouTube video.

The video is downloaded with pytube, its audio is written to ``audio.wav``,
split into ``segment_len``-second pieces, and each piece is passed to the
``errno98/whisper-small-hi`` speech-recognition pipeline. The per-segment
texts are concatenated and shown in the Gradio text output.
"""
import math

import gradio as gr
import moviepy.editor as mp
import validators
import youtube_dl
from pytube import YouTube
from transformers import pipeline

pipe = pipeline(model="errno98/whisper-small-hi")

# Length, in seconds, of each audio piece handed to the pipeline.
segment_len = 30

# Videos longer than this many seconds are rejected by validate_link().
MAX_VIDEO_SECONDS = 600


def download_soundcloud(url):
    """Download the media behind *url* with youtube_dl and return its title.

    The file is written to the current working directory and named after the
    media ID (see ``outtmpl``). The media is downloaded exactly once.

    Args:
        url: Address of the track to download.

    Returns:
        The ``'title'`` entry of the metadata returned by youtube_dl.
    """
    options = {
        'format': 'bestaudio/best',
        'extractaudio': True,  # only keep the audio
        'audioformat': "mp3",  # convert to mp3
        'outtmpl': '%(id)s',  # name the file the ID of the video
        'noplaylist': True,  # only download single song, not playlist
    }
    # A single extract_info(download=True) both fetches the metadata and
    # downloads the file; no separate download() call is needed.
    with youtube_dl.YoutubeDL(options) as ydl:
        info = ydl.extract_info(url, download=True)
    return info['title']


def load_vid(url):
    """Download a YouTube video and extract its audio track to ``audio.wav``.

    Sets the module globals ``video_clip`` (the downloaded video),
    ``audio_wav`` (the extracted audio clip) and ``audio_len`` (its duration
    as reported by moviepy), which ``clippify`` and ``transcribe`` read.

    Args:
        url: Address of the YouTube video.

    Returns:
        The value returned by pytube's ``download()`` for the saved video.

    Raises:
        ValueError: If the video offers no progressive mp4 stream.
    """
    global video_clip
    global audio_wav
    global audio_len

    stream = (
        YouTube(url)
        .streams.filter(progressive=True, file_extension="mp4")
        .first()
    )
    if stream is None:
        # first() yields None when the filter matches nothing; fail with a
        # clear message instead of an AttributeError on None.download().
        raise ValueError("No progressive mp4 stream available for this video")
    video = stream.download()

    video_clip = mp.VideoFileClip(video)
    video_clip.audio.write_audiofile("audio.wav")
    audio_wav = mp.AudioFileClip("audio.wav")
    audio_len = audio_wav.duration
    return video


def validate_link(url):
    """Return True if *url* is a YouTube video of acceptable length.

    Args:
        url: Address to check.

    Returns:
        False when pytube cannot resolve *url* to a video or cannot determine
        its length, or when the video is longer than ``MAX_VIDEO_SECONDS``;
        True otherwise.
    """
    try:
        video_length = YouTube(url).length
    except Exception as err:
        # Boundary check for user input: anything pytube cannot handle is
        # reported as "not valid" so the caller can show a message instead
        # of failing later inside load_vid().
        print(f"Could not read YouTube video: {err}")
        return False

    if video_length > MAX_VIDEO_SECONDS:
        print("Video length is too long (longer than 10 minutes)")
        return False
    return True


def clippify(index, seg_total):
    """Write segment number *index* of the loaded audio to its own wav file.

    Segment ``index`` covers ``segment_len`` seconds of the global
    ``audio_wav`` starting at ``segment_len * index``; the last segment runs
    to the end of the audio.

    Args:
        index: Zero-based segment number.
        seg_total: Total number of segments the audio is divided into.

    Returns:
        The name of the written file, ``audio_out<index>.wav``.
    """
    audio_file = "audio_out" + str(index) + ".wav"
    start = segment_len * index
    # None lets the final segment extend to the end of the clip.
    end = None if index >= seg_total - 1 else segment_len * (index + 1)
    segment = audio_wav.subclip(start, end)
    segment.write_audiofile(audio_file)
    return audio_file


def transcribe(url):
    """Transcribe the YouTube video at *url* and return the text.

    Args:
        url: Address entered by the user, with or without an
            ``http://``/``https://`` prefix.

    Returns:
        The concatenated transcription of all audio segments, or a short
        error message when the URL is malformed, is not an acceptable
        YouTube video, or yields no audio segments.
    """
    url = (url or "").strip()
    # validators.url() needs a scheme, so add one when the user omitted it.
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    if not validators.url(url):
        return "Incorrect URL structure"
    if not validate_link(url):
        return "Not a YouTube video, or the video is longer than 10 minutes"

    load_vid(url)

    segment_count = math.ceil(audio_len / segment_len)
    if segment_count <= 0:
        return "Invalid segment length"

    pieces = []
    for x in range(segment_count):
        audio = clippify(x, segment_count)
        seg_text = pipe(audio, batch_size=512, truncation=True)["text"]
        print("Segtext: ")
        print(seg_text)
        pieces.append(seg_text)
    return "".join(pieces)


iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Textbox(label = "Input the URL of a YouTube video:"),
    outputs="text",
    title="Whisper Small SE",
    description="Video Swedish Transcription",
)

iface.launch()