""" Imports """ from transformers import pipeline from pytube import YouTube import gradio as gr import requests from transformers import AutoTokenizer, AutoModelForSeq2SeqLM """ Pipeline and models """ transcribe_pipe = pipeline(model="Silemo/whisper-it") # change to "your-username/the-name-you-picked" tags_model = AutoModelForSeq2SeqLM.from_pretrained("efederici/text2tags") tags_tokenizer = AutoTokenizer.from_pretrained("efederici/text2tags") """ Methods """ def transcribe(audio): text = transcribe_pipe(audio)["text"] return text def transcribe_video(url): yt = YouTube(url) stream = yt.streams.get_audio_only() # Saves the audio in the /audio folder audio = stream.download() #output_path = "audio/" text = transcribe_and_tag(audio) return text def transcribe_and_tag(audio): text = transcribe(audio) tags = tag(text=text) return text, tags def download_audio(audio_url, filename): # URL of the image to be downloaded is defined as audio_url r = requests.get(audio_url) # create HTTP response object # send a HTTP request to the server and save # the HTTP response in a response object called r with open(filename,'wb') as f: #"audio/" + # Saving received content as a mp3 file in # binary format # write the contents of the response (r.content) # to a new file in binary mode. f.write(r.content) def tag(text: str): """ Generates tags from given text """ text = text.strip().replace('\n', '') text = 'summarize: ' + text tokenized_text = tags_tokenizer.encode(text, return_tensors="pt") tags_ids = tags_model.generate(tokenized_text, num_beams=4, no_repeat_ngram_size=2, max_length=20, early_stopping=True) output = tags_tokenizer.decode(tags_ids[0], skip_special_tokens=True) return output.split(', ') """ Downloading audio files """ audio1_url = "https://github.com/Silemo/sml-lab2-2023-manfredi-meneghin/raw/main/task1/audio/offer.mp3" audio1_filename = "offer.mp3" download_audio(audio1_url, audio1_filename) audio2_url = "https://github.com/Silemo/sml-lab2-2023-manfredi-meneghin/raw/main/task1/audio/fantozzi.mp3" audio2_filename = "fantozzi.mp3" download_audio(audio2_url, audio2_filename) """ Interfaces """ audio_transcription = gr.Textbox(label="Transcription") audio_tags = gr.Textbox(label="Tags") yt_transcription = gr.Textbox(label="Transcription") yt_tags = gr.Textbox(label="Tags") # Multiple interfaces using tabs -> https://github.com/gradio-app/gradio/issues/450 io1 = gr.Interface( fn = transcribe_and_tag, inputs = gr.Audio(sources=["upload", "microphone"], type="filepath"), outputs = [audio_transcription, audio_tags], examples = [ [audio1_filename], [audio2_filename], ], title = "Whisper Small - Italian - Microphone or Audio file", description = "Realtime demo for Italian speech recognition using a fine-tuned Whisper small model. It uses the computer microphone as audio input. It outputs a transcription and the tags of the text.", ) io2 = gr.Interface( fn = transcribe_video, inputs = gr.Textbox(label = "YouTube URL", placeholder = "https://youtu.be/9DImRZERJNs?si=1Lme7o_KH2oCxU7y"), outputs=[yt_transcription, yt_tags], examples=[ # Meloni - Confindustria ["https://www.youtube.com/watch?v=qMslwA7RCcc"], # Montemagno - Ripartire da zero ["https://www.youtube.com/watch?v=WlT3dCAGjRo"], ], title = "Whisper Small - Italian - YouTube link", description = "Realtime demo for Italian speech recognition using a fine-tuned Whisper small model. It uses a YouTube link as audio input. It outputs a transcription and the tags of the text.", ) gr.TabbedInterface( [io1, io2], {"Microphone or audio file", "YouTube"} ).launch()