"""Gradio demo: Swedish speech recognition with Whisper plus Swedish GPT text generation.

Audio can come from an uploaded file, a microphone recording, or the audio
track of a YouTube video. Uploaded and recorded audio is transcribed and the
transcript is then continued by a text-generation model; YouTube audio is
only transcribed.
"""

import os
import tempfile

import gradio as gr
from datasets import Audio, Dataset
from moviepy.editor import AudioFileClip
from pytube import YouTube
from transformers import pipeline

# Speech-recognition pipeline; change to "your-username/the-name-you-picked"
# to use a different fine-tuned checkpoint.
pipe1 = pipeline(model="khalidey/ID2223_Lab2_Whisper_SV")
# Text-generation pipeline used to continue the recognized text.
pipe2 = pipeline("text-generation", model="birgermoell/swedish-gpt")


def transcribe(audio):
    """Transcribe an audio file and generate a continuation of the transcript.

    Args:
        audio: Path to the audio file (the Gradio audio components below are
            configured with ``type="filepath"``). Empty/``None`` when the
            user submitted without providing audio.

    Returns:
        A ``(text, generated_text)`` tuple: the recognized speech and the
        text produced by the generation pipeline when prompted with it.

    Raises:
        ValueError: If no audio was provided.
    """
    if not audio:
        # Fail with a clear message instead of an opaque error from inside
        # the speech-recognition pipeline.
        raise ValueError("No audio was provided. Upload or record a clip first.")

    text = pipe1(audio)["text"]
    # Only the first generated sequence is shown, so request just one.
    generated = pipe2(text, max_length=50, num_return_sequences=1)
    return text, generated[0]["generated_text"]


def youtube_link(url, output_dir=None):
    """Download the audio track of a YouTube video.

    Args:
        url: URL of the YouTube video.
        output_dir: Directory to download into. ``None`` keeps pytube's
            default location.

    Returns:
        Path of the downloaded mp4 audio file.

    Raises:
        ValueError: If ``url`` is empty or the video exposes no audio-only
            mp4 stream.
    """
    if not url or not url.strip():
        raise ValueError("A YouTube URL is required.")

    streams = YouTube(url.strip()).streams.filter(only_audio=True, file_extension="mp4")
    stream = streams.first()
    if stream is None:
        # first() yields None when the filter matched nothing.
        raise ValueError(f"No audio-only mp4 stream found for {url!r}.")
    return stream.download(output_path=output_dir)


def convert_to_wav(path, wav_path="audio.wav"):
    """Convert an audio file to WAV, dropping its final two seconds.

    Args:
        path: Path of the source audio file.
        wav_path: Destination path of the WAV file.

    Returns:
        ``wav_path``, the path of the written WAV file.
    """
    audio = AudioFileClip(path)
    try:
        # A negative end time is measured back from the end of the clip.
        audio.subclip(0, -2).write_audiofile(wav_path)
    finally:
        # Release the underlying file reader even if the export fails.
        audio.close()
    return wav_path


def youtube_transcribe(url):
    """Transcribe the speech in the audio track of a YouTube video.

    Args:
        url: URL of the YouTube video.

    Returns:
        The recognized text.

    Raises:
        ValueError: If ``url`` is empty or no suitable audio stream exists.
    """
    # Work in a per-request temporary directory so concurrent requests do not
    # overwrite each other's files and nothing is left behind on disk.
    with tempfile.TemporaryDirectory() as workdir:
        path = youtube_link(url, output_dir=workdir)
        path_wav = convert_to_wav(path, wav_path=os.path.join(workdir, "audio.wav"))
        # Casting to Audio(sampling_rate=16000) resamples the clip to 16 kHz
        # when the column is read.
        audio_dataset = Dataset.from_dict({"audio": [path_wav]}).cast_column(
            "audio", Audio(sampling_rate=16000)
        )
        # Read the column while the files still exist.
        text = pipe1(audio_dataset["audio"])
    return text[0]["text"]


with gr.Blocks() as demo:
    gr.Markdown("Whisper Small Swedish + Swedish GPT")
    gr.Markdown("Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model & text generation with Swedish GPT.")

    with gr.TabItem("Upload from disk"):
        upload_file = gr.Audio(source="upload", type="filepath", label="Upload from disk")
        upload_button = gr.Button("Submit for recognition")
        upload_outputs = [
            gr.Textbox(label="Recognized speech from uploaded file"),
            gr.Textbox(label="Swedish-gpt generated speech from uploaded file"),
        ]

    with gr.TabItem("Record from microphone"):
        record_file = gr.Audio(source="microphone", type="filepath", label="Record from microphone")
        record_button = gr.Button("Submit for recognition")
        record_outputs = [
            gr.Textbox(label="Recognized speech from recordings"),
            gr.Textbox(label="Swedish-gpt generated speech from recordings"),
        ]

    with gr.TabItem("Transcribe from Youtube URL"):
        url = gr.Text(max_lines=1, label="Transcribe from YouTube URL")
        youtube_button = gr.Button("Submit for recognition")
        youtube_outputs = [
            gr.Textbox(label="Recognized speech from URL"),
        ]

    # Wire each tab's button to its handler.
    upload_button.click(
        fn=transcribe,
        inputs=upload_file,
        outputs=upload_outputs,
    )
    record_button.click(
        fn=transcribe,
        inputs=record_file,
        outputs=record_outputs,
    )
    youtube_button.click(
        fn=youtube_transcribe,
        inputs=url,
        outputs=youtube_outputs,
    )

demo.launch()