"""Gradio app: Swedish speech recognition with a fine-tuned Whisper small model.

Two tabs: microphone-audio transcription and YouTube-video transcription.
Translation and text-to-speech helpers exist but are currently disabled in
the transcription flow (see commented-out calls in transcribe_audio).
"""
import gradio as gr
from deep_translator import GoogleTranslator
# import deepl
from transformers import pipeline
import os
from gtts import gTTS

# Speech-to-text pipeline backed by the fine-tuned Whisper small model.
pipe = pipeline(model="freeja/lab2-whisper-sv")

# Single source of truth for display-name -> ISO 639-1 code (was duplicated
# in translate_audio and text_to_speech).
LANGUAGE_CODES = {
    "English": "en",
    "Spanish": "es",
    "German": "de",
    "French": "fr",
    "Italian": "it",
}


def transcribe_audio(audio, language):
    """Transcribe a Swedish audio recording to text.

    Args:
        audio: Filepath to the recorded audio (Gradio passes type="filepath").
        language: Target translation language; currently unused because the
            translation/TTS steps below are disabled.

    Returns:
        The transcribed Swedish text.
    """
    transcribed = pipe(audio)["text"]
    # Translation and speech synthesis are intentionally disabled for now:
    # trans_text = translate_audio(transcribed, language)
    # text_to_speech(trans_text, language)
    return transcribed


def translate_audio(text, language):
    """Translate Swedish text to the requested target language.

    Args:
        text: Swedish source text.
        language: Display name of the target language (key of LANGUAGE_CODES).

    Returns:
        The translated text.

    Raises:
        KeyError: If `language` is not a supported display name.
    """
    lang = LANGUAGE_CODES[language]
    return GoogleTranslator(source="sv", target=lang).translate(text)


def text_to_speech(text, language):
    """Synthesize speech for `text` in the requested language.

    Bug fix: the original called gTTS(text, lang, slow=False), which passes
    the language code positionally into gTTS's second parameter `tld`, not
    `lang`. Keywords are now explicit.

    NOTE(review): the gTTS object is neither saved nor returned, so this
    currently has no observable effect — presumably a .save(...) call is
    still TODO; confirm intended output path before wiring it in.
    """
    lang = LANGUAGE_CODES[language]
    gTTS(text=text, lang=lang, slow=False)


def transcribe_video(URL):
    """Download a YouTube video's audio track and transcribe it.

    Args:
        URL: URL of the YouTube video.

    Returns:
        The transcribed text.
    """
    # TODO(review): `YouTube` requires `from pytube import YouTube`; the
    # original referenced the undefined name `Youtube` and had no import.
    video = YouTube(URL)
    stream = video.streams.get_audio_only()
    # Bug fix: transcribe the downloaded file path; the original passed the
    # Stream object itself to the pipeline.
    audio_path = stream.download()
    return pipe(audio_path)["text"]


youtube_func = gr.Interface(
    fn=transcribe_video,
    inputs="text",
    outputs="text",
    title="Whisper Small Swedish",  # fixed: stray double comma made this a syntax error
    description="Realtime demo for Swedish speech recognition with translation using a fine-tuned Whisper small model",
)

transcribe_func = gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(source="microphone", type="filepath", label="Transcribe from Microphone"),
        # Fixed: the choices offered "Dutch", which has no entry in
        # LANGUAGE_CODES and would raise KeyError; replaced with "German",
        # which the translation table actually supports.
        gr.Dropdown(["English", "Spanish", "German", "French", "Italian"], value="English", label="Translate to "),
    ],
    outputs="text",
    # Fixed: missing opening quote on the title, stray double comma, and the
    # Interface(...) call was never closed.
    title="Whisper Small Swedish",
    description="Realtime demo for Swedish speech recognition with translation using a fine-tuned Whisper small model",
)

iface = gr.TabbedInterface([transcribe_func, youtube_func], ["Audio to Text", "Video transcription"])
iface.launch()