"""Gradio app: transcribe Italian speech and translate it to English.

Two tabs are exposed:

* "Video to English Text" downloads the audio track of a YouTube video,
  transcribes it with a Whisper model fine-tuned for Italian, and translates
  the transcription with DeepL.
* "Audio to English Text" does the same for a microphone recording.

The DeepL API key is read from the ``DEEPL_KEY`` environment variable.
"""
import os

import deepl
import gradio as gr
from pytube import YouTube
from transformers import pipeline

# DeepL target language code used for every translation.
TARGET_LANG = "EN-GB"
# Directory (relative to the working directory) where YouTube audio is saved.
DOWNLOAD_DIR = "yt_video"

deepl_key = os.environ.get("DEEPL_KEY")
if not deepl_key:
    # Fail at startup with an actionable message instead of passing None on
    # to the DeepL client.
    raise RuntimeError(
        "The DEEPL_KEY environment variable must be set to a DeepL API key."
    )
translator = deepl.Translator(deepl_key)
pipe = pipeline(
    "automatic-speech-recognition",
    model="FredBonux/whisper-small-it",
)


def _transcribe_italian(audio_path):
    """Run the speech-recognition pipeline on a file and return its text."""
    return pipe(audio_path)["text"]


def _translate_to_english(text):
    """Translate *text* to ``TARGET_LANG`` with DeepL.

    Blank input (e.g. a silent recording) is returned as an empty string
    without calling the API, since there is nothing to translate.
    """
    if not text or not text.strip():
        return ""
    return translator.translate_text(text, target_lang=TARGET_LANG).text


def transcribe(audio):
    """Transcribe a recorded audio file and translate it to English.

    Args:
        audio: Path to the recorded audio file as supplied by the Gradio
            ``Audio`` input (``type="filepath"``), or ``None`` when the form
            is submitted without a recording.

    Returns:
        A ``(italian_text, english_text)`` tuple.

    Raises:
        gr.Error: If no audio was provided.
    """
    if not audio:
        raise gr.Error("No audio received. Please record something first.")

    ita = _transcribe_italian(audio)
    eng = _translate_to_english(ita)
    print(f"{ita} -> {eng}")
    return ita, eng


def transcribe_url(url):
    """Download a YouTube video's audio, transcribe it and translate it.

    Args:
        url: URL of the YouTube video.

    Returns:
        A ``(italian_text, english_text)`` tuple.

    Raises:
        gr.Error: If the URL is empty or the video exposes no audio-only
            stream.
    """
    url = "" if url is None else str(url).strip()
    if not url:
        raise gr.Error("Please enter a YouTube video URL.")

    youtube = YouTube(url)
    print("Downloading video")
    stream = youtube.streams.filter(only_audio=True).first()
    if stream is None:
        # first() yields None when the filter matches nothing; report that
        # explicitly rather than failing with an AttributeError below.
        raise gr.Error("No audio stream is available for this video.")
    audio = stream.download(DOWNLOAD_DIR)
    print("Downloaded")

    text_it = _transcribe_italian(audio)
    print(f"{text_it}")
    text_en = _translate_to_english(text_it)
    print(f"{text_en}")
    return text_it, text_en


url_demo = gr.Interface(
    fn=transcribe_url,
    inputs="text",
    outputs=[
        gr.Textbox(label="Transcribed text"),
        gr.Textbox(label="English translation"),
    ],
    title="Italian video to english text",
    description="Transcribing italian video to text and translating it to english!",
)

voice_demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=[
        gr.Textbox(label="Transcribed text"),
        gr.Textbox(label="English translation"),
    ],
    title="Italian recorded speech to english text",
    description="Transcribing italian speech to text and translating it to english!",
)

app = gr.TabbedInterface(
    [url_demo, voice_demo],
    ["Video to English Text", "Audio to English Text"],
)

app.launch()