import gradio as gr
import torch
from transformers import pipeline

title = "Transcribe speech in several languages"

# Use the GPU when one is available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# German-specific wav2vec2 model and multilingual Whisper model.
asr_pipe_audio2Text_Ge = pipeline(
    task="automatic-speech-recognition",
    model="jonatasgrosman/wav2vec2-large-xlsr-53-german",
    device=device,
)
asr_pipe_whisper = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-medium",
    device=device,
)


def transcribeFile(inputlang: str, audio_path: str) -> str:
    """Transcribe an uploaded audio file, chunking long inputs for batching."""
    if inputlang == "German":
        transcription = asr_pipe_audio2Text_Ge(
            audio_path,
            chunk_length_s=10,
            stride_length_s=(4, 2),
            batch_size=32,
        )
    else:  # "Auto Detect": let Whisper identify the spoken language itself
        transcription = asr_pipe_whisper(
            audio_path,
            chunk_length_s=10,
            stride_length_s=(4, 2),
            generate_kwargs={"task": "transcribe"},
            batch_size=32,
        )
    return transcription["text"]


def translateAudio(audio_path: str) -> str:
    """Translate the speech in the audio file to English using Whisper."""
    translationOutput = asr_pipe_whisper(
        audio_path,
        max_new_tokens=256,
        generate_kwargs={"task": "translate"},
    )
    return translationOutput["text"]


def transcribeFileMulti(inputlang: str, audio_path: str) -> str:
    """Transcribe a microphone recording and append an English translation."""
    if inputlang == "German":
        transcription = asr_pipe_audio2Text_Ge(audio_path)
    else:  # "Auto Detect"
        transcription = asr_pipe_whisper(audio_path)
    translation = translateAudio(audio_path)
    # Return the transcription followed by its English translation.
    return transcription["text"] + "\n" + translation


app1 = gr.Interface(
    fn=transcribeFile,
    inputs=[
        gr.Radio(
            ["Auto Detect", "German"],
            value="Auto Detect",
            label="Source Language",
            info="Select the language of the speech you want to transcribe",
        ),
        gr.Audio(source="upload", type="filepath", label="Upload audio file"),
    ],
    outputs="text",
    title=title,
)

app2 = gr.Interface(
    fn=transcribeFileMulti,
    inputs=[
        gr.Radio(
            ["Auto Detect", "German"],
            value="Auto Detect",
            label="Source Language",
            info="Select the language of the speech you want to transcribe",
        ),
        gr.Audio(source="microphone", type="filepath"),
    ],
    outputs="text",
    title=title,
)

# Expose both apps as tabs: file upload and live microphone input.
demo = gr.TabbedInterface([app1, app2], ["Audio File", "Microphone"])

if __name__ == "__main__":
    demo.launch()