from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
import gradio as gr

# Load the fine-tuned Whisper model and its processor
model = WhisperForConditionalGeneration.from_pretrained("MaximilianChen/Casper")
processor = WhisperProcessor.from_pretrained(
    "MaximilianChen/Casper", language="catalan", task="transcribe"
)

# When a model object (rather than a model id string) is passed to pipeline(),
# the tokenizer and feature extractor must be supplied explicitly
asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
)


def transcribe_audio(mic=None, file=None):
    # Prefer the microphone recording; fall back to an uploaded file
    if mic is not None:
        audio = mic
    elif file is not None:
        audio = file
    else:
        return "You must either provide a mic recording or a file"
    transcription = asr(audio)["text"]
    return transcription


gr.Interface(
    fn=transcribe_audio,
    inputs=[
        gr.Audio(source="microphone", type="filepath", optional=True),
        gr.Audio(source="upload", type="filepath", optional=True),
    ],
    outputs="text",
).launch()