import gradio as gr
from datasets import Dataset, Audio
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load the model and processor once at startup instead of on every request.
processor = WhisperProcessor.from_pretrained("openai/whisper-medium")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-medium")


def transcribe(audio):
    # Wrap the uploaded file in a Dataset so it is decoded and resampled to 16 kHz,
    # the sampling rate Whisper expects.
    ds = Dataset.from_dict({"audio": [audio]}).cast_column("audio", Audio(sampling_rate=16_000))
    input_speech = ds[0]["audio"]["array"]

    # Convert the waveform to log-Mel input features.
    input_features = processor(
        input_speech, sampling_rate=16_000, return_tensors="pt"
    ).input_features

    # Force Norwegian transcription.
    forced_decoder_ids = processor.get_decoder_prompt_ids(language="no", task="transcribe")
    predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)

    # batch_decode returns a list of strings; return the single transcription.
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]


gr.Interface(
    title="OpenAI Whisper ASR Gradio Norwegian Web UI",
    fn=transcribe,
    inputs=[gr.Audio(type="filepath")],
    outputs=["textbox"],
).launch()