# Inference

```python
from transformers import (
    WhisperFeatureExtractor,
    WhisperForConditionalGeneration,  # bug fix: used below but was never imported
    WhisperProcessor,
)
import numpy as np
import librosa
import torch

# Load the fine-tuned Whisper checkpoint and its processor onto the GPU.
model = WhisperForConditionalGeneration.from_pretrained("userdata/ud-whisper-medium-1").cuda()
processor = WhisperProcessor.from_pretrained("userdata/ud-whisper-medium-1")
_ = model.eval()
# Clear forced decoder ids so generate() chooses task/language tokens itself.
model.config.forced_decoder_ids = None

sec = 30            # Whisper consumes fixed 30-second windows
target_sr = 16_000  # Whisper expects 16 kHz input

# Load at native rate, then resample to 16 kHz.
audio, sr = librosa.load('/home/userdata/ariff-wav2vec2/finetune/2887.mp3', sr=None)
audio_array = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)

# Split into 30-second chunks; the final chunk may be shorter than the rest.
chunk = [audio_array[i: i + (target_sr * sec)]
         for i in range(0, len(audio_array), target_sr * sec)]

# Transcribe a single chunk (index 4 chosen manually for inspection).
with torch.no_grad():
    input_features = (processor(chunk[4], sampling_rate=16_000,
                                return_tensors="pt").input_features).cuda()
    predicted_ids = model.generate(input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

res = ''.join(transcription)
print(res)
```

# Play Audio

```python
import IPython.display as ipd

# Play the same chunk that was transcribed above. The original passed the
# whole `chunk` list to np.asarray, which fails (or yields a ragged object
# array) whenever the last chunk is shorter than target_sr * sec; a single
# 1-D chunk is what ipd.Audio expects.
ipd.Audio(data=np.asarray(chunk[4]), autoplay=True, rate=16000)
```