Audio input consists of only 3000. Short-form transcription is activated.no_speech_threshold is set to 0.5, but will be ignored.
from transformers import AutoModelForSpeechSeq2Seq, WhisperProcessor
from datasets import load_dataset
import torch
from tqdm import tqdm
# define our torch configuration
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "openai/whisper-large-v3"
# load the model + processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype, use_safetensors=True, low_cpu_mem_usage=True)
model = model.to(device)
processor = WhisperProcessor.from_pretrained(model_id)
# load the dataset in streaming mode
dataset = load_dataset("librispeech_asr", "clean", split="validation", streaming=True)
def inference(batch):
    # 1. Pre-process the audio data to log-mel spectrogram inputs
    audio = [sample["array"] for sample in batch["audio"]]
    input_features = processor(audio, sampling_rate=batch["audio"][0]["sampling_rate"], return_tensors="pt").input_features
    input_features = input_features.to(device, dtype=torch_dtype)
    # 2. Auto-regressively generate the predicted token ids
    pred_ids = model.generate(input_features, no_speech_threshold=0.5)
    # 3. Decode the token ids to the final transcription
    batch["transcription"] = processor.batch_decode(pred_ids, skip_special_tokens=True)
    print(batch["transcription"])
    return batch
# run batched inference over the dataset (batch_size=1 here; increase, e.g. to 16, for throughput)
dataset = dataset.map(function=inference, batched=True, batch_size=1)
all_transcriptions = []
all_references = []
# iterate over the dataset and run inference
for result in tqdm(dataset, desc="Evaluating..."):
# print(result["transcription"])
pass
I am using this code, but I get the following warning: "Audio input consists of only 3000. Short-form transcription is activated.no_speech_threshold is set to 0.5, but will be ignored." I want no_speech_threshold to actually be applied. How can I use it?
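From what I can tell, the warning fires because the processor pads/truncates every clip to exactly 3000 log-mel frames (30 seconds), which routes generate() onto Whisper's short-form path, and short-form ignores the long-form-only arguments (no_speech_threshold, logprob_threshold, compression_ratio_threshold, condition_on_prev_tokens). The threshold is only applied on the long-form path, i.e. when the features are longer than 3000 frames, and it works together with logprob_threshold: a segment is skipped as silence only when the no-speech probability is above no_speech_threshold and the average log-probability is below logprob_threshold. Below is a minimal sketch of what I understand the long-form call to look like, adapted from the long-form example in the whisper-large-v3 model card, reusing audio, device and torch_dtype from the code above; the threshold values are illustrative, not prescriptive.

# Sketch: skip truncation so clips longer than 30 s keep their full length;
# this routes generate() to the long-form algorithm where no_speech_threshold applies.
inputs = processor(
    audio,                        # list of raw waveforms, ideally > 30 s for long-form
    sampling_rate=16000,          # LibriSpeech is 16 kHz
    return_tensors="pt",
    truncation=False,             # do not cut inputs to 30 s
    padding="longest",            # pad the batch to its longest clip
    return_attention_mask=True,   # lets generate() see where the padding is
)
inputs = inputs.to(device, dtype=torch_dtype)
pred_ids = model.generate(
    **inputs,                     # input_features + attention_mask
    no_speech_threshold=0.5,      # segment counts as silence only if this fires...
    logprob_threshold=-1.0,       # ...and the avg logprob is also below this value
    temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),  # fallback temperatures on failed checks
    condition_on_prev_tokens=False,
    return_timestamps=True,       # long-form decoding emits timestamp tokens
)
transcription = processor.batch_decode(pred_ids, skip_special_tokens=True)

Note that the LibriSpeech validation clips are mostly shorter than 30 seconds, so with this dataset the inputs still fit within 3000 frames and the short-form path (and the warning) remains; the threshold should only kick in for audio that actually exceeds 30 seconds.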