Audio input consists of only 3000. Short-form transcription is activated.no_speech_threshold is set to 0.5, but will be ignored.
from transformers import AutoModelForSpeechSeq2Seq, WhisperProcessor
from datasets import load_dataset
import torch
from tqdm import tqdm
# define our torch configuration
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model_id = "openai/whisper-large-v3"
# load the model + processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype, use_safetensors=True, low_cpu_mem_usage=True)
model = model.to(device)
processor = WhisperProcessor.from_pretrained(model_id)
# load the dataset in streaming mode
dataset = load_dataset("librispeech_asr", "clean", split="validation", streaming=True)
def inference(batch):
    # 1. Pre-process the audio data to log-mel spectrogram inputs
    audio = [sample["array"] for sample in batch["audio"]]
    input_features = processor(audio, sampling_rate=batch["audio"][0]["sampling_rate"], return_tensors="pt").input_features
    input_features = input_features.to(device, dtype=torch_dtype)
    # 2. Auto-regressively generate the predicted token ids
    pred_ids = model.generate(input_features, no_speech_threshold=0.5)
    # 3. Decode the token ids to the final transcription
    batch["transcription"] = processor.batch_decode(pred_ids, skip_special_tokens=True)
    print(batch["transcription"])
    return batch
# run batched inference over the dataset (batch_size=1 here; increase, e.g. to 16, for throughput)
dataset = dataset.map(function=inference, batched=True, batch_size=1)
all_transcriptions = []
all_references = []
# iterate over the dataset and run inference
for result in tqdm(dataset, desc="Evaluating..."):
# print(result["transcription"])
pass
I am using this code, but I get the following warning: "Audio input consists of only 3000. Short-form transcription is activated.no_speech_threshold is set to 0.5, but will be ignored." I want no_speech_threshold to actually be applied. How can I use it?
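From what I can tell, the warning fires because the processor pads/truncates every clip to exactly 3000 log-mel frames (30 seconds), which routes generate() onto Whisper's short-form path, and short-form ignores the long-form-only arguments (no_speech_threshold, logprob_threshold, compression_ratio_threshold, condition_on_prev_tokens). The threshold is only applied on the long-form path, i.e. when the features are longer than 3000 frames, and it works together with logprob_threshold: a segment is skipped as silence only when the no-speech probability is above no_speech_threshold and the average log-probability is below logprob_threshold. Below is a minimal sketch of what I understand the long-form call to look like, adapted from the long-form example in the whisper-large-v3 model card, reusing audio, device and torch_dtype from the code above; the threshold values are illustrative, not prescriptive.

# Sketch: skip truncation so clips longer than 30 s keep their full length;
# this routes generate() to the long-form algorithm where no_speech_threshold applies.
inputs = processor(
    audio,                        # list of raw waveforms, ideally > 30 s for long-form
    sampling_rate=16000,          # LibriSpeech is 16 kHz
    return_tensors="pt",
    truncation=False,             # do not cut inputs to 30 s
    padding="longest",            # pad the batch to its longest clip
    return_attention_mask=True,   # lets generate() see where the padding is
)
inputs = inputs.to(device, dtype=torch_dtype)
pred_ids = model.generate(
    **inputs,                     # input_features + attention_mask
    no_speech_threshold=0.5,      # segment counts as silence only if this fires...
    logprob_threshold=-1.0,       # ...and the avg logprob is also below this value
    temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),  # fallback temperatures on failed checks
    condition_on_prev_tokens=False,
    return_timestamps=True,       # long-form decoding emits timestamp tokens
)
transcription = processor.batch_decode(pred_ids, skip_special_tokens=True)

Note that the LibriSpeech validation clips are mostly shorter than 30 seconds, so with this dataset the inputs still fit within 3000 frames and the short-form path (and the warning) remains; the threshold should only kick in for audio that actually exceeds 30 seconds.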