#!/usr/bin/env python3
from transformers import AutoModelForCTC, Wav2Vec2Processor
from datasets import load_dataset
import datasets
import torch

# "facebook/data2vec-audio-base-10m" is a Data2Vec audio checkpoint, so let
# AutoModelForCTC instantiate the matching model class; the Wav2Vec2 processor
# is shared across Wav2Vec2-style CTC checkpoints.
model = AutoModelForCTC.from_pretrained("facebook/data2vec-audio-base-10m")
processor = Wav2Vec2Processor.from_pretrained("facebook/data2vec-audio-base-10m")

# Load the English (US) split of MINDS-14 and resample the audio to the model's 16 kHz.
minds14 = load_dataset("PolyAI/minds14", "en-US", split="train")
minds14 = minds14.cast_column("audio", datasets.Audio(sampling_rate=16_000))

input_values = processor(
    minds14[0]["audio"]["array"],
    return_tensors="pt",
    sampling_rate=minds14[0]["audio"]["sampling_rate"],
).input_values

with torch.no_grad():
    logits = model(input_values).logits

# Per-frame probabilities over the vocabulary and the greedy (argmax) token ids.
scores = torch.nn.functional.softmax(logits, dim=-1)
pred_ids = torch.argmax(logits, dim=-1)

# Probability of the predicted token at each frame: gather along the vocabulary
# (last) dimension, then drop the trailing singleton dimension.
pred_scores = scores.gather(-1, pred_ids.unsqueeze(-1))[:, :, 0]

# Decode with word offsets so each word knows which logit frames it spans.
output = processor.batch_decode(pred_ids, output_word_offsets=True)

# Word-level confidence: mean probability of the predicted tokens across the
# word's frame span.
def confidence_score(word_dict):
    probs = pred_scores[0, word_dict["start_offset"]: word_dict["end_offset"]]
    return torch.mean(probs)

output["confidence_scores"] = {d["word"]: confidence_score(d) for d in output.word_offsets[0]}

print(output["confidence_scores"])
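
# Word-level timestamps pair naturally with the confidence scores above. The
# code below is a minimal sketch, assuming the config exposes conv_stride (as
# Wav2Vec2-style configs do): the product of the feature-encoder strides is the
# number of input samples per logit frame, so each word offset can be converted
# to seconds by multiplying with frames-per-second reciprocal.
import math

time_per_frame = math.prod(model.config.conv_stride) / processor.feature_extractor.sampling_rate

word_timestamps = [
    {
        "word": d["word"],
        "start_time": round(d["start_offset"] * time_per_frame, 2),
        "end_time": round(d["end_offset"] * time_per_frame, 2),
    }
    for d in output.word_offsets[0]
]

print(word_timestamps)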