|
|
|
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor |
|
from datasets import load_dataset |
|
import datasets |
|
import torch |
|
|
|
# CTC acoustic model and its paired feature-extractor/tokenizer.
model = Wav2Vec2ForCTC.from_pretrained("facebook/data2vec-audio-base-10m")
processor = Wav2Vec2Processor.from_pretrained("facebook/data2vec-audio-base-10m")

# English split of MINDS-14; re-cast the audio column so clips are decoded
# at the 16 kHz sampling rate the model expects.
minds14 = load_dataset("PolyAI/minds14", "en-US", split="train")
minds14 = minds14.cast_column("audio", datasets.Audio(sampling_rate=16_000))

# Turn the first utterance into model-ready input values.
first_audio = minds14[0]["audio"]
input_values = processor(
    first_audio["array"],
    return_tensors="pt",
    sampling_rate=first_audio["sampling_rate"],
).input_values
|
|
|
# Inference only — no gradients needed.
with torch.no_grad():
    logits = model(input_values).logits

# Per-frame probability distribution over the CTC vocabulary:
# shape (batch, frames, vocab).
scores = torch.nn.functional.softmax(logits, dim=-1)
pred_ids = torch.argmax(logits, dim=-1)

# Probability of the argmax token at each frame -> shape (batch, frames).
# FIX: gather must index the vocabulary axis (dim=2) — `pred_ids` holds
# vocabulary ids, so gathering along dim=1 (the time axis) picked the wrong
# entries and could even index out of range.
pred_scores = scores.gather(2, pred_ids.unsqueeze(-1))[:, :, 0]

# Greedy CTC decode, also returning word offsets (frame indices) so each
# word can later be mapped back onto its span of frame probabilities.
output = processor.batch_decode(pred_ids, output_word_offsets=True)
|
|
|
|
|
def confidence_score(word_dict, scores=None):
    """Mean per-frame probability over a decoded word's frame span.

    Args:
        word_dict: mapping with integer ``"start_offset"`` / ``"end_offset"``
            frame indices, as produced by
            ``batch_decode(..., output_word_offsets=True)``.
        scores: optional ``(batch, frames)`` tensor of per-frame
            probabilities.  Defaults to the module-level ``pred_scores``
            (kept for backward compatibility with existing call sites).

    Returns:
        A 0-dim tensor holding the mean probability of the span; an empty
        span (``start_offset == end_offset``) yields NaN, as ``torch.mean``
        of an empty tensor does.
    """
    frame_scores = pred_scores if scores is None else scores
    probs = frame_scores[0, word_dict["start_offset"]: word_dict["end_offset"]]
    return torch.mean(probs)
|
|
|
# Attach a word -> confidence mapping for the first (only) decoded sample.
# NOTE(review): keys are the raw word strings, so a word that occurs more
# than once keeps only its last confidence — confirm that is acceptable.
word_confidences = {}
for word_dict in output.word_offsets[0]:
    word_confidences[word_dict["word"]] = confidence_score(word_dict)
output["confidence_scores"] = word_confidences

print(output["confidence_scores"])
|
|