|
|
|
from transformers import AutoModelForCTC, AutoProcessor |
|
from datasets import load_dataset |
|
import datasets |
|
import torch |
|
import sys |
|
|
|
# CLI: first positional argument is a Hugging Face model id or local path.
model_id = sys.argv[1]


# Load the CTC acoustic model and its matching processor
# (feature extractor + tokenizer) from the same checkpoint.
model = AutoModelForCTC.from_pretrained(model_id)

processor = AutoProcessor.from_pretrained(model_id)


# Number of audio examples to transcribe below.
num_samples = 4


# True: stream Common Voice "en" test split; False: use the small
# LibriSpeech dummy split (see the if/else further down).
do_streaming = True
|
|
|
if do_streaming:
    # Stream Common Voice so the full test split is never downloaded; resample
    # on the fly to the 16 kHz the CTC model expects.
    dataset = load_dataset("common_voice", "en", split="test", streaming=True)
    dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))

    dataset_iter = iter(dataset)
    samples = [next(dataset_iter) for _ in range(num_samples)]

    audio_samples = [s["audio"]["array"] for s in samples]
    # All clips must share one sampling rate; fail loudly instead of silently
    # picking an arbitrary one (the old `set(...).pop()` hid mismatches).
    sampling_rates = {s["audio"]["sampling_rate"] for s in samples}
    if len(sampling_rates) != 1:
        raise ValueError(f"Expected a single sampling rate, got {sampling_rates}")
    sampling_rate = sampling_rates.pop()
    text_samples = [s["sentence"] for s in samples]

else:
    # Small offline-friendly dummy dataset for quick checks.
    dataset = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
    # Use num_samples (was hard-coded to 4) so both branches stay in sync.
    samples = dataset[:num_samples]
    audio_samples = [s["array"] for s in samples["audio"]]
    sampling_rates = {s["sampling_rate"] for s in samples["audio"]}
    if len(sampling_rates) != 1:
        raise ValueError(f"Expected a single sampling rate, got {sampling_rates}")
    sampling_rate = sampling_rates.pop()
    text_samples = samples["text"]
|
|
|
# Batch-encode the raw waveforms; padding aligns variable-length clips and
# produces the attention mask used below.
inputs = processor(audio_samples, return_tensors="pt", sampling_rate=sampling_rate, padding=True)


# Pure inference: no gradients needed.
with torch.no_grad():
    logits = model(inputs.input_values, attention_mask=inputs.attention_mask).logits

# scores: (batch, time, vocab) per-frame token probabilities.
scores = torch.nn.functional.softmax(logits, dim=-1)
pred_ids = torch.argmax(logits, dim=-1)
# Probability of the argmax token at each frame. Gather along the vocab
# dimension (-1): the previous `dim=1` indexed the *time* axis with vocab ids,
# which returned token 0's probability at the wrong frame and could raise an
# out-of-bounds error whenever a vocab id exceeded the sequence length.
pred_scores = scores.gather(-1, pred_ids.unsqueeze(-1))[:, :, 0]


# Decode ids to text and request per-word start/end frame offsets so each
# word can be mapped back to its span in `pred_scores`.
output = processor.batch_decode(pred_ids, output_word_offsets=True)
|
|
|
|
|
|
|
def confidence_score(word_dict, index, scores=None):
    """Mean predicted-token probability over a word's frame span.

    Args:
        word_dict: mapping with integer "start_offset"/"end_offset" frame
            indices, as produced by ``batch_decode(..., output_word_offsets=True)``.
        index: batch index of the sample the word belongs to.
        scores: optional (batch, time) tensor of per-frame probabilities.
            Defaults to the module-level ``pred_scores`` so existing callers
            keep working; passing it explicitly makes the function reusable
            and testable without the global.

    Returns:
        float: mean probability over the span, rounded to 4 decimal places.
    """
    if scores is None:
        scores = pred_scores
    probs = scores[index, word_dict["start_offset"]: word_dict["end_offset"]]
    return round(torch.mean(probs).item(), 4)
|
|
|
|
|
# For each transcribed sample, print a banner, the reference transcript, and
# a {word: confidence} mapping built from the decoded word offsets.
for sample_idx in range(num_samples):
    banner = "=" * 20
    print(f"{banner}Output {sample_idx}{banner}")
    print(text_samples[sample_idx])
    word_confidences = {
        word_info["word"]: confidence_score(word_info, sample_idx)
        for word_info in output.word_offsets[sample_idx]
    }
    print(word_confidences)
    print("\n")
|
|