How to get the accuracy of a transcription from the model?

#98
by Atulad - opened

How can I get the accuracy of a transcription from the model?

I have the same issue. I'd like to have the probability of each chunk. I set generate_kwargs={"language": language, "output_scores": True, "output_logits": True} when calling pipe, but it does not return any probability. Please let me know if you find a solution.
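
For reference: the automatic-speech-recognition pipeline only returns the decoded text (and chunks), so output_scores / output_logits passed through generate_kwargs are not surfaced in its output. Below is a minimal sketch (not from this thread) of getting per-token probabilities by calling model.generate directly and converting the scores with transformers' compute_transition_scores. The dummy LibriSpeech sample is only for illustration; swap in your own audio.

from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
from datasets import load_dataset
import torch

model_id = "openai/whisper-large-v3"
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

# a single short validation sample, just for illustration
sample = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")[0]["audio"]
inputs = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt")

# return_dict_in_generate + output_scores exposes the per-step scores alongside the token ids
out = model.generate(
    inputs.input_features,
    max_new_tokens=128,
    return_dict_in_generate=True,
    output_scores=True,
)

# compute_transition_scores converts the raw scores into log-probabilities of the chosen tokens
transition_scores = model.compute_transition_scores(out.sequences, out.scores, normalize_logits=True)

token_probs = transition_scores[0].exp()          # per-token probability
avg_logprob = transition_scores[0].mean().item()  # rough sequence-level confidence

print(processor.batch_decode(out.sequences, skip_special_tokens=True)[0])
print("avg log-prob:", avg_logprob)

# align each generated token with its probability (forced/special tokens score ~1.0)
gen_tokens = out.sequences[0, -transition_scores.shape[1]:]
for tok, p in zip(gen_tokens, token_probs):
    print(repr(processor.decode(tok)), float(p))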

You can evaluate the model with the word-error rate (WER) metric using the following example. First install the Python dependencies:

pip install --upgrade pip
pip install --upgrade transformers datasets[audio] evaluate jiwer

Then, run the following code snippet to evaluate the model on the LibriSpeech ASR dataset:

from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
from datasets import load_dataset
from evaluate import load
import torch
from tqdm import tqdm

# define our torch configuration
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model_id = "openai/whisper-large-v3"

# load the model + processor
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype, use_safetensors=True, low_cpu_mem_usage=True)
model = model.to(device)
processor = AutoProcessor.from_pretrained(model_id)

# load the dataset with streaming mode
dataset = load_dataset("librispeech_asr", "clean", split="validation", streaming=True)

# define the evaluation metric
wer_metric = load("wer")

def inference(batch):
    # 1. Pre-process the audio data to log-mel spectrogram inputs
    audio = [sample["array"] for sample in batch["audio"]]
    input_features = processor(audio, sampling_rate=batch["audio"][0]["sampling_rate"], return_tensors="pt").input_features
    input_features = input_features.to(device, dtype=torch_dtype)
    
    # 2. Auto-regressively generate the predicted token ids
    pred_ids = model.generate(input_features, max_new_tokens=128)
    
    # 3. Decode the token ids to the final transcription
    batch["transcription"] = processor.batch_decode(pred_ids, skip_special_tokens=True)
    batch["reference"] = batch["text"]
    return batch

# batch size 16 inference
dataset = dataset.map(function=inference, batched=True, batch_size=16)

all_transcriptions = []
all_references = []

# iterate over the dataset and run inference
for result in tqdm(dataset, desc="Evaluating..."):
    all_transcriptions.append(result["transcription"])
    all_references.append(result["reference"])

# normalize predictions and references
all_transcriptions = [processor.tokenizer.normalize(transcription) for transcription in all_transcriptions]
all_references = [processor.tokenizer.normalize(reference) for reference in all_references]

# compute the WER metric
wer = 100 * wer_metric.compute(predictions=all_transcriptions, references=all_references)
print(wer)

Thank you for your response, but I'm looking for how confident the model is in its transcription. I probably didn't frame my question properly. I appreciate your help anyway.

Try a model like this one: https://github.com/thomasmol/cog-whisper-diarization

https://replicate.com/thomasmol/whisper-diarization

It gives you a confidence level for every word, so you can gauge how confident the model is in its transcription. Here is an example output segment (a sketch for getting the same information locally follows after it):

{
  "end": "281.0",
  "text": "We shared it with researchers. Right.",
  "start": "279.76",
  "words": [
    {
      "end": 279.78,
      "word": "We",
      "start": 279.76,
      "probability": 0.92041015625
    },
    {
      "end": 280.02,
      "word": "shared",
      "start": 279.78,
      "probability": 0.99072265625
    },
    {
      "end": 280.14,
      "word": "it",
      "start": 280.02,
      "probability": 0.99951171875
    },
    {
      "end": 280.24,
      "word": "with",
      "start": 280.14,
      "probability": 1
    },
    {
      "end": 280.54,
      "word": "researchers.",
      "start": 280.24,
      "probability": 0.94384765625
    },
    {
      "end": 281,
      "word": "Right.",
      "start": 280.78,
      "probability": 0.73193359375
    }
  ],
  "speaker": "SPEAKER_00",
  "avg_logprob": -0.21625000337759653
}
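
For a rough local equivalent of that output, here is a minimal sketch assuming the openai-whisper package (pip install openai-whisper) rather than the transformers pipeline above. With word_timestamps=True, transcribe() attaches a probability to every word and an avg_logprob to every segment; "audio.mp3" is a placeholder path.

import whisper  # assumes the openai-whisper package is installed

model = whisper.load_model("large-v3")

# word_timestamps=True makes transcribe() attach per-word timing and probability
result = model.transcribe("audio.mp3", word_timestamps=True)  # placeholder path

for segment in result["segments"]:
    print(segment["text"], "avg_logprob:", segment["avg_logprob"])
    for word in segment["words"]:
        print(f'  {word["word"]!r} [{word["start"]:.2f}-{word["end"]:.2f}] p={word["probability"]:.3f}')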
