Olive_Whisper_ASR / evaluation.py
sam2ai's picture
Synced repo using 'sync_with_huggingface' Github Action
6de3e11
raw
history blame
5.13 kB
import argparse
import functools
import gc
import os
import evaluate
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from utils.data_utils import DataCollatorSpeechSeq2SeqWithPadding, remove_punctuation, to_simple
from utils.reader import CustomDataset
from utils.utils import print_arguments, add_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg("test_data", type=str, default="dataset/test.json", help="测试集的路径")
add_arg("model_path", type=str, default="models/whisper-tiny-finetune", help="合并模型的路径,或者是huggingface上模型的名称")
add_arg("batch_size", type=int, default=16, help="评估的batch size")
add_arg("num_workers", type=int, default=8, help="读取数据的线程数量")
add_arg("language", type=str, default="Chinese", help="设置语言,可全称也可简写,如果为None则评估的是多语言")
add_arg("remove_pun", type=bool, default=True, help="是否移除标点符号")
add_arg("to_simple", type=bool, default=True, help="是否转为简体中文")
add_arg("timestamps", type=bool, default=False, help="评估时是否使用时间戳数据")
add_arg("min_audio_len", type=float, default=0.5, help="最小的音频长度,单位秒")
add_arg("max_audio_len", type=float, default=30, help="最大的音频长度,单位秒")
add_arg("local_files_only", type=bool, default=True, help="是否只在本地加载模型,不尝试下载")
add_arg("task", type=str, default="transcribe", choices=['transcribe', 'translate'], help="模型的任务")
add_arg("metric", type=str, default="cer", choices=['cer', 'wer'], help="评估方式")
args = parser.parse_args()
print_arguments(args)
# 判断模型路径是否合法
assert 'openai' == os.path.dirname(args.model_path) or os.path.exists(args.model_path), \
f"模型文件{args.model_path}不存在,请检查是否已经成功合并模型,或者是否为huggingface存在模型"
# 获取Whisper的数据处理器,这个包含了特征提取器、tokenizer
processor = WhisperProcessor.from_pretrained(args.model_path,
language=args.language,
task=args.task,
no_timestamps=not args.timestamps,
local_files_only=args.local_files_only)
forced_decoder_ids = processor.get_decoder_prompt_ids()
# 获取模型
model = WhisperForConditionalGeneration.from_pretrained(args.model_path,
device_map="auto",
local_files_only=args.local_files_only)
model.eval()
# 获取测试数据
test_dataset = CustomDataset(data_list_path=args.test_data,
processor=processor,
timestamps=args.timestamps,
min_duration=args.min_audio_len,
max_duration=args.max_audio_len)
print(f"测试数据:{len(test_dataset)}")
# 数据padding器
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
eval_dataloader = DataLoader(test_dataset, batch_size=args.batch_size,
num_workers=args.num_workers, collate_fn=data_collator)
# 获取评估方法
metric = evaluate.load(args.metric)
# 开始评估
for step, batch in enumerate(tqdm(eval_dataloader)):
with torch.cuda.amp.autocast():
with torch.no_grad():
generated_tokens = (
model.generate(
input_features=batch["input_features"].cuda(),
decoder_input_ids=batch["labels"][:, :4].cuda(),
forced_decoder_ids=forced_decoder_ids,
max_new_tokens=255).cpu().numpy())
labels = batch["labels"].cpu().numpy()
labels = np.where(labels != -100, labels, processor.tokenizer.pad_token_id)
# 将预测和实际的token转换为文本
decoded_preds = processor.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
decoded_labels = processor.tokenizer.batch_decode(labels, skip_special_tokens=True)
# 删除标点符号
if args.remove_pun:
decoded_preds = remove_punctuation(decoded_preds)
decoded_labels = remove_punctuation(decoded_labels)
# 将繁体中文总成简体中文
if args.to_simple:
decoded_preds = to_simple(decoded_preds)
decoded_labels = to_simple(decoded_labels)
metric.add_batch(predictions=decoded_preds, references=decoded_labels)
# 删除计算的记录
del generated_tokens, labels, batch
gc.collect()
# 计算评估结果
m = metric.compute()
print(f"评估结果:{args.metric}={round(m, 5)}")