import json
import os
import shutil
import subprocess
import tempfile
import unittest

import numpy as np
import torch
import transformers
from datasets import load_dataset
from transformers import AutoFeatureExtractor, AutoModelForImageClassification, Trainer, TrainingArguments, pipeline

from evaluate import evaluator, load

from .utils import slow

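# Each test computes metrics with the `transformers` example scripts (or `Trainer.evaluate`)
# on a small evaluation slice, then scores the same model and data with `evaluate`'s
# `evaluator` and asserts that both report identical numbers.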
class TestEvaluatorTrainerParity(unittest.TestCase):
    def setUp(self):
        self.dir_path = tempfile.mkdtemp("evaluator_trainer_parity_test")

        # Shallow, sparse clone of the transformers repo (pinned to the installed release
        # unless a dev version is running) so each test can check out only the example it needs.
        transformers_version = transformers.__version__
        branch = ""
        if not transformers_version.endswith(".dev0"):
            branch = f"--branch v{transformers_version}"
        subprocess.run(
            f"git clone --depth 3 --filter=blob:none --sparse {branch} https://github.com/huggingface/transformers",
            shell=True,
            cwd=self.dir_path,
        )

    def tearDown(self):
        shutil.rmtree(self.dir_path, ignore_errors=True)

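    # SST-2 parity: run_glue.py --do_eval vs. the text-classification evaluator on the
    # first 80 validation examples.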
    def test_text_classification_parity(self):
        model_name = "philschmid/tiny-bert-sst2-distilled"

        subprocess.run(
            "git sparse-checkout set examples/pytorch/text-classification",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        subprocess.run(
            f"python examples/pytorch/text-classification/run_glue.py"
            f" --model_name_or_path {model_name}"
            f" --task_name sst2"
            f" --do_eval"
            f" --max_seq_length 9999999999"  # oversized on purpose so the script falls back to tokenizer.model_max_length
            f" --output_dir {os.path.join(self.dir_path, 'textclassification_sst2_transformers')}"
            f" --max_eval_samples 80",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            os.path.join(self.dir_path, "textclassification_sst2_transformers", "eval_results.json"), "r"
        ) as f:
            transformers_results = json.load(f)

        eval_dataset = load_dataset("glue", "sst2", split="validation[:80]")

        pipe = pipeline(task="text-classification", model=model_name, tokenizer=model_name)

        task_evaluator = evaluator(task="text-classification")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="accuracy",
            input_column="sentence",
            label_column="label",
            label_mapping={"negative": 0, "positive": 1},
            strategy="simple",
        )

        self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])

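    # MNLI parity: exercises the two-input-column path (premise/hypothesis) of the
    # text-classification evaluator against run_glue.py. Marked @slow.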
    @slow
    def test_text_classification_parity_two_columns(self):
        model_name = "prajjwal1/bert-tiny-mnli"
        max_eval_samples = 150

        subprocess.run(
            "git sparse-checkout set examples/pytorch/text-classification",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        subprocess.run(
            f"python examples/pytorch/text-classification/run_glue.py"
            f" --model_name_or_path {model_name}"
            f" --task_name mnli"
            f" --do_eval"
            f" --max_seq_length 256"
            f" --output_dir {os.path.join(self.dir_path, 'textclassification_mnli_transformers')}"
            f" --max_eval_samples {max_eval_samples}",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            os.path.join(self.dir_path, "textclassification_mnli_transformers", "eval_results.json"), "r"
        ) as f:
            transformers_results = json.load(f)

        eval_dataset = load_dataset("glue", "mnli", split=f"validation_matched[:{max_eval_samples}]")

        pipe = pipeline(task="text-classification", model=model_name, tokenizer=model_name, max_length=256)

        task_evaluator = evaluator(task="text-classification")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="accuracy",
            input_column="premise",
            second_input_column="hypothesis",
            label_column="label",
            label_mapping={"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2},
        )

        self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])

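    # Beans image classification parity: a Trainer.evaluate() run is compared against the
    # image-classification evaluator on the first 120 validation images.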
    def test_image_classification_parity(self):
        model_name = "douwekiela/resnet-18-finetuned-dogfood"
        dataset_name = "beans"
        max_eval_samples = 120

        raw_dataset = load_dataset(dataset_name, split="validation")
        eval_dataset = raw_dataset.select(range(max_eval_samples))

        feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
        model = AutoModelForImageClassification.from_pretrained(model_name)

        def collate_fn(examples):
            # Stack the preprocessed images and labels into the batch format the model expects.
            pixel_values = torch.stack(
                [torch.tensor(feature_extractor(example["image"])["pixel_values"][0]) for example in examples]
            )
            labels = torch.tensor([example["labels"] for example in examples])
            return {"pixel_values": pixel_values, "labels": labels}

        metric = load("accuracy")
        trainer = Trainer(
            model=model,
            args=TrainingArguments(
                output_dir=os.path.join(self.dir_path, "imageclassification_beans_transformers"),
                remove_unused_columns=False,
            ),
            train_dataset=None,
            eval_dataset=eval_dataset,
            compute_metrics=lambda p: metric.compute(
                predictions=np.argmax(p.predictions, axis=1), references=p.label_ids
            ),
            tokenizer=None,
            data_collator=collate_fn,
        )

        metrics = trainer.evaluate()
        trainer.save_metrics("eval", metrics)

        with open(
            os.path.join(self.dir_path, "imageclassification_beans_transformers", "eval_results.json"), "r"
        ) as f:
            transformers_results = json.load(f)

        pipe = pipeline(task="image-classification", model=model_name, feature_extractor=model_name)

        task_evaluator = evaluator(task="image-classification")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="accuracy",
            input_column="image",
            label_column="labels",
            label_mapping=model.config.label2id,
            strategy="simple",
        )

        self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])

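    # SQuAD v1 and v2 parity: run_qa.py --do_eval vs. the question-answering evaluator on
    # 100 validation examples each, checking F1 and exact-match (plus HasAns/NoAns for v2).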
    def test_question_answering_parity(self):
        model_name_v1 = "anas-awadalla/bert-tiny-finetuned-squad"
        model_name_v2 = "mrm8488/bert-tiny-finetuned-squadv2"

        subprocess.run(
            "git sparse-checkout set examples/pytorch/question-answering",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        # SQuAD v1-style dataset (every question has an answer)
        subprocess.run(
            f"python examples/pytorch/question-answering/run_qa.py"
            f" --model_name_or_path {model_name_v1}"
            f" --dataset_name squad"
            f" --do_eval"
            f" --output_dir {os.path.join(self.dir_path, 'questionanswering_squad_transformers')}"
            f" --max_eval_samples 100"
            f" --max_seq_length 384",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            os.path.join(self.dir_path, "questionanswering_squad_transformers", "eval_results.json"), "r"
        ) as f:
            transformers_results = json.load(f)

        eval_dataset = load_dataset("squad", split="validation[:100]")

        pipe = pipeline(
            task="question-answering",
            model=model_name_v1,
            tokenizer=model_name_v1,
            max_answer_len=30,
            padding="max_length",
        )

        task_evaluator = evaluator(task="question-answering")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="squad",
            strategy="simple",
        )

        self.assertEqual(transformers_results["eval_f1"], evaluator_results["f1"])
        self.assertEqual(transformers_results["eval_exact_match"], evaluator_results["exact_match"])

        # SQuAD v2-style dataset (includes unanswerable questions)
        subprocess.run(
            f"python examples/pytorch/question-answering/run_qa.py"
            f" --model_name_or_path {model_name_v2}"
            f" --dataset_name squad_v2"
            f" --version_2_with_negative"
            f" --do_eval"
            f" --output_dir {os.path.join(self.dir_path, 'questionanswering_squadv2_transformers')}"
            f" --max_eval_samples 100"
            f" --max_seq_length 384",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            os.path.join(self.dir_path, "questionanswering_squadv2_transformers", "eval_results.json"), "r"
        ) as f:
            transformers_results = json.load(f)

        eval_dataset = load_dataset("squad_v2", split="validation[:100]")

        pipe = pipeline(
            task="question-answering",
            model=model_name_v2,
            tokenizer=model_name_v2,
            max_answer_len=30,
        )

        task_evaluator = evaluator(task="question-answering")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="squad_v2",
            strategy="simple",
            squad_v2_format=True,
        )

        self.assertEqual(transformers_results["eval_f1"], evaluator_results["f1"])
        self.assertEqual(transformers_results["eval_HasAns_f1"], evaluator_results["HasAns_f1"])
        self.assertEqual(transformers_results["eval_NoAns_f1"], evaluator_results["NoAns_f1"])

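    # CoNLL-2003 NER parity: run_ner.py --do_eval vs. the token-classification evaluator
    # with the seqeval metric on 500 validation examples.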
    def test_token_classification_parity(self):
        model_name = "hf-internal-testing/tiny-bert-for-token-classification"
        n_samples = 500

        subprocess.run(
            "git sparse-checkout set examples/pytorch/token-classification",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        subprocess.run(
            f"python examples/pytorch/token-classification/run_ner.py"
            f" --model_name_or_path {model_name}"
            f" --dataset_name conll2003"
            f" --do_eval"
            f" --output_dir {os.path.join(self.dir_path, 'tokenclassification_conll2003_transformers')}"
            f" --max_eval_samples {n_samples}",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            os.path.join(self.dir_path, "tokenclassification_conll2003_transformers", "eval_results.json"), "r"
        ) as f:
            transformers_results = json.load(f)

        eval_dataset = load_dataset("conll2003", split=f"validation[:{n_samples}]")

        pipe = pipeline(task="token-classification", model=model_name)

        task_evaluator = evaluator(task="token-classification")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="seqeval",
            input_column="tokens",
            label_column="ner_tags",
            strategy="simple",
        )

        self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["overall_accuracy"])
        self.assertEqual(transformers_results["eval_f1"], evaluator_results["overall_f1"])