import json
import os
import shutil
import subprocess
import tempfile
import unittest

import numpy as np
import torch
import transformers
from datasets import load_dataset
from transformers import AutoFeatureExtractor, AutoModelForImageClassification, Trainer, TrainingArguments, pipeline

from evaluate import evaluator, load

from .utils import slow

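# Each test computes metrics with the `transformers` example scripts (or `Trainer.evaluate`)
# on a small evaluation slice, then scores the same model and data with `evaluate`'s
# `evaluator` and asserts that both report identical numbers.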
class TestEvaluatorTrainerParity(unittest.TestCase):
    def setUp(self):
        self.dir_path = tempfile.mkdtemp("evaluator_trainer_parity_test")

        # Shallow, sparse clone of the transformers repo (pinned to the installed release
        # unless a dev version is running) so each test can check out only the example it needs.
        transformers_version = transformers.__version__
        branch = ""
        if not transformers_version.endswith(".dev0"):
            branch = f"--branch v{transformers_version}"
        subprocess.run(
            f"git clone --depth 3 --filter=blob:none --sparse {branch} https://github.com/huggingface/transformers",
            shell=True,
            cwd=self.dir_path,
        )

    def tearDown(self):
        shutil.rmtree(self.dir_path, ignore_errors=True)

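    # SST-2 parity: run_glue.py --do_eval vs. the text-classification evaluator on the
    # first 80 validation examples.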
    def test_text_classification_parity(self):
        model_name = "philschmid/tiny-bert-sst2-distilled"

        subprocess.run(
            "git sparse-checkout set examples/pytorch/text-classification",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        subprocess.run(
            f"python examples/pytorch/text-classification/run_glue.py"
            f" --model_name_or_path {model_name}"
            f" --task_name sst2"
            f" --do_eval"
            f" --max_seq_length 9999999999"  # oversized on purpose so the script falls back to tokenizer.model_max_length
            f" --output_dir {os.path.join(self.dir_path, 'textclassification_sst2_transformers')}"
            f" --max_eval_samples 80",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            os.path.join(self.dir_path, "textclassification_sst2_transformers", "eval_results.json"), "r"
        ) as f:
            transformers_results = json.load(f)

        eval_dataset = load_dataset("glue", "sst2", split="validation[:80]")

        pipe = pipeline(task="text-classification", model=model_name, tokenizer=model_name)

        task_evaluator = evaluator(task="text-classification")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="accuracy",
            input_column="sentence",
            label_column="label",
            label_mapping={"negative": 0, "positive": 1},
            strategy="simple",
        )

        self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])

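    # MNLI parity: exercises the two-input-column path (premise/hypothesis) of the
    # text-classification evaluator against run_glue.py. Marked @slow.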
    @slow
    def test_text_classification_parity_two_columns(self):
        model_name = "prajjwal1/bert-tiny-mnli"
        max_eval_samples = 150

        subprocess.run(
            "git sparse-checkout set examples/pytorch/text-classification",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        subprocess.run(
            f"python examples/pytorch/text-classification/run_glue.py"
            f" --model_name_or_path {model_name}"
            f" --task_name mnli"
            f" --do_eval"
            f" --max_seq_length 256"
            f" --output_dir {os.path.join(self.dir_path, 'textclassification_mnli_transformers')}"
            f" --max_eval_samples {max_eval_samples}",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            os.path.join(self.dir_path, "textclassification_mnli_transformers", "eval_results.json"), "r"
        ) as f:
            transformers_results = json.load(f)

        eval_dataset = load_dataset("glue", "mnli", split=f"validation_matched[:{max_eval_samples}]")

        pipe = pipeline(task="text-classification", model=model_name, tokenizer=model_name, max_length=256)

        task_evaluator = evaluator(task="text-classification")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="accuracy",
            input_column="premise",
            second_input_column="hypothesis",
            label_column="label",
            label_mapping={"LABEL_0": 0, "LABEL_1": 1, "LABEL_2": 2},
        )

        self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])

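    # Beans image classification parity: a Trainer.evaluate() run is compared against the
    # image-classification evaluator on the first 120 validation images.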
    def test_image_classification_parity(self):
        model_name = "douwekiela/resnet-18-finetuned-dogfood"
        dataset_name = "beans"
        max_eval_samples = 120

        raw_dataset = load_dataset(dataset_name, split="validation")
        eval_dataset = raw_dataset.select(range(max_eval_samples))

        feature_extractor = AutoFeatureExtractor.from_pretrained(model_name)
        model = AutoModelForImageClassification.from_pretrained(model_name)

        def collate_fn(examples):
            # Stack the preprocessed images and labels into the batch format the model expects.
            pixel_values = torch.stack(
                [torch.tensor(feature_extractor(example["image"])["pixel_values"][0]) for example in examples]
            )
            labels = torch.tensor([example["labels"] for example in examples])
            return {"pixel_values": pixel_values, "labels": labels}

        metric = load("accuracy")
        trainer = Trainer(
            model=model,
            args=TrainingArguments(
                output_dir=os.path.join(self.dir_path, "imageclassification_beans_transformers"),
                remove_unused_columns=False,
            ),
            train_dataset=None,
            eval_dataset=eval_dataset,
            compute_metrics=lambda p: metric.compute(
                predictions=np.argmax(p.predictions, axis=1), references=p.label_ids
            ),
            tokenizer=None,
            data_collator=collate_fn,
        )

        metrics = trainer.evaluate()
        trainer.save_metrics("eval", metrics)

        with open(
            os.path.join(self.dir_path, "imageclassification_beans_transformers", "eval_results.json"), "r"
        ) as f:
            transformers_results = json.load(f)

        pipe = pipeline(task="image-classification", model=model_name, feature_extractor=model_name)

        task_evaluator = evaluator(task="image-classification")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="accuracy",
            input_column="image",
            label_column="labels",
            label_mapping=model.config.label2id,
            strategy="simple",
        )

        self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["accuracy"])

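    # SQuAD v1 and v2 parity: run_qa.py --do_eval vs. the question-answering evaluator on
    # 100 validation examples each, checking F1 and exact-match (plus HasAns/NoAns for v2).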
    def test_question_answering_parity(self):
        model_name_v1 = "anas-awadalla/bert-tiny-finetuned-squad"
        model_name_v2 = "mrm8488/bert-tiny-finetuned-squadv2"

        subprocess.run(
            "git sparse-checkout set examples/pytorch/question-answering",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        # SQuAD v1-style dataset (every question has an answer)
        subprocess.run(
            f"python examples/pytorch/question-answering/run_qa.py"
            f" --model_name_or_path {model_name_v1}"
            f" --dataset_name squad"
            f" --do_eval"
            f" --output_dir {os.path.join(self.dir_path, 'questionanswering_squad_transformers')}"
            f" --max_eval_samples 100"
            f" --max_seq_length 384",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            os.path.join(self.dir_path, "questionanswering_squad_transformers", "eval_results.json"), "r"
        ) as f:
            transformers_results = json.load(f)

        eval_dataset = load_dataset("squad", split="validation[:100]")

        pipe = pipeline(
            task="question-answering",
            model=model_name_v1,
            tokenizer=model_name_v1,
            max_answer_len=30,
            padding="max_length",
        )

        task_evaluator = evaluator(task="question-answering")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="squad",
            strategy="simple",
        )

        self.assertEqual(transformers_results["eval_f1"], evaluator_results["f1"])
        self.assertEqual(transformers_results["eval_exact_match"], evaluator_results["exact_match"])

        # SQuAD v2-style dataset (includes unanswerable questions)
        subprocess.run(
            f"python examples/pytorch/question-answering/run_qa.py"
            f" --model_name_or_path {model_name_v2}"
            f" --dataset_name squad_v2"
            f" --version_2_with_negative"
            f" --do_eval"
            f" --output_dir {os.path.join(self.dir_path, 'questionanswering_squadv2_transformers')}"
            f" --max_eval_samples 100"
            f" --max_seq_length 384",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            os.path.join(self.dir_path, "questionanswering_squadv2_transformers", "eval_results.json"), "r"
        ) as f:
            transformers_results = json.load(f)

        eval_dataset = load_dataset("squad_v2", split="validation[:100]")

        pipe = pipeline(
            task="question-answering",
            model=model_name_v2,
            tokenizer=model_name_v2,
            max_answer_len=30,
        )

        task_evaluator = evaluator(task="question-answering")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="squad_v2",
            strategy="simple",
            squad_v2_format=True,
        )

        self.assertEqual(transformers_results["eval_f1"], evaluator_results["f1"])
        self.assertEqual(transformers_results["eval_HasAns_f1"], evaluator_results["HasAns_f1"])
        self.assertEqual(transformers_results["eval_NoAns_f1"], evaluator_results["NoAns_f1"])

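    # CoNLL-2003 NER parity: run_ner.py --do_eval vs. the token-classification evaluator
    # with the seqeval metric on 500 validation examples.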
    def test_token_classification_parity(self):
        model_name = "hf-internal-testing/tiny-bert-for-token-classification"
        n_samples = 500

        subprocess.run(
            "git sparse-checkout set examples/pytorch/token-classification",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        subprocess.run(
            f"python examples/pytorch/token-classification/run_ner.py"
            f" --model_name_or_path {model_name}"
            f" --dataset_name conll2003"
            f" --do_eval"
            f" --output_dir {os.path.join(self.dir_path, 'tokenclassification_conll2003_transformers')}"
            f" --max_eval_samples {n_samples}",
            shell=True,
            cwd=os.path.join(self.dir_path, "transformers"),
        )

        with open(
            os.path.join(self.dir_path, "tokenclassification_conll2003_transformers", "eval_results.json"), "r"
        ) as f:
            transformers_results = json.load(f)

        eval_dataset = load_dataset("conll2003", split=f"validation[:{n_samples}]")

        pipe = pipeline(task="token-classification", model=model_name)

        task_evaluator = evaluator(task="token-classification")
        evaluator_results = task_evaluator.compute(
            model_or_pipeline=pipe,
            data=eval_dataset,
            metric="seqeval",
            input_column="tokens",
            label_column="ner_tags",
            strategy="simple",
        )

        self.assertEqual(transformers_results["eval_accuracy"], evaluator_results["overall_accuracy"])
        self.assertEqual(transformers_results["eval_f1"], evaluator_results["overall_f1"])