tests / sagemaker /test_multi_node_model_parallel.py

add model

5f89bea almost 3 years ago

No virus

4.52 kB

	import json
	import os
	import subprocess
	import unittest
	from ast import literal_eval

	import pytest

	from parameterized import parameterized, parameterized_class

	from . import is_sagemaker_available


	if is_sagemaker_available():
	from sagemaker import Session, TrainingJobAnalytics
	from sagemaker.huggingface import HuggingFace


	@pytest.mark.skipif(
	literal_eval(os.getenv("TEST_SAGEMAKER", "False")) is not True,
	reason="Skipping test because should only be run when releasing minor transformers version",
	)
	@pytest.mark.usefixtures("sm_env")
	@parameterized_class(
	[
	{
	"framework": "pytorch",
	"script": "run_glue_model_parallelism.py",
	"model_name_or_path": "roberta-large",
	"instance_type": "ml.p3dn.24xlarge",
	"results": {"train_runtime": 1600, "eval_accuracy": 0.3, "eval_loss": 1.2},
	},
	{
	"framework": "pytorch",
	"script": "run_glue.py",
	"model_name_or_path": "roberta-large",
	"instance_type": "ml.p3dn.24xlarge",
	"results": {"train_runtime": 1600, "eval_accuracy": 0.3, "eval_loss": 1.2},
	},
	]
	)
	class MultiNodeTest(unittest.TestCase):
	def setUp(self):
	if self.framework == "pytorch":
	subprocess.run(
	f"cp ./examples/pytorch/text-classification/run_glue.py {self.env.test_path}/run_glue.py".split(),
	encoding="utf-8",
	check=True,
	)
	assert hasattr(self, "env")

	def create_estimator(self, instance_count):

	# configuration for running training on smdistributed Model Parallel
	mpi_options = {
	"enabled": True,
	"processes_per_host": 8,
	}
	smp_options = {
	"enabled": True,
	"parameters": {
	"microbatches": 4,
	"placement_strategy": "spread",
	"pipeline": "interleaved",
	"optimize": "speed",
	"partitions": 4,
	"ddp": True,
	},
	}

	distribution = {"smdistributed": {"modelparallel": smp_options}, "mpi": mpi_options}

	name_extension = "trainer" if self.script == "run_glue.py" else "smtrainer"
	# creates estimator
	return HuggingFace(
	entry_point=self.script,
	source_dir=self.env.test_path,
	role=self.env.role,
	image_uri=self.env.image_uri,
	base_job_name=f"{self.env.base_job_name}-{instance_count}-smp-{name_extension}",
	instance_count=instance_count,
	instance_type=self.instance_type,
	debugger_hook_config=False,
	hyperparameters={
	**self.env.hyperparameters,
	"model_name_or_path": self.model_name_or_path,
	"max_steps": 500,
	},
	metric_definitions=self.env.metric_definitions,
	distribution=distribution,
	py_version="py36",
	)

	def save_results_as_csv(self, job_name):
	TrainingJobAnalytics(job_name).export_csv(f"{self.env.test_path}/{job_name}_metrics.csv")

	# @parameterized.expand([(2,), (4,),])
	@parameterized.expand([(1,)])
	def test_scripz(self, instance_count):
	# create estimator
	estimator = self.create_estimator(instance_count)

	# run training
	estimator.fit()

	# result dataframe
	result_metrics_df = TrainingJobAnalytics(estimator.latest_training_job.name).dataframe()

	# extract kpis
	eval_accuracy = list(result_metrics_df[result_metrics_df.metric_name == "eval_accuracy"]["value"])
	eval_loss = list(result_metrics_df[result_metrics_df.metric_name == "eval_loss"]["value"])
	# get train time from SageMaker job, this includes starting, preprocessing, stopping
	train_runtime = (
	Session().describe_training_job(estimator.latest_training_job.name).get("TrainingTimeInSeconds", 999999)
	)

	# assert kpis
	assert train_runtime <= self.results["train_runtime"]
	assert all(t >= self.results["eval_accuracy"] for t in eval_accuracy)
	assert all(t <= self.results["eval_loss"] for t in eval_loss)

	# dump tests result into json file to share in PR
	with open(f"{estimator.latest_training_job.name}.json", "w") as outfile:
	json.dump({"train_time": train_runtime, "eval_accuracy": eval_accuracy, "eval_loss": eval_loss}, outfile)