NeMo / examples /nlp /language_modeling /megatron_gpt_test.py

thanks to NVIDIA ❤

7934b29 over 2 years ago

2.77 kB

	# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	from omegaconf.omegaconf import OmegaConf
	from pytorch_lightning import Trainer

	from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
	from nemo.collections.nlp.modules.common.megatron.megatron_utils import compute_model_parallel_rank
	from nemo.collections.nlp.parts.nlp_overrides import (
	NLPDDPStrategy,
	NLPNativeMixedPrecisionPlugin,
	NLPPrecisionPlugin,
	NLPSaveRestoreConnector,
	)
	from nemo.core.config import hydra_runner
	from nemo.utils import logging
	from nemo.utils.app_state import AppState


	@hydra_runner(config_path="conf", config_name="megatron_gpt_config")
	def main(cfg) -> None:
	logging.info("\n\n************ Experiment configuration *********")
	logging.info(f'\n{OmegaConf.to_yaml(cfg)}')

	trainer = None
	if cfg.trainer.precision == 16:
	trainer = Trainer(
	plugins=[
	NLPNativeMixedPrecisionPlugin(
	init_scale=cfg.model.get('native_amp_init_scale', 2 ** 32),
	growth_interval=cfg.model.get('native_amp_growth_interval', 1000),
	),
	],
	strategy=NLPDDPStrategy(),
	**cfg.trainer,
	)
	elif cfg.trainer.precision == 'bf16':
	trainer = Trainer(plugins=[NLPNativeBfloat16PrecisionPlugin(),], strategy=NLPDDPStrategy(), **cfg.trainer,)
	else:
	trainer = Trainer(plugins=[NLPPrecisionPlugin()], strategy=NLPDDPStrategy(), **cfg.trainer)

	app_state = AppState()
	app_state.model_parallel_size = cfg.model.tensor_model_parallel_size
	app_state.model_parallel_rank = compute_model_parallel_rank(trainer.local_rank, app_state.model_parallel_size)

	model = MegatronGPTModel.restore_from(
	cfg.restore_from_path, trainer=trainer, save_restore_connector=NLPSaveRestoreConnector(),
	)

	# Note: most nemo models must have the data paths configured before instantiating the model
	# MegatronGPTMOdel sets up the data in the PTL method .setup which happens after DDP spawns.
	model.cfg.data.splits_string = cfg.model.data.splits_string

	trainer.test(model)


	if __name__ == '__main__':
	main()