""" |
|
Script to convert a NeMo 2.0 checkpoint to enable Speculative Decoding. |
|
|
|
This script adds speculative decoding capabilities to an existing NeMo 2.0 model checkpoint. |
|
It supports different speculative decoding algorithms and parallel configurations. |
|
|
|
Example usage: |
|
python scripts/llm/gpt_convert_speculative.py \ |
|
--model_path /path/to/nemo2_ckpt \ |
|
--export_dir /path/to/export_dir \ |
|
--specdec_algo eagle3 \ |
|
--tp_size 2 \ |
|
--devices 2 |
|
|
|
Available speculative decoding algorithms in Nemo 2.0: |
|
- Eagle 3 (default): Extrapolation Algorithm for Greater Language-model Efficiency |
|
|
|
For more details on speculative decoding algorithms, refer to the NVIDIA Model Optimizer documentation: |
|
https://nvidia.github.io/TensorRT-Model-Optimizer/guides/7_speculative_decoding.html |
|
""" |

from argparse import ArgumentParser

from nemo.collections.llm.modelopt import (
    apply_speculative_decoding,
    setup_trainer_and_restore_model_with_modelopt_spec,
)
from nemo.collections.llm.utils import barrier
from nemo.lightning.ckpt_utils import ckpt_to_context_subdir
from nemo.lightning.io.pl import TrainerContext
from nemo.utils.get_rank import is_global_rank_zero


def get_args():
    """Parse the command line arguments."""
    parser = ArgumentParser(description="""Enable speculative decoding on a NeMo 2.0 checkpoint.""")

    parser.add_argument("--model_path", type=str, required=True, help="""Path to the NeMo 2.0 checkpoint""")
    parser.add_argument("--specdec_algo", type=str, default="eagle3", help="""Speculative decoding algorithm to use""")
    parser.add_argument("--export_dir", type=str, required=True, help="""Path to export the converted checkpoint""")
    parser.add_argument("--tp_size", type=int, default=1, help="""Tensor parallel size""")
    parser.add_argument("--pp_size", type=int, default=1, help="""Pipeline parallel size""")
    parser.add_argument("--devices", type=int, default=1, help="""Number of GPUs to use per node""")
    parser.add_argument("--num_nodes", type=int, default=1, help="""Number of nodes to use""")
    parser.add_argument("--tokenizer", type=str, default=None, help="""Name of tokenizer model to override the default""")
    parser.add_argument("--legacy_ckpt", action="store_true", help="""Load a checkpoint saved with TE < 1.14""")

    return parser.parse_args()


if __name__ == "__main__":
    args = get_args()

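    # Restore the base model and trainer using the ModelOpt-compatible model spec.
    # `inference_only=True` is passed since the conversion itself involves no training.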
    model, trainer = setup_trainer_and_restore_model_with_modelopt_spec(
        model_path=args.model_path,
        tensor_model_parallel_size=args.tp_size,
        pipeline_model_parallel_size=args.pp_size,
        devices=args.devices,
        num_nodes=args.num_nodes,
        tokenizer_path=args.tokenizer,
        legacy_ckpt=args.legacy_ckpt,
        inference_only=True,
    )

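    # Convert the model in place to support speculative decoding with the
    # selected algorithm (default: "eagle3").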
    apply_speculative_decoding(model, algorithm=args.specdec_algo)

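    # Save the converted checkpoint; all ranks synchronize before rank 0 dumps
    # the updated model context alongside the weights, so the export directory
    # is usable as a regular NeMo 2.0 checkpoint.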
    trainer.save_checkpoint(args.export_dir)
    barrier()
    if is_global_rank_zero():
        TrainerContext.from_trainer(trainer).io_dump(ckpt_to_context_subdir(args.export_dir), yaml_attrs=["model"])