import os
from dataclasses import dataclass
from typing import TYPE_CHECKING, Tuple

import numpy as np

from nemo.collections.nlp.data.language_modeling.megatron.gpt_sft_dataset import GPTSFTDataset
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
from nemo.core.config import hydra_runner
from nemo.utils import logging
from nemo.utils.sequence_packing_utils import create_hist, create_packing_strategy, fill_packing_strategy

if TYPE_CHECKING:
    from omegaconf import DictConfig

"""
Script to prepare a packed dataset from an SFT/PEFT dataset in the jsonl format.
Two main steps are run in this script:
1. The online processing code in GPTSFTDataset is run (including prompt template manipulation,
sequence length truncation, tokenization, etc.), and the result is an array of tokenized sequences
(represented by token indices).
2. The sequences are grouped by length, and a packing algorithm is run. (https://en.wikipedia.org/wiki/Bin_packing_problem#Offline_algorithms)
Currently, two variants of "first fit" are supported.
"first_fit_decreasing" sorts the sequences in decreasing order before applying first fit.
It generates a more optimal packing, but it tends to keep all short sequences together, which may affect convergence.
"first_fit_shuffle" runs first fit in a random order. Packing is less optimal, but it keeps the dataset order random.
The recommendation is to run "first_fit_shuffle" and check the packed sequence lengths in the printout.
If they are similar to the target length (i.e. packing is efficient), then use shuffle. Otherwise, try first_fit_decreasing.
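
For intuition, a minimal sketch of the first-fit heuristic is shown below. This is illustrative only; the
actual implementation used by this script lives in nemo.utils.sequence_packing_utils:

    def first_fit(seq_lens, pack_size):
        # each bin is a list of sequence lengths whose sum does not exceed pack_size
        bins = []
        for length in seq_lens:
            for bin_ in bins:
                if sum(bin_) + length <= pack_size:
                    bin_.append(length)
                    break
            else:  # no existing bin can hold this sequence, so open a new one
                bins.append([length])
        return bins

    # "first_fit_decreasing" sorts seq_lens in descending order before calling first_fit;
    # "first_fit_shuffle" shuffles seq_lens before calling first_fit.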

Example usage:

python scripts/nlp_language_modeling/prepare_packed_ft_dataset.py \
   model.data.train_ds.file_names=[/path/to/training.jsonl] \
   model.data.train_ds.max_seq_length=2048 \
   +tokenizer_path=<see note 1 below> \
   +output_dir=/path/to/output_folder \
   +pack_sizes=[2048,4096,8192]

When using context parallelism (CP) with a packed dataset, the CP size needs to be set in the command:

python scripts/nlp_language_modeling/prepare_packed_ft_dataset.py \
   model.data.train_ds.file_names=[/path/to/training.jsonl] \
   model.data.train_ds.max_seq_length=4096 \
   ++model.context_parallel_size=2 \
   +tokenizer_path=<see note 1 below> \
   +output_dir=/path/to/output_folder \
   +pack_sizes=[4096]

Note:
  - Tokenizer path supports SentencePiece tokenizer and HF tokenizer.
    For SentencePiece tokenizer, specify the file /path/to/tokenizer.model
    For HF tokenizer, specify a folder /path/to/hf_folder which contains tokenizer.json, tokenizer_config.json
    and special_tokens_map.json

  - If your model or dataset requires non-default configs for conventional SFT/PEFT training in NeMo, you will
    need to pass in the same configs to ``model.data.train_ds`` as you would for training with an unpacked dataset.

  - ``model.data.train_ds.max_seq_length`` is the length to which each sequence is truncated before multiple sequences
    are packed to the size of the packed sequence (``pack_size``). ``max_seq_length`` should be set to the same value
    as for unpacked data, and can be determined by examining the distribution of sequence lengths in the dataset.

  - ``model.context_parallel_size`` is the CP size the model uses in SFT. The default value is 1 (no context parallelism)
    if not specified. This argument is necessary to make each individual sequence length in a packed sequence a multiple
    of CP*2 when CP is enabled in SFT.

  - ``pack_sizes`` is a list of packed sequence lengths. In this example, there will be three output files, one for
    each pack size. The output files are named ``<output_folder>/packed_{pack_size}_seed{seed}.npy``.
    This argument is a list because you will likely want to experiment with a few ``pack_sizes`` to find out which length
    can fill the GPU memory without exceeding it. Adjusting ``pack_size`` is analogous to adjusting the micro batch size
    in the unpacked case.
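
To inspect an output file after the script finishes (illustrative; each element is expected to be a dict of
per-sample arrays such as input_ids, but the exact keys may vary across NeMo versions):

    import numpy as np

    packed = np.load("/path/to/output_folder/packed_4096_seed0.npy", allow_pickle=True)
    print(f"{len(packed)} packed sequences")
    print(packed[0].keys())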
"""


def tokenize_dataset(cfg: 'DictConfig'):
    """
    Tokenizes a dataset using the same configuration file as finetuning with GPTSFTDataset.

    This function reads a dataset and tokenizes it with a SentencePiece or Hugging Face tokenizer, based on the
    provided configuration.

    Args:
        cfg: A Hydra configuration object containing parameters for tokenization.

    Returns:
        A tuple of (a NumPy array containing the tokenized samples from the dataset, the tokenizer that was used).
    """
    logging.info("Tokenizing dataset...")

    data_cfg = cfg.model.data.train_ds
    pad_seq_length_to_mult = 16
    cp_size = cfg.model.get("context_parallel_size", 1)

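    # CP requires each individual sequence length in a packed sequence to be a multiple of (cp_size * 2),
    # so raise the padding multiple accordingly when CP is enabled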
    if cp_size > 1:
        pad_seq_length_to_mult = max(pad_seq_length_to_mult, cp_size * 2)

    if os.path.isdir(cfg.tokenizer_path):
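        # a directory is treated as a Hugging Face tokenizer folder
        # (containing tokenizer.json, tokenizer_config.json and special_tokens_map.json)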
        tokenizer = get_nmt_tokenizer(library="huggingface", model_name=cfg.tokenizer_path, use_fast=True)
    else:
        tokenizer = get_nmt_tokenizer(library="sentencepiece", tokenizer_model=cfg.tokenizer_path)

    dataset = GPTSFTDataset(
        file_path=data_cfg.file_names[0],
        tokenizer=tokenizer,
        max_seq_length=data_cfg.max_seq_length,
        min_seq_length=data_cfg.min_seq_length,
        pad_seq_length_to_mult=pad_seq_length_to_mult,
        add_bos=data_cfg.get('add_bos', False),
        add_eos=data_cfg.get('add_eos', True),
        add_sep=data_cfg.get('add_sep', False),
        sep_id=cfg.get('sep_id', 49704),
        max_num_samples=None,
        seed=data_cfg.get('seed', 1234),
        label_key=data_cfg.get('label_key', 'answer'),
        answer_only_loss=cfg.get('answer_only_loss', True),
        truncation_field=data_cfg.get('truncation_field', 'text'),
        pad_to_max_length=data_cfg.get('pad_to_max_length', False),
        index_mapping_dir=data_cfg.get('index_mapping_dir', None),
        prompt_template=data_cfg.get('prompt_template', None),
        virtual_tokens=0,
        tokens_to_generate=data_cfg.get('tokens_to_generate', 0),
        memmap_workers=data_cfg.get('memmap_workers', None),
        hf_dataset=data_cfg.get('hf_dataset', False),
        truncation_method=data_cfg.get('truncation_method', 'right'),
        special_tokens=data_cfg.get('chat_prompt_tokens', None),
        is_test=True,
    )

    max_seq_length = dataset.max_seq_length
    pad_id = dataset.tokenizer.eos_id
    tokenizer = dataset.tokenizer
    pad_seq_length_to_mult = dataset.pad_seq_length_to_mult
    dataset = np.array([dataset[i] for i in range(len(dataset))])
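    # with CP, pre-pad every sample so its length is a multiple of (cp_size * 2) before packing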
    if cp_size > 1:

        def pre_pad_dataset(data, max_seq_length, max_length_to_pad, pad_id):
            '''
            Pad each individual data point to the length of max_length_to_pad.
            '''
            assert max_seq_length >= max_length_to_pad
            for key, val in data.items():
                if key in {'input_ids', 'context_ids'}:
                    if len(val) <= max_length_to_pad:
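                        # pad one extra token: inputs and labels are later derived from input_ids with a
                        # one-token offset, so both remain a multiple of (cp_size * 2) after the shift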
                        val = val + [pad_id] * (max_length_to_pad - len(val) + 1)
                        data[key] = val
                    elif len(val) > max_seq_length:
                        logging.info(
                            f"""The current sequence length {len(val)} for packing is
                            larger than the max_seq_length specified ({max_seq_length}).
                            The current sequence length is truncated to the size of max_seq_length.
                            Please consider increasing the sequence packing size."""
                        )
                        data[key] = val[:max_seq_length]
            return

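        # round each sample's length up to the next multiple of pad_seq_length_to_mult, capped at max_seq_length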
        ceil_to_nearest = lambda n, m: (n + m - 1) // m * m
        for data in dataset:
            max_length_to_pad = min(max_seq_length, ceil_to_nearest(len(data['input_ids']), pad_seq_length_to_mult))
            pre_pad_dataset(data, max_seq_length, max_length_to_pad, pad_id)
    return dataset, tokenizer


@dataclass
class PackingArgs:
    output_dir: str = "output"
    pack_sizes: Tuple[int, ...] = (2048,)
    packing_algorithm: str = "first_fit_shuffle"
    seed: int = 0

    def from_config(self, cfg: 'DictConfig'):
        for required_arg in ('output_dir', 'pack_sizes'):
            assert cfg.get(required_arg, None), f"Please specify +{required_arg}=..."
        self.output_dir = cfg.output_dir
        self.pack_sizes = cfg.pack_sizes
        self.packing_algorithm = cfg.get("packing_algorithm", "first_fit_shuffle")
        self.seed = cfg.get("seed", 0)
        return self


@hydra_runner(
    config_path="../../examples/nlp/language_modeling/tuning/conf", config_name="megatron_gpt_finetuning_config"
)
def main(cfg: 'DictConfig') -> None:
    args = PackingArgs().from_config(cfg)
    dataset, tokenizer = tokenize_dataset(cfg)
    sequences, histogram = create_hist(dataset, cfg.model.data.train_ds.max_seq_length)
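    # for each pack size, compute bin assignments from the length histogram,
    # then fill the bins with the actual tokenized sequences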
    for pack_size in args.pack_sizes:
        assignments, _ = create_packing_strategy(histogram, pack_size, args.packing_algorithm)
        output_data = fill_packing_strategy(assignments, sequences, pack_size, tokenizer.eos_id)

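        # save the packed sequences for this pack size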
        os.makedirs(args.output_dir, exist_ok=True)
        output_path = os.path.join(args.output_dir, f'packed_{pack_size}_seed{args.seed}.npy')
        np.save(output_path, output_data)
        logging.info(f"Done, output written to {output_path}")

    logging.info(
        f"""
✅ Packed datasets with pack sizes {args.pack_sizes} are prepared successfully.
To train with packed sequences, you need to make changes to the SFT/PEFT config file. See NeMo Documentation
for more details: <https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/features/throughput_optimizations.html#sequence-packing-for-sft-peft>
"""
    )


if __name__ == '__main__':
    main()