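# NOTE(review): the next three items look like page-header residue captured
# when this file was copied from a web view; they are not part of the module:
#   "laurencer's picture" / "Step 6000" / "261dbc8 verified"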
from dataclasses import dataclass, field, fields
from typing import List, Optional
from torchtune.datasets import ALL_DATASETS
from torchtune.models import ALL_MODELS, ALL_TOKENIZERS
from torchtune.utils.metric_logging import ALL_METRIC_LOGGERS
from torchtune.utils.precision import PRECISION_STR_TO_DTYPE
@dataclass
class ColoringFinetuneParams:
    """Arguments for the coloring fine-tuning recipe.

    This class only stores and validates values; how each value is consumed is
    decided by the recipe that receives it. Descriptions marked "presumably"
    are inferred from the field name and should be confirmed against the
    recipe.

    Args:
        model_checkpoint (str): Local path to load model checkpoint from.
            Required: the default ``""`` is rejected.
        color_layer_initialization (str): Presumably selects how the color
            layer is initialized. Default ``"default"``.
        norm_before_color_layer (bool): Presumably whether a normalization is
            applied before the color layer. Default ``False``.
        tokenizer_checkpoint (str): Local path to load tokenizer checkpoint
            from. Required: the default ``""`` is rejected.
        hf_repo_id (Optional[str]): Presumably a Hugging Face Hub repository
            id used by the recipe. Default ``None``.
        checkpoint_every_n_steps (Optional[int]): Presumably the step interval
            between checkpoints. Default ``None``.
        dataset (str): String specifying dataset to use. See
            ``torchtune.datasets.get_dataset`` for options. Currently, only
            predefined datasets in library are supported. Required: the
            default ``""`` is rejected.
        train_on_input (bool): Presumably whether prompt/input tokens
            contribute to the loss. Default ``True``.
        shuffle (bool): Whether to shuffle dataset. Default ``True``.
        batch_size (int): Batch size to use for training. Default ``2``.
        optimizer (str): String specifying optimizer to use. See
            ``torchtune.optim.get_optimizer`` for options. Default ``"SGD"``.
        lr (float): Learning rate to use for optimizer. Default ``2e-5``.
        loss (str): String specifying loss function to use. See
            ``torchtune.losses.get_loss`` for options. Default
            ``"CrossEntropyLoss"``.
        gradient_accumulation_steps (int): Presumably the number of batches
            accumulated before each optimizer step. Default ``1``.
        compile (bool): Presumably whether the model is compiled before
            training. Default ``False``.
        epochs (int): Number of epochs to train for. Default ``3``.
        max_steps_per_epoch (Optional[int]): Maximum number of steps to take
            per epoch. Default ``None``.
        resume_from_checkpoint (bool): Whether to resume fine-tuning from a
            previous checkpoint. Default ``False``.
        run_generation (Optional[int]): Run eval on a prompt every
            ``run_generation`` steps. Set to 0 to disable. Default ``None``
            (presumably also disabled).
        cpu_offload (bool): Whether to offload model to CPU. Only accepted
            when ``device`` is ``"cuda"``. Default ``False``.
        enable_fsdp (bool): Whether to use FSDP. Rejected when ``device`` is
            ``"cpu"``, so CPU runs must pass ``enable_fsdp=False``. Default
            ``True``.
        enable_activation_checkpointing (bool): Whether to use activation
            checkpointing. Default ``True``.
        device (str): Device to use for training. Options are "cpu" and
            "cuda". Default ``"cuda"``.
        dtype (str): Data type to use for training. Must be a key of
            ``PRECISION_STR_TO_DTYPE``. Default ``"fp16"``.
        seed (Optional[int]): Random seed to use for training. Default
            ``None``.
        output_dir (str): Local path to save checkpoints and logs to. Default
            ``"/tmp/full_finetune_output"``.
        metric_logger_type (str): String specifying metric logger to use. Must
            be a member of ``ALL_METRIC_LOGGERS``. Default ``"disk"``.
        project (Optional[str]): Project name to use for logging. Used by
            ``WandBLogger``. Default ``None``.
        log_every_n_steps (Optional[int]): Presumably the step interval
            between metric log entries. Default ``None``.

    Raises:
        TypeError: If any field is equal to the empty string ``""`` (this is
            how the required fields are enforced).
        ValueError: If ``cpu_offload`` is ``True`` but ``device`` is not
            ``"cuda"``; if ``enable_fsdp`` is ``True`` and ``device`` is
            ``"cpu"``; if ``metric_logger_type`` is not in
            ``ALL_METRIC_LOGGERS``; or if ``dtype`` is not in
            ``PRECISION_STR_TO_DTYPE``.
    """

    # NOTE: field order is part of the public interface (it defines the
    # positional order of the generated ``__init__``); do not reorder.

    # Model
    model_checkpoint: str = ""
    color_layer_initialization: str = "default"
    norm_before_color_layer: bool = False

    # Tokenizer
    tokenizer_checkpoint: str = ""
    hf_repo_id: Optional[str] = None
    checkpoint_every_n_steps: Optional[int] = None

    # Dataset and Sampler
    dataset: str = ""
    train_on_input: bool = True
    shuffle: bool = True
    batch_size: int = 2

    # Optimizer and Scheduler
    optimizer: str = "SGD"
    lr: float = 2e-5
    loss: str = "CrossEntropyLoss"
    gradient_accumulation_steps: int = 1

    # Training
    compile: bool = False
    epochs: int = 3
    max_steps_per_epoch: Optional[int] = None
    resume_from_checkpoint: bool = False
    run_generation: Optional[int] = None

    # Distributed
    cpu_offload: bool = False
    enable_fsdp: bool = True
    enable_activation_checkpointing: bool = True

    # Environment
    device: str = "cuda"
    dtype: str = "fp16"
    seed: Optional[int] = None

    # Logging
    output_dir: str = "/tmp/full_finetune_output"
    metric_logger_type: str = "disk"
    project: Optional[str] = None
    log_every_n_steps: Optional[int] = None

    def __post_init__(self):
        """Validate the populated fields; see the class ``Raises`` section."""
        # An empty string is the sentinel for "not provided": every field
        # whose value is still "" (in declaration order) is reported as missing.
        for param in fields(self):
            if getattr(self, param.name) == "":
                raise TypeError(f"{param.name} needs to be specified")

        # Only the device string is checked here; the GPU count mentioned in
        # the message is not inspected by this class.
        if self.cpu_offload and self.device != "cuda":
            raise ValueError(
                "Cannot offload model to CPU if device is not cuda or <= 1 GPUs."
            )
        # enable_fsdp defaults to True, so device="cpu" requires disabling it.
        if self.enable_fsdp and self.device == "cpu":
            raise ValueError("FSDP is not supported on CPU.")
        if self.metric_logger_type not in ALL_METRIC_LOGGERS:
            raise ValueError(
                f"Metric logger not recognized. Expected one of {ALL_METRIC_LOGGERS}, received {self.metric_logger_type}."
            )
        if self.dtype not in PRECISION_STR_TO_DTYPE:
            raise ValueError(
                f"Dtype {self.dtype} must be one of {', '.join(PRECISION_STR_TO_DTYPE.keys())} for finetuning."
            )