#unsloth/Mistral-Small-Instruct-2409
#"hf_hub_url": "teknium/trismegistus-project",
#"hf_hub_url": "AIRRC/Eudaimonic",
#"hf_hub_url": "Gryphe/Sonnet3.5-Charcard-Roleplay",
#"hf_hub_url": "anthracite-org/kalo_misc_part2",
#"hf_hub_url": "anthracite-org/kalo_opus_misc_240827",
#"hf_hub_url": "AtlasUnified/atlas-converse",

# Paths
model = '/workspace/model'
output_dir = '/workspace/out'

# LoRA configuration
# Can use full_fine_tune = true and no quantization to train the whole model instead of a LoRA.
#full_fine_tune = true
lora_rank = 1024
lora_alpha = 256
lora_dropout = 0.05

# Train only specific modules. This is passed to the parameter of the same name in the LoraConfig.
# If not set, adapt all linear modules.
# Note: this ALSO affects full fine tuning. In that case, if this is set, only weights containing one
# of these keys as a substring will have requires_grad. If not set, everything is trained.
#target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']

# Can specify layers to adapt with LoRA if you want.
#layers_to_transform = '16:31'

# For Mixtral, set the load balancing coefficient.
#load_balancing_loss_coef = 0.02

# Optimization configuration
epochs = 2
lr_scheduler = 'cosine'  # can also be 'constant'
warmup_steps = 50

# Might be useful if resuming from a checkpoint and you want to change the LR and force it to something.
#force_constant_lr = 5e-5

# Hard clamp the magnitude of the LoRA weights.
#scale_weight_norms = 1.0

# Dynamic batch size, targeting this many tokens per batch, per device.
# If set, this completely ignores the batch size in the DeepSpeed JSON config file.
# Can be thought of as a replacement for sample packing.
batch_size_tokens = 10000

# Performance settings
pipeline_stages = 2  # number of pipeline parallel stages; must evenly divide the number of GPUs you launch the script with
logging_steps = 10  # how often to log to TensorBoard
eval_steps = 500
save_steps = 500
checkpoint_every_n_minutes = 60
eval_before_first_step = false  # do an eval before any training happens

# dtype to load the underlying model weights in
model_weight_dtype = 'bfloat16'
# dtype for the LoRA weights
lora_weight_dtype = 'bfloat16'
# The saved weights can be a different dtype. You don't need to set this. Could be useful for
# training in float32 but saving in float16.
#save_dtype = 'bfloat16'

# Keep this number of stepXXXX (model saves) and global_stepXXX (checkpoint saves) and delete the rest.
# (This only applies to the current training session; resumed training sessions will not touch old saves.)
keep_states = 5

# Sort examples by length before dividing them into batches.
# This makes all examples in a batch approximately the same length, to minimize padding.
# The batches are still shuffled after that.
# You should probably always have this set to true.
group_by_length = true

# This can also be 'unsloth' to offload hidden states to CPU, saving potentially a lot of VRAM
# for a minor performance hit.
# Example: 4x4090, PCIe 3.0 x16, pipeline_stages=4, training QLoRA on Llama 3 70B with 4096 sequence length.
#   true:      75s step time, 19.7 GB peak per-GPU VRAM usage.
#   'unsloth': 78s step time, 16.2 GB peak per-GPU VRAM usage.
activation_checkpointing = 'unsloth'

# Keep MLP weights in system RAM until they are needed. Can save a ton of VRAM with a
# moderate hit to performance. If using an MoE model, this can also be an integer, in
# which case only that many experts are offloaded (a tradeoff between VRAM and speed).
offload_mlp_to_cpu = true

# Resume a prior run.
# If true, we attempt to resume training from the most recent directory inside output_dir
# (the directory names are timestamps). So, to resume, just run the exact same command but
# set this to true first.
resume_from_checkpoint = false

# Loading the optimizer states seems to cause some kind of unavoidable VRAM memory leak.
# It's very small, only about 0.2 GB in cases I've seen. But if you are very close to the
# limit, it can cause resuming from checkpoint to OOM. As a last resort, you can uncomment
# this to not load the optimizer states and hopefully the resumption won't OOM.
#load_optimizer_states = false

# Dataset configuration

# How to combine multiple datasets if you have more than one.
# Can be 'concatenate' or 'interleave'. Will be 'concatenate' if not set.
dataset_combination_mode = 'concatenate'

# When to stop interleaving datasets when using mode 'interleave'. Either 'first_exhausted' or 'all_exhausted'.
# Default if not set: 'first_exhausted'
#dataset_interleave_stopping_strategy = 'all_exhausted'

# Can set this lower than the training GAS, so we don't drop as many examples when trying to make equal-sized batches.
# Default if not set: same as the training GAS.
eval_gradient_accumulation_steps = 1

# bitsandbytes 4-bit quantization. The parameters here become arguments to the Transformers BitsAndBytesConfig.
#[quantization.bnb]
#load_in_4bit = true
#bnb_4bit_use_double_quant = false
#bnb_4bit_compute_dtype = 'bfloat16'

# HQQ quantization. The parameters here become arguments to CustomHQQConfig.
#[quantization.hqq]
#nbits = 4
#group_size = 64
#compute_dtype = 'bfloat16'

# (Optional) You can override the quant params for certain modules. This does substring matching: if 'gate_proj'
# is a substring of the full module name, anything specified here overwrites the defaults in [quantization.hqq].
#[quantization.hqq.dynamic_config]
#gate_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
#up_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
#down_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}

[optimizer]
# options: adamw_kahan, AdamW, AdamW8bit
type = 'adamw_kahan'
lr = 5e-5
beta1 = 0.9
beta2 = 0.99
weight_decay = 0.1

[[datasets]]
# Arbitrary name, used only for separately logging eval metrics. Will be dataset0, dataset1, etc. if not set.
name = 'acolyte'
dataset_type = 'axolotl'
dataset_path = './acolyte.yml'
sequence_len = 16384
eval_size = 0.01
# Relative sampling weight, when using combination mode 'interleave'. Will be 1 if not set.
sample_weight = 1
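
# Illustrative sketch only (not part of the original config): a hypothetical second dataset
# entry showing how 'interleave' mode and sample_weight fit together. The name and path below
# are made up; only keys already used above are assumed to exist. To actually use something
# like this, you would also set dataset_combination_mode = 'interleave' above and consider
# the interleave stopping strategy.
#[[datasets]]
#name = 'extra_data'
#dataset_type = 'axolotl'
#dataset_path = './extra_data.yml'
#sequence_len = 16384
#eval_size = 0.01
## With sample_weight = 2, this dataset would be sampled twice as often as 'acolyte' when interleaving.
#sample_weight = 2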
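
# Launch sketch (an assumption, not defined in this file): configs in this format are typically
# consumed by a DeepSpeed-launched training script (e.g. qlora-pipe's train.py). Because
# pipeline_stages = 2, the number of GPUs passed to the launcher must be a multiple of 2, e.g.:
#   deepspeed --num_gpus=2 train.py --deepspeed --config config.toml
# The script name and its flags are assumptions about the companion trainer, not part of this config.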