adam_beta1: 0.9 | |
adam_beta2: 0.999 | |
global_batch_size: 32 | |
gradient_accumulation_steps: 2 | |
learning_rate: 0.0001 | |
logprob_threshold: -1.5 | |
lr_scheduler_type: linear | |
max_label_length: 4096 | |
max_steps: 200000 | |
mixed_precision: bf16 | |
model_name_or_path: sanchit-gandhi/Mistral-3B-Instruct-v0.2 | |
num_train_epochs: 3.0 | |
per_device_train_batch_size: 4 | |
teacher_name_or_path: mistralai/Mistral-7B-Instruct-v0.2 | |
temperature: 2.0 | |
warmup_steps: 500 | |
weight_decay: 0.0 | |