---
# Knowledge-distillation training configuration.
# Student: sanchit-gandhi/Mistral-1.5B-Instruct-v0.2
# Teacher: mistralai/Mistral-7B-Instruct-v0.2
# Keys are kept in alphabetical order for diffability.

adam_beta1: 0.9
adam_beta2: 0.999
global_batch_size: 64
gradient_accumulation_steps: 1
learning_rate: 0.0001
# NOTE(review): threshold on token log-probabilities used for filtering —
# presumably pseudo-label filtering; confirm against the training script.
logprob_threshold: -1.5
# Plain string replaces the original
# `!!python/object/apply:transformers.trainer_utils.SchedulerType [linear]` tag.
# That tag requires an unsafe YAML loader (arbitrary-code-execution risk) and
# breaks yaml.safe_load. SchedulerType is a str-backed enum, so
# SchedulerType("linear") reconstructs identically from the bare string.
lr_scheduler_type: linear
max_label_length: 4096
max_steps: 200000
mixed_precision: bf16
model_name_or_path: sanchit-gandhi/Mistral-1.5B-Instruct-v0.2
num_train_epochs: 3.0
per_device_train_batch_size: 8
teacher_name_or_path: mistralai/Mistral-7B-Instruct-v0.2
# Softmax temperature for distillation (softens teacher logits).
temperature: 2.0
warmup_steps: 500
weight_decay: 0.0