cache_dir: ./cache
ddp_find_unused_parameters: false
ddp_timeout: 30000
device_map: auto
do_eval: true
do_train: true
eval_steps: 1000
evaluation_strategy: steps
fp16: true
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: hllj/sft-mistral-v1-clean-valid
hub_strategy: every_save
learning_rate: 3.0e-05
log_level: info
logging_first_step: true
logging_steps: 10
logging_strategy: steps
lora_alpha: 128
lora_dropout: 0.05
lora_r: 256
lora_target_modules:
- q_proj
- k_proj
- v_proj
- o_proj
lr_scheduler_type: cosine
max_seq_length: 1024
model_name_or_path: hllj/mistral-vi-math
model_type: auto
num_train_epochs: 2
output_dir: outputs-sft-mistral-v1-clean-valid
overwrite_output_dir: true
per_device_eval_batch_size: 4
per_device_train_batch_size: 4
preprocessing_num_workers: 4
push_to_hub: true
report_to: wandb
run_name: sft-mistral-v1-clean-valid
save_steps: 1000
save_strategy: steps
save_total_limit: 13
seed: 42
token: <HF_TOKEN>
torch_dtype: float16
train_file_dir: datasets/finetune
use_peft: true
validation_file_dir: datasets/validation
warmup_ratio: 0.05
weight_decay: 0.05
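
These keys map closely onto standard `peft` and `transformers` objects. The sketch below shows how the main values might be wired up in Python; it is not the repository's actual training script, and the dataset loading (`train_file_dir`, `validation_file_dir`, `max_seq_length`) and the trainer itself (for example a `trl` SFTTrainer) are omitted and left as assumptions.

```python
# Minimal sketch: map the config values above onto peft / transformers objects.
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments

model_name = "hllj/mistral-vi-math"

# Base model loaded in float16 and sharded across available devices,
# mirroring torch_dtype / device_map in the config.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# LoRA adapter matching lora_r / lora_alpha / lora_dropout / lora_target_modules.
peft_config = LoraConfig(
    r=256,
    lora_alpha=128,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)

# Optimisation, logging, evaluation and Hub settings from the config.
training_args = TrainingArguments(
    output_dir="outputs-sft-mistral-v1-clean-valid",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=3.0e-05,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    weight_decay=0.05,
    fp16=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    evaluation_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=13,
    logging_steps=10,
    logging_first_step=True,
    report_to="wandb",
    run_name="sft-mistral-v1-clean-valid",
    push_to_hub=True,
    hub_model_id="hllj/sft-mistral-v1-clean-valid",
    hub_strategy="every_save",
    seed=42,
)
```

With `save_strategy: steps` and `hub_strategy: every_save`, a checkpoint is written and pushed to the Hub every 1000 optimizer steps, with at most 13 checkpoints retained locally.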