cache_dir: ./cache
ddp_find_unused_parameters: false
ddp_timeout: 30000
device_map: auto
do_eval: true
do_train: true
eval_steps: 1000
evaluation_strategy: steps
fp16: true
gradient_accumulation_steps: 1
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: hllj/sft-mistral-v3-all
hub_strategy: every_save
learning_rate: 5.0e-05
log_level: info
logging_first_step: true
logging_steps: 10
logging_strategy: steps
lora_alpha: 128
lora_dropout: 0.05
lora_r: 256
lora_target_modules: all
lr_scheduler_type: cosine
max_seq_length: 1024
model_name_or_path: hllj/mistral-vi-math
model_type: auto
num_train_epochs: 2
output_dir: outputs-sft-mistral-v3-all
overwrite_output_dir: true
per_device_eval_batch_size: 8
per_device_train_batch_size: 8
preprocessing_num_workers: 4
push_to_hub: true
report_to: wandb
run_name: sft-mistral-v3-all
save_steps: 1000
save_strategy: steps
save_total_limit: 13
seed: 42
token: <your_hf_token>  # do not commit a real Hugging Face token
torch_dtype: float16
train_file_dir: datasets/finetune
use_peft: true
warmup_ratio: 0.05
weight_decay: 0.05
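For reference, below is a minimal sketch of how these values would typically be wired into a Hugging Face Transformers + PEFT training script. It is an assumption about the surrounding training code, not the repository's actual script: the explicit Mistral projection-module list stands in for `lora_target_modules: all`, and dataset loading plus the trainer call are omitted.

```python
# Sketch: map the YAML values onto PEFT + Transformers objects (assumed setup).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from peft import LoraConfig, get_peft_model

model_name = "hllj/mistral-vi-math"          # model_name_or_path
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="./cache")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,               # torch_dtype: float16
    device_map="auto",                       # device_map: auto
    cache_dir="./cache",
)

# lora_target_modules: all -- assumed here to mean every linear projection in Mistral.
lora_config = LoraConfig(
    r=256,                                   # lora_r
    lora_alpha=128,                          # lora_alpha
    lora_dropout=0.05,                       # lora_dropout
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)   # use_peft: true

training_args = TrainingArguments(
    output_dir="outputs-sft-mistral-v3-all",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    learning_rate=5.0e-05,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    weight_decay=0.05,
    num_train_epochs=2,
    fp16=True,
    logging_strategy="steps",
    logging_steps=10,
    logging_first_step=True,
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=13,
    seed=42,
    report_to="wandb",
    run_name="sft-mistral-v3-all",
    push_to_hub=True,
    hub_model_id="hllj/sft-mistral-v3-all",
    hub_strategy="every_save",
    ddp_find_unused_parameters=False,
    ddp_timeout=30000,
)
# Tokenizing the files under train_file_dir (datasets/finetune) to max_seq_length
# and passing everything to a Trainer / SFT trainer would follow; omitted here.
```

The sketch only illustrates how the keys map to library arguments; the actual script in the repository may group or name them differently.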