# Training args
model_name_or_path: openchat/openchat-3.5-0106
torch_dtype: bfloat16
use_lora: true
quantization: 4
quantization_inference: null
gradient_checkpointing: true
force_auto_device_map: false
use_flash_attention: true
generation_config: generation_config.json
stop_words:
  - "<|end_of_turn|>"
  - "GPT4 Correct User:"
  - "GPT4 Correct Assistant:"
  - ""
  - ""
  - "\\n"

# dataset arguments
train_datasets:
  - train
validation_datasets:
  - validation
test_datasets:
  - test
max_seq_length: 8192
generation_max_length: 8192
prompt_loss_weight: 0.0

# checkpoint settings
output_dir: results/finetune/openchat-3.5-0106_Lora
overwrite_output_dir: true
load_best_model_at_end: false
metric_for_best_model: eval_validation_predictions_validation/rouge
greater_is_better: true
save_strategy: "epoch"
save_only_model: true
save_total_limit: 1

# evaluation
do_train: true
do_eval: true
do_predict: true
evaluation_strategy: "epoch"
predict_with_generate: true
evaluate_all_checkpoints: true

# effective batch size: 8 per-device batch * 8 gradient accumulation steps = 64 per GPU
per_device_train_batch_size: 8
per_device_eval_batch_size: 4
gradient_accumulation_steps: 8
generation_num_beams: 1

# optimizer settings
optim: adamw_torch_fused
learning_rate: 0.0003
weight_decay: 0.001
num_train_epochs: 3
lr_scheduler_type: cosine
warmup_ratio: 0.1
adam_beta1: 0.9
adam_beta2: 0.95
adam_epsilon: 1e-12

# lora settings
lora_r: 128
lora_alpha: 256
lora_dropout: 0.05
lora_target_modules:
  - all

# reporting
logging_strategy: steps
logging_first_step: true
logging_steps: 5
report_to: wandb
run_name: "openchat-3.5-0106_Lora"
disable_tqdm: false

# hub settings
push_to_hub: false
resume_from_checkpoint: false

# performance
bf16: true
fp16: false
torch_compile: false
ddp_find_unused_parameters: false
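For reference, the snippet below is a minimal sketch of how the quantization and LoRA settings above typically translate into transformers/peft objects. It is not part of the config and the actual training script may build these objects differently; in particular, the nf4 quantization type and the mapping of "all" to all linear layers are assumptions.

# Sketch only: illustrates the quantization/LoRA values from the YAML above.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

bnb_config = BitsAndBytesConfig(              # quantization: 4 (4-bit loading)
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,    # torch_dtype: bfloat16
    bnb_4bit_quant_type="nf4",                # assumption: nf4 is the usual default
)

model = AutoModelForCausalLM.from_pretrained(
    "openchat/openchat-3.5-0106",             # model_name_or_path
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",  # use_flash_attention: true
)

lora_config = LoraConfig(                     # lora settings from the YAML
    r=128,
    lora_alpha=256,
    lora_dropout=0.05,
    target_modules="all-linear",              # assumption: "all" targets every linear layer
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()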