---
# Training args
# Base checkpoint, numeric precision, and memory-related switches. The exact
# effect of each key is defined by the training script that loads this file,
# which is not shown here — confirm there before changing a value.
model_name_or_path: openchat/openchat-3.5-0106
torch_dtype: bfloat16
# Adapter and quantization options; quantization_inference is explicitly null.
use_lora: true
quantization: 4
quantization_inference: null
# Remaining boolean switches and the generation config file name.
gradient_checkpointing: true
use_flash_attention: true
force_auto_device_map: false
generation_config: generation_config.json
# Strings listed as stop words.
# NOTE(review): presumably used to truncate generated text — confirm.
stop_words:
- '<|end_of_turn|>'
- 'GPT4 Correct User:'
- 'GPT4 Correct Assistant:'
- '</s>'
- '<s>'
# Double quotes are kept for the next entry because it relies on an escape:
# "\\n" is a backslash followed by the letter n, not a newline character.
# NOTE(review): confirm the literal two-character form is what is intended.
- "\\n"
# Dataset arguments
# Each *_datasets key holds a one-element list whose entry matches the key's
# prefix (train / validation / test).
train_datasets:
- train
validation_datasets:
- validation
test_datasets:
- test
# prompt_loss_weight is 0.0; both length limits are set to the same value.
# NOTE(review): units of the two limits are presumably tokens — confirm.
prompt_loss_weight: 0.0
max_seq_length: 8192
generation_max_length: 8192
# Checkpoint settings
# Output location and save policy.
output_dir: results/finetune/openchat-3.5-0106_Lora
overwrite_output_dir: true
save_strategy: epoch
save_only_model: true
save_total_limit: 1
# NOTE(review): load_best_model_at_end is false, so the metric settings below
# presumably only matter if something else ranks checkpoints — confirm they
# are still needed.
load_best_model_at_end: false
metric_for_best_model: eval_validation_predictions_validation/rouge
greater_is_better: true
# Evaluation
# All three phase switches (train / eval / predict) are on.
do_train: true
do_eval: true
do_predict: true
# Uses the same value ("epoch") as save_strategy elsewhere in this file.
evaluation_strategy: epoch
evaluate_all_checkpoints: true
predict_with_generate: true
# Batch size: 8 per-device train batch * 8 gradient-accumulation steps = 64
# per device. The device count is not set in this file.
# NOTE(review): presumably the global batch size is 64 * number of devices —
# confirm against the launch command.
per_device_train_batch_size: 8
gradient_accumulation_steps: 8
per_device_eval_batch_size: 4
generation_num_beams: 1
# Optimizer settings
# Optimizer choice, learning-rate schedule, and Adam hyperparameters.
optim: adamw_torch_fused
learning_rate: 0.0003
weight_decay: 0.001
num_train_epochs: 3
lr_scheduler_type: cosine
warmup_ratio: 0.1
adam_beta1: 0.9
adam_beta2: 0.95
# Written with a decimal point on purpose: YAML 1.1 loaders such as PyYAML
# resolve a bare `1e-12` as the string "1e-12" rather than a float, while
# `1.0e-12` is a float under both YAML 1.1 and YAML 1.2.
adam_epsilon: 1.0e-12
# LoRA settings
# Single-entry list; how `all` is expanded is decided by the consuming script
# (not shown here).
lora_target_modules:
- all
# lora_alpha is twice lora_r (256 / 128 = 2).
lora_r: 128
lora_alpha: 256
lora_dropout: 0.05
# Reporting
report_to: wandb
# Same text as the last path component of output_dir.
run_name: openchat-3.5-0106_Lora
# Logging cadence: strategy `steps` with logging_steps 5, first step included.
logging_strategy: steps
logging_steps: 5
logging_first_step: true
disable_tqdm: false
# Hub and resume settings
# Both switches are off.
resume_from_checkpoint: false
push_to_hub: false
# Performance
# bf16 on / fp16 off agrees with `torch_dtype: bfloat16` set in this file.
fp16: false
bf16: true
torch_compile: false
ddp_find_unused_parameters: false
|