{
  "trainer": {
    "evaluation_strategy": "steps",
    "per_device_train_batch_size": 2,
    "per_device_eval_batch_size": 2,
    "gradient_accumulation_steps": 16,
    "eval_steps": 150,
    "save_steps": 150,
    "logging_steps": 5,
    "learning_rate": 0.003,
    "num_train_epochs": 5,
    "lr_scheduler_type": "cosine",
    "warmup_steps": 100,
    "fp16": false,
    "bf16": true,
    "gradient_checkpointing": false,
    "torch_compile": false,
    "optim": "adamw_torch",
    "half_precision_backend": "auto",
    "fp16_opt_level": "O2"
  },
  "deepspeed": {
    "bf16": {
      "enabled": true
    },
    "optimizer": {
      "type": "AdamW",
      "params": {
        "lr": "auto",
        "betas": "auto",
        "eps": "auto",
        "weight_decay": "auto"
      }
    },
    "zero_optimization": {
      "stage": 2,
      "offload_optimizer": {
        "device": "cpu",
        "pin_memory": true
      },
      "overlap_comm": true,
      "round_robin_gradients": true
    },
    "train_batch_size": "auto",
    "gradient_accumulation_steps": "auto"
  },
  "model_name": "ai-forever/FRED-T5-1.7B",
  "templates_path": "ru_alpaca_seq2seq_template.json",
  "model_type": "seq2seq",
  "max_source_tokens_count": 512,
  "max_target_tokens_count": 512
}
|