rugpt3xl / deepspeed_config.json
Anton Emelyanov
add model files
1c6d739
{
"train_micro_batch_size_per_gpu": 2,
"gradient_accumulation_steps": 1,
"steps_per_print": 100,
"gradient_clipping": 1.0,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 2000,
"hysteresis": 2,
"min_loss_scale": 0.0
},
"zero_optimization": {
"stage": 2,
"reduce_bucket_size": 50000000,
"overlap_comm": true
},
"sparse_attention": {
"mode": "fixed",
"block": 16,
"different_layout_per_head": true,
"num_local_blocks": 8,
"num_global_blocks": 1,
"attention": "unidirectional",
"horizontal_global_attention": false,
"num_different_global_patterns": 8
}
}