{
  "train_micro_batch_size_per_gpu": 2,
  "gradient_accumulation_steps": 1,
  "steps_per_print": 100,
  "gradient_clipping": 1.0,
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 2000,
    "hysteresis": 2,
    "min_loss_scale": 0.0
  },
  "zero_optimization": {
    "stage": 2,
    "reduce_bucket_size": 50000000,
    "overlap_comm": true
  },
  "sparse_attention": {
    "mode": "fixed",
    "block": 16,
    "different_layout_per_head": true,
    "num_local_blocks": 8,
    "num_global_blocks": 1,
    "attention": "unidirectional",
    "horizontal_global_attention": false,
    "num_different_global_patterns": 8
  }
}