57: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 63: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 39: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 53: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 33: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 13: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 58: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 40: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 60: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 52: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 30: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 57: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 39: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 57: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 39: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 58: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 57: START 2059276: Fri Nov 25 09:39:25 EET 2022 39: START 2059276: Fri Nov 25 09:39:25 EET 2022 58: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 58: START 2059276: Fri Nov 25 09:39:25 EET 2022 53: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 55: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 51: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 1: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 61: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 59: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 35: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 5: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 29: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 37: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 40: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 4: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 56: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 54: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 62: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 36: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 28: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 38: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 0: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 2: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 24: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 49: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 47: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 41: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 45: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 43: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 27: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 25: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 3: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 7: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 17: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 23: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 53: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 40: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 34: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 46: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 44: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 6: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 16: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 10: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 20: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 53: START 2059276: Fri Nov 25 09:39:25 EET 2022 40: START 2059276: Fri Nov 25 09:39:25 EET 2022 42: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 9: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 32: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 63: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 63: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 60: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 60: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 63: START 2059276: Fri Nov 25 09:39:25 EET 2022 60: START 2059276: Fri Nov 25 09:39:25 EET 2022 33: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 33: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 33: START 2059276: Fri Nov 25 09:39:25 EET 2022 59: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 13: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 29: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 52: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 30: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 54: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 36: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 28: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 38: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 48: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 50: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 55: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 59: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 13: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 35: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 29: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 37: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 52: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 56: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 54: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 0: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 2: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 49: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 34: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 18: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 12: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 14: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 55: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 59: START 2059276: Fri Nov 25 09:39:25 EET 2022 13: START 2059276: Fri Nov 25 09:39:25 EET 2022 35: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 29: START 2059276: Fri Nov 25 09:39:25 EET 2022 37: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 52: START 2059276: Fri Nov 25 09:39:25 EET 2022 30: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 56: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 54: START 2059276: Fri Nov 25 09:39:25 EET 2022 36: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 28: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 38: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 49: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 55: START 2059276: Fri Nov 25 09:39:25 EET 2022 35: START 2059276: Fri Nov 25 09:39:25 EET 2022 37: START 2059276: Fri Nov 25 09:39:25 EET 2022 30: START 2059276: Fri Nov 25 09:39:25 EET 2022 56: START 2059276: Fri Nov 25 09:39:25 EET 2022 36: START 2059276: Fri Nov 25 09:39:25 EET 2022 28: START 2059276: Fri Nov 25 09:39:25 EET 2022 38: START 2059276: Fri Nov 25 09:39:25 EET 2022 0: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 2: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 34: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 0: START 2059276: Fri Nov 25 09:39:25 EET 2022 2: START 2059276: Fri Nov 25 09:39:25 EET 2022 34: START 2059276: Fri Nov 25 09:39:25 EET 2022 49: START 2059276: Fri Nov 25 09:39:25 EET 2022 57: 57: 57: ======================= ROCm System Management Interface ======================= 57: ================================= Concise Info ================================= 57: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 57: 0 44.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 57: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 57: 2 42.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 57: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 57: 4 42.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 57: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 57: 6 38.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 57: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 57: ================================================================================ 57: ============================= End of ROCm SMI Log ============================== 39: 39: 39: ======================= ROCm System Management Interface ======================= 39: ================================= Concise Info ================================= 39: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 39: 0 46.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 39: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 39: 2 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 39: 3 38.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 39: 4 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 39: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 39: 6 40.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 39: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 39: ================================================================================ 39: ============================= End of ROCm SMI Log ============================== 1: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 61: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 5: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 58: 58: 58: ======================= ROCm System Management Interface ======================= 58: ================================= Concise Info ================================= 58: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 58: 0 45.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 58: 1 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 58: 2 37.0c 98.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 58: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 58: 4 41.0c 100.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 58: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 58: 6 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 58: 7 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 58: ================================================================================ 58: ============================= End of ROCm SMI Log ============================== 47: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 41: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 45: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 27: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 31: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 46: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 16: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 10: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 51: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 1: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 61: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 5: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 62: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 47: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 45: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 27: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 25: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 19: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 8: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 15: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 22: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 51: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 1: START 2059276: Fri Nov 25 09:39:25 EET 2022 61: START 2059276: Fri Nov 25 09:39:25 EET 2022 5: START 2059276: Fri Nov 25 09:39:25 EET 2022 47: START 2059276: Fri Nov 25 09:39:25 EET 2022 41: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 45: START 2059276: Fri Nov 25 09:39:25 EET 2022 27: START 2059276: Fri Nov 25 09:39:25 EET 2022 46: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 16: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 10: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 51: START 2059276: Fri Nov 25 09:39:25 EET 2022 62: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 41: START 2059276: Fri Nov 25 09:39:25 EET 2022 25: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 46: START 2059276: Fri Nov 25 09:39:25 EET 2022 16: START 2059276: Fri Nov 25 09:39:25 EET 2022 10: START 2059276: Fri Nov 25 09:39:25 EET 2022 62: START 2059276: Fri Nov 25 09:39:25 EET 2022 25: START 2059276: Fri Nov 25 09:39:25 EET 2022 11: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 26: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 3: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 3: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 3: START 2059276: Fri Nov 25 09:39:25 EET 2022 24: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 24: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 24: START 2059276: Fri Nov 25 09:39:25 EET 2022 7: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 7: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 7: START 2059276: Fri Nov 25 09:39:25 EET 2022 17: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 17: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 17: START 2059276: Fri Nov 25 09:39:25 EET 2022 4: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 4: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 4: START 2059276: Fri Nov 25 09:39:25 EET 2022 43: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 43: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 43: START 2059276: Fri Nov 25 09:39:25 EET 2022 21: Model parameters: d_model 4096 ffw_size 16384 kv_size 128 n_heads 32 n_layers 42 53: 53: 53: ======================= ROCm System Management Interface ======================= 53: ================================= Concise Info ================================= 53: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 53: 0 43.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 53: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 53: 2 40.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 53: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 53: 4 42.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 53: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 53: 6 41.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 53: 7 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 53: ================================================================================ 53: ============================= End of ROCm SMI Log ============================== 40: 40: 40: ======================= ROCm System Management Interface ======================= 40: ================================= Concise Info ================================= 40: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 40: 0 47.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 40: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 40: 2 38.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 40: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 40: 4 39.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 40: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 40: 6 37.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 40: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 40: ================================================================================ 40: ============================= End of ROCm SMI Log ============================== 44: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 44: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 23: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 23: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 44: START 2059276: Fri Nov 25 09:39:25 EET 2022 23: START 2059276: Fri Nov 25 09:39:25 EET 2022 20: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 20: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 63: 63: 63: ======================= ROCm System Management Interface ======================= 63: ================================= Concise Info ================================= 63: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 63: 0 48.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 63: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 63: 2 45.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 63: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 63: 4 45.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 63: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 63: 6 38.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 63: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 63: ================================================================================ 63: ============================= End of ROCm SMI Log ============================== 60: 60: 60: ======================= ROCm System Management Interface ======================= 60: ================================= Concise Info ================================= 60: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 60: 0 45.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 60: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 60: 2 40.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 60: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 60: 4 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 60: 5 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 60: 6 38.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 60: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 60: ================================================================================ 60: ============================= End of ROCm SMI Log ============================== 20: START 2059276: Fri Nov 25 09:39:25 EET 2022 6: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 6: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 33: 33: 33: ======================= ROCm System Management Interface ======================= 33: ================================= Concise Info ================================= 33: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 33: 0 46.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 33: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 33: 2 43.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 33: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 33: 4 45.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 33: 5 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 33: 6 42.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 33: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 33: ================================================================================ 33: ============================= End of ROCm SMI Log ============================== 6: START 2059276: Fri Nov 25 09:39:25 EET 2022 42: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 42: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 52: 52: 52: ======================= ROCm System Management Interface ======================= 52: ================================= Concise Info ================================= 52: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 52: 0 44.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 52: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 52: 2 40.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 52: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 52: 4 42.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 52: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 52: 6 43.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 52: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 52: ================================================================================ 52: ============================= End of ROCm SMI Log ============================== 30: 30: 30: ======================= ROCm System Management Interface ======================= 30: ================================= Concise Info ================================= 30: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 30: 0 41.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 30: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 30: 2 43.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 30: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 30: 4 40.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 30: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 30: 6 38.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 30: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 30: ================================================================================ 30: ============================= End of ROCm SMI Log ============================== 42: START 2059276: Fri Nov 25 09:39:25 EET 2022 9: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 9: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 9: START 2059276: Fri Nov 25 09:39:25 EET 2022 32: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 32: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 54: 54: 54: ======================= ROCm System Management Interface ======================= 54: ================================= Concise Info ================================= 54: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 54: 0 39.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 54: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 54: 2 38.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 54: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 54: 4 37.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 54: 5 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 54: 6 37.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 54: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 54: ================================================================================ 54: ============================= End of ROCm SMI Log ============================== 32: START 2059276: Fri Nov 25 09:39:25 EET 2022 48: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 48: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 13: 13: 13: ======================= ROCm System Management Interface ======================= 13: ================================= Concise Info ================================= 13: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 13: 0 50.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 13: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 13: 2 43.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 13: 3 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 13: 4 44.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 13: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 13: 6 38.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 13: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 13: ================================================================================ 13: ============================= End of ROCm SMI Log ============================== 48: START 2059276: Fri Nov 25 09:39:25 EET 2022 59: 59: 59: ======================= ROCm System Management Interface ======================= 59: ================================= Concise Info ================================= 59: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 59: 0 44.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 59: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 59: 2 43.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 59: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 59: 4 41.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 59: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 59: 6 41.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 59: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 59: ================================================================================ 59: ============================= End of ROCm SMI Log ============================== 50: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 50: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 50: START 2059276: Fri Nov 25 09:39:25 EET 2022 29: 29: 29: ======================= ROCm System Management Interface ======================= 29: ================================= Concise Info ================================= 29: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 29: 0 46.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 29: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 29: 2 42.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 29: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 29: 4 46.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 29: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 29: 6 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 29: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 29: ================================================================================ 29: ============================= End of ROCm SMI Log ============================== 12: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 12: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 14: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 14: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 12: START 2059276: Fri Nov 25 09:39:25 EET 2022 28: 28: 28: ======================= ROCm System Management Interface ======================= 28: ================================= Concise Info ================================= 28: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 28: 0 44.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 28: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 28: 2 42.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 28: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 28: 4 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 28: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 28: 6 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 28: 7 38.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 28: ================================================================================ 28: ============================= End of ROCm SMI Log ============================== 18: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 14: START 2059276: Fri Nov 25 09:39:25 EET 2022 18: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 36: 36: 36: ======================= ROCm System Management Interface ======================= 36: ================================= Concise Info ================================= 36: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 36: 0 40.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 36: 1 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 36: 2 38.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 36: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 36: 4 40.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 36: 5 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 36: 6 39.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 36: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 36: ================================================================================ 36: ============================= End of ROCm SMI Log ============================== 18: START 2059276: Fri Nov 25 09:39:25 EET 2022 31: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 31: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 38: 38: 38: ======================= ROCm System Management Interface ======================= 38: ================================= Concise Info ================================= 38: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 38: 0 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 38: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 38: 2 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 38: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 38: 4 40.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 38: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 38: 6 43.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 38: 7 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 38: ================================================================================ 38: ============================= End of ROCm SMI Log ============================== 0: 0: 0: ======================= ROCm System Management Interface ======================= 0: ================================= Concise Info ================================= 0: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 0: 0 41.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 0: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 0: 2 38.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 0: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 0: 4 45.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 0: 5 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 0: 6 43.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 0: 7 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 0: ================================================================================ 0: ============================= End of ROCm SMI Log ============================== 8: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 31: START 2059276: Fri Nov 25 09:39:25 EET 2022 8: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 8: START 2059276: Fri Nov 25 09:39:25 EET 2022 19: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 19: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 19: START 2059276: Fri Nov 25 09:39:25 EET 2022 22: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 22: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 2: 2: 2: ======================= ROCm System Management Interface ======================= 2: ================================= Concise Info ================================= 2: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 2: 0 44.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 2: 1 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 2: 2 43.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 2: 3 39.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 2: 4 45.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 2: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 2: 6 43.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 2: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 2: ================================================================================ 2: ============================= End of ROCm SMI Log ============================== 22: START 2059276: Fri Nov 25 09:39:25 EET 2022 15: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 15: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 55: 55: 55: ======================= ROCm System Management Interface ======================= 55: ================================= Concise Info ================================= 55: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 55: 0 42.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 55: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 55: 2 39.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 55: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 55: 4 45.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 55: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 55: 6 41.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 55: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 55: ================================================================================ 55: ============================= End of ROCm SMI Log ============================== 37: 37: 37: ======================= ROCm System Management Interface ======================= 37: ================================= Concise Info ================================= 37: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 37: 0 49.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 37: 1 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 37: 2 41.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 37: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 37: 4 41.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 37: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 37: 6 45.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 37: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 37: ================================================================================ 37: ============================= End of ROCm SMI Log ============================== 15: START 2059276: Fri Nov 25 09:39:25 EET 2022 56: 56: 56: ======================= ROCm System Management Interface ======================= 56: ================================= Concise Info ================================= 56: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 56: 0 41.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 56: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 56: 2 46.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 56: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 56: 4 41.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 56: 5 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 56: 6 43.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 56: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 56: ================================================================================ 56: ============================= End of ROCm SMI Log ============================== 26: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 26: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 26: START 2059276: Fri Nov 25 09:39:25 EET 2022 11: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 35: 35: 35: ======================= ROCm System Management Interface ======================= 35: ================================= Concise Info ================================= 35: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 35: 0 44.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 35: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 35: 2 42.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 35: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 35: 4 43.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 35: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 35: 6 39.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 35: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 35: ================================================================================ 35: ============================= End of ROCm SMI Log ============================== 11: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 11: START 2059276: Fri Nov 25 09:39:25 EET 2022 21: Megatron-DeepSpeed/pretrain_gpt.py --tensor-model-parallel-size 2 --pipeline-model-parallel-size 2 --num-layers 42 --hidden-size 4096 --num-attention-heads 32 --kv-channels 128 --ffn-hidden-size 16384 --seq-length 2048 --max-position-embeddings 2048 --micro-batch-size 2 --global-batch-size 1024 --train-samples 5_625_981 --vocab-file gpt2/vocab.json --merge-file gpt2/merges.txt --clip-grad 1.0 --kill-switch-path kill-switch-8b7 --bf16 --optimizer adam --adam-beta1 0.9 --adam-beta2 0.999 --adam-eps 1e-8 --lr 2e-4 --min-lr 2e-5 --lr-decay-style cosine --lr-decay-samples 5_625_981 --lr-warmup-samples 56_260 --clip-grad 1.0 --weight-decay 1e-1 --log-interval 10 --save-interval 1000 --eval-interval 1000 --eval-iters 1 --tensorboard-dir tensorboard_8b7 --tensorboard-queue-size 5 --log-timers-to-tensorboard --log-batch-size-to-tensorboard --log-validation-ppl-to-tensorboard --save checkpoints_8b7 --load checkpoints_8b7 --data-path /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document --data-i 21: mpl mmap --split 949,50,1 --deepspeed --deepspeed_config ds_configs/2059276.json --zero-stage 0 21: START 2059276: Fri Nov 25 09:39:25 EET 2022 49: 49: 49: ======================= ROCm System Management Interface ======================= 49: ================================= Concise Info ================================= 49: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 49: 0 42.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 49: 1 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 49: 2 45.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 49: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 49: 4 47.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 49: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 49: 6 42.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 49: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 49: ================================================================================ 49: ============================= End of ROCm SMI Log ============================== 34: 34: 34: ======================= ROCm System Management Interface ======================= 34: ================================= Concise Info ================================= 34: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 34: 0 42.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 34: 1 56.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 34: 2 43.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 34: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 34: 4 43.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 34: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 34: 6 37.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 34: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 34: ================================================================================ 34: ============================= End of ROCm SMI Log ============================== 46: 46: 46: ======================= ROCm System Management Interface ======================= 46: ================================= Concise Info ================================= 46: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 46: 0 43.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 46: 1 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 46: 2 41.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 46: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 46: 4 50.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 46: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 46: 6 45.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 46: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 46: ================================================================================ 46: ============================= End of ROCm SMI Log ============================== 1: 1: 1: ======================= ROCm System Management Interface ======================= 1: ================================= Concise Info ================================= 1: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 1: 0 44.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 1: 1 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 1: 2 41.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 1: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 1: 4 46.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 1: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 1: 6 42.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 1: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 1: ================================================================================ 1: ============================= End of ROCm SMI Log ============================== 47: 47: 47: ======================= ROCm System Management Interface ======================= 47: ================================= Concise Info ================================= 47: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 47: 0 47.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 47: 1 52.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 47: 2 41.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 47: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 47: 4 39.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 47: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 47: 6 41.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 47: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 47: ================================================================================ 47: ============================= End of ROCm SMI Log ============================== 41: 41: 41: ======================= ROCm System Management Interface ======================= 41: ================================= Concise Info ================================= 41: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 41: 0 47.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 41: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 41: 2 44.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 41: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 41: 4 42.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 41: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 41: 6 39.0c 100.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 41: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 41: ================================================================================ 41: ============================= End of ROCm SMI Log ============================== 45: 45: 45: ======================= ROCm System Management Interface ======================= 45: ================================= Concise Info ================================= 45: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 45: 0 45.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 45: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 45: 2 41.0c 102.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 45: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 45: 4 42.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 45: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 45: 6 43.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 45: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 45: ================================================================================ 45: ============================= End of ROCm SMI Log ============================== 16: 16: 16: ======================= ROCm System Management Interface ======================= 16: ================================= Concise Info ================================= 16: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 16: 0 44.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 16: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 16: 2 44.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 16: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 16: 4 41.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 16: 5 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 16: 6 44.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 16: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 16: ================================================================================ 16: ============================= End of ROCm SMI Log ============================== 5: 5: 5: ======================= ROCm System Management Interface ======================= 5: ================================= Concise Info ================================= 5: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 5: 0 42.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 5: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 5: 2 40.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 5: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 5: 4 47.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 5: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 5: 6 41.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 5: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 5: ================================================================================ 5: ============================= End of ROCm SMI Log ============================== 61: 61: 61: ======================= ROCm System Management Interface ======================= 61: ================================= Concise Info ================================= 61: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 61: 0 44.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 61: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 61: 2 45.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 61: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 61: 4 45.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 61: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 61: 6 39.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 61: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 61: ================================================================================ 61: ============================= End of ROCm SMI Log ============================== 10: 10: 10: ======================= ROCm System Management Interface ======================= 10: ================================= Concise Info ================================= 10: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 10: 0 45.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 10: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 10: 2 45.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 10: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 10: 4 39.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 10: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 10: 6 41.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 10: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 10: ================================================================================ 10: ============================= End of ROCm SMI Log ============================== 27: 27: 27: ======================= ROCm System Management Interface ======================= 27: ================================= Concise Info ================================= 27: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 27: 0 45.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 27: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 27: 2 40.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 27: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 27: 4 48.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 27: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 27: 6 43.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 27: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 27: ================================================================================ 27: ============================= End of ROCm SMI Log ============================== 25: 25: 25: ======================= ROCm System Management Interface ======================= 25: ================================= Concise Info ================================= 25: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 25: 0 38.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 25: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 25: 2 42.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 25: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 25: 4 43.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 25: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 25: 6 39.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 25: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 25: ================================================================================ 25: ============================= End of ROCm SMI Log ============================== 51: 51: 51: ======================= ROCm System Management Interface ======================= 51: ================================= Concise Info ================================= 51: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 51: 0 41.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 51: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 51: 2 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 51: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 51: 4 47.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 51: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 51: 6 39.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 51: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 51: ================================================================================ 51: ============================= End of ROCm SMI Log ============================== 62: 62: 62: ======================= ROCm System Management Interface ======================= 62: ================================= Concise Info ================================= 62: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 62: 0 44.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 62: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 62: 2 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 62: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 62: 4 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 62: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 62: 6 44.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 62: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 62: ================================================================================ 62: ============================= End of ROCm SMI Log ============================== 3: 3: 3: ======================= ROCm System Management Interface ======================= 3: ================================= Concise Info ================================= 3: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 3: 0 44.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 3: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 3: 2 37.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 3: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 3: 4 43.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 3: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 3: 6 42.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 3: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 3: ================================================================================ 3: ============================= End of ROCm SMI Log ============================== 24: 24: 24: ======================= ROCm System Management Interface ======================= 24: ================================= Concise Info ================================= 24: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 24: 0 45.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 24: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 24: 2 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 24: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 24: 4 41.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 24: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 24: 6 40.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 24: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 24: ================================================================================ 24: ============================= End of ROCm SMI Log ============================== 7: 7: 7: ======================= ROCm System Management Interface ======================= 7: ================================= Concise Info ================================= 7: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 7: 0 50.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 7: 1 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 7: 2 39.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 7: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 7: 4 44.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 7: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 7: 6 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 7: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 7: ================================================================================ 7: ============================= End of ROCm SMI Log ============================== 17: 17: 17: ======================= ROCm System Management Interface ======================= 17: ================================= Concise Info ================================= 17: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 17: 0 41.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 17: 1 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 17: 2 40.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 17: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 17: 4 42.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 17: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 17: 6 40.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 17: 7 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 17: ================================================================================ 17: ============================= End of ROCm SMI Log ============================== 4: 4: 4: ======================= ROCm System Management Interface ======================= 4: ================================= Concise Info ================================= 4: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 4: 0 45.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 4: 1 53.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 4: 2 36.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 4: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 4: 4 42.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 4: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 4: 6 37.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 4: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 4: ================================================================================ 4: ============================= End of ROCm SMI Log ============================== 43: 43: 43: ======================= ROCm System Management Interface ======================= 43: ================================= Concise Info ================================= 43: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 43: 0 47.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 43: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 43: 2 41.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 43: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 43: 4 41.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 43: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 43: 6 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 43: 7 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 43: ================================================================================ 43: ============================= End of ROCm SMI Log ============================== 23: 23: 23: ======================= ROCm System Management Interface ======================= 23: ================================= Concise Info ================================= 23: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 23: 0 49.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 23: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 23: 2 41.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 23: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 23: 4 43.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 23: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 23: 6 41.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 23: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 23: ================================================================================ 23: ============================= End of ROCm SMI Log ============================== 44: 44: 44: ======================= ROCm System Management Interface ======================= 44: ================================= Concise Info ================================= 44: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 44: 0 48.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 44: 1 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 44: 2 40.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 44: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 44: 4 40.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 44: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 44: 6 39.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 44: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 44: ================================================================================ 44: ============================= End of ROCm SMI Log ============================== 20: 20: 20: ======================= ROCm System Management Interface ======================= 20: ================================= Concise Info ================================= 20: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 20: 0 41.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 20: 1 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 20: 2 41.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 20: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 20: 4 43.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 20: 5 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 20: 6 42.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 20: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 20: ================================================================================ 20: ============================= End of ROCm SMI Log ============================== 6: 6: 6: ======================= ROCm System Management Interface ======================= 6: ================================= Concise Info ================================= 6: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 6: 0 46.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 6: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 6: 2 42.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 6: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 6: 4 42.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 6: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 6: 6 41.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 6: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 6: ================================================================================ 6: ============================= End of ROCm SMI Log ============================== 42: 42: 42: ======================= ROCm System Management Interface ======================= 42: ================================= Concise Info ================================= 42: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 42: 0 44.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 42: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 42: 2 42.0c 83.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 42: 3 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 42: 4 45.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 42: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 42: 6 39.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 42: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 42: ================================================================================ 42: ============================= End of ROCm SMI Log ============================== 9: 9: 9: ======================= ROCm System Management Interface ======================= 9: ================================= Concise Info ================================= 9: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 9: 0 44.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 9: 1 50.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 9: 2 44.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 9: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 9: 4 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 9: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 9: 6 42.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 9: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 9: ================================================================================ 9: ============================= End of ROCm SMI Log ============================== 32: 32: 32: ======================= ROCm System Management Interface ======================= 32: ================================= Concise Info ================================= 32: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 32: 0 45.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 32: 1 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 32: 2 46.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 32: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 32: 4 42.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 32: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 32: 6 39.0c 96.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 32: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 32: ================================================================================ 32: ============================= End of ROCm SMI Log ============================== 48: 48: 48: ======================= ROCm System Management Interface ======================= 48: ================================= Concise Info ================================= 48: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 48: 0 42.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 48: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 48: 2 41.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 48: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 48: 4 43.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 48: 5 51.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 48: 6 44.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 48: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 48: ================================================================================ 48: ============================= End of ROCm SMI Log ============================== 50: 50: 50: ======================= ROCm System Management Interface ======================= 50: ================================= Concise Info ================================= 50: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 50: 0 39.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 50: 1 53.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 50: 2 41.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 50: 3 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 50: 4 41.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 50: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 50: 6 45.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 50: 7 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 50: ================================================================================ 50: ============================= End of ROCm SMI Log ============================== 12: 12: 12: ======================= ROCm System Management Interface ======================= 12: ================================= Concise Info ================================= 12: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 12: 0 45.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 12: 1 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 12: 2 42.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 12: 3 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 12: 4 41.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 12: 5 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 12: 6 41.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 12: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 12: ================================================================================ 12: ============================= End of ROCm SMI Log ============================== 18: 18: 18: ======================= ROCm System Management Interface ======================= 18: ================================= Concise Info ================================= 18: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 18: 0 49.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 18: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 18: 2 42.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 18: 3 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 18: 4 45.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 18: 5 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 18: 6 37.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 18: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 18: ================================================================================ 18: ============================= End of ROCm SMI Log ============================== 14: 14: 14: ======================= ROCm System Management Interface ======================= 14: ================================= Concise Info ================================= 14: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 14: 0 45.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 14: 1 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 14: 2 38.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 14: 3 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 14: 4 41.0c 87.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 14: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 14: 6 37.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 14: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 14: ================================================================================ 14: ============================= End of ROCm SMI Log ============================== 31: 31: 31: ======================= ROCm System Management Interface ======================= 31: ================================= Concise Info ================================= 31: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 31: 0 43.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 31: 1 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 31: 2 41.0c 85.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 31: 3 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 31: 4 45.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 31: 5 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 31: 6 41.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 31: 7 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 31: ================================================================================ 31: ============================= End of ROCm SMI Log ============================== 8: 8: 8: ======================= ROCm System Management Interface ======================= 8: ================================= Concise Info ================================= 8: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 8: 0 45.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 8: 1 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 8: 2 44.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 8: 3 40.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 8: 4 46.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 8: 5 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 8: 6 42.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 8: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 8: ================================================================================ 8: ============================= End of ROCm SMI Log ============================== 19: 19: 19: ======================= ROCm System Management Interface ======================= 19: ================================= Concise Info ================================= 19: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 19: 0 48.0c 101.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 19: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 19: 2 43.0c 101.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 19: 3 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 19: 4 42.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 19: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 19: 6 42.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 19: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 19: ================================================================================ 19: ============================= End of ROCm SMI Log ============================== 15: 15: 15: ======================= ROCm System Management Interface ======================= 15: ================================= Concise Info ================================= 15: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 15: 0 46.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 15: 1 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 15: 2 43.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 15: 3 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 15: 4 43.0c 90.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 15: 5 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 15: 6 44.0c 93.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 15: 7 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 15: ================================================================================ 15: ============================= End of ROCm SMI Log ============================== 22: 22: 22: ======================= ROCm System Management Interface ======================= 22: ================================= Concise Info ================================= 22: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 22: 0 37.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 22: 1 42.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 22: 2 40.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 22: 3 53.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 22: 4 41.0c 94.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 22: 5 47.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 22: 6 45.0c 99.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 22: 7 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 22: ================================================================================ 22: ============================= End of ROCm SMI Log ============================== 11: 11: 11: ======================= ROCm System Management Interface ======================= 11: ================================= Concise Info ================================= 11: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 11: 0 44.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 11: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 11: 2 40.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 11: 3 41.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 11: 4 45.0c 82.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 11: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 11: 6 46.0c 88.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 11: 7 44.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 11: ================================================================================ 11: ============================= End of ROCm SMI Log ============================== 26: 26: 26: ======================= ROCm System Management Interface ======================= 26: ================================= Concise Info ================================= 26: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 26: 0 42.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 26: 1 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 26: 2 41.0c 92.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 26: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 26: 4 44.0c 91.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 26: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 26: 6 40.0c 95.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 26: 7 46.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 26: ================================================================================ 26: ============================= End of ROCm SMI Log ============================== 21: 21: 21: ======================= ROCm System Management Interface ======================= 21: ================================= Concise Info ================================= 21: GPU Temp AvgPwr SCLK MCLK Fan Perf PwrCap VRAM% GPU% 21: 0 43.0c 97.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 21: 1 49.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 21: 2 40.0c 86.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 21: 3 43.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 21: 4 40.0c 89.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 21: 5 48.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 21: 6 43.0c 84.0W 800Mhz 1600Mhz 0% auto 560.0W 0% 0% 21: 7 45.0c N/A 800Mhz 1600Mhz 0% auto 0.0W 0% 0% 21: ================================================================================ 21: ============================= End of ROCm SMI Log ============================== 58: Launching on nid005802 (58/64), master nid005658 port 9999, GPUs 8, CUDA: True 57: Launching on nid005801 (57/64), master nid005658 port 9999, GPUs 8, CUDA: True 40: Launching on nid005698 (40/64), master nid005658 port 9999, GPUs 8, CUDA: True 52: Launching on nid005796 (52/64), master nid005658 port 9999, GPUs 8, CUDA: True 56: Launching on nid005800 (56/64), master nid005658 port 9999, GPUs 8, CUDA: True 41: Launching on nid005699 (41/64), master nid005658 port 9999, GPUs 8, CUDA: True 33: Launching on nid005691 (33/64), master nid005658 port 9999, GPUs 8, CUDA: True 63: Launching on nid005807 (63/64), master nid005658 port 9999, GPUs 8, CUDA: True 38: Launching on nid005696 (38/64), master nid005658 port 9999, GPUs 8, CUDA: True 30: Launching on nid005688 (30/64), master nid005658 port 9999, GPUs 8, CUDA: True 53: Launching on nid005797 (53/64), master nid005658 port 9999, GPUs 8, CUDA: True 34: Launching on nid005692 (34/64), master nid005658 port 9999, GPUs 8, CUDA: True 0: Launching on nid005658 (0/64), master nid005658 port 9999, GPUs 8, CUDA: True 29: Launching on nid005687 (29/64), master nid005658 port 9999, GPUs 8, CUDA: True 2: Launching on nid005660 (2/64), master nid005658 port 9999, GPUs 8, CUDA: True 35: Launching on nid005693 (35/64), master nid005658 port 9999, GPUs 8, CUDA: True 49: Launching on nid005793 (49/64), master nid005658 port 9999, GPUs 8, CUDA: True 37: Launching on nid005695 (37/64), master nid005658 port 9999, GPUs 8, CUDA: True 27: Launching on nid005685 (27/64), master nid005658 port 9999, GPUs 8, CUDA: True 10: Launching on nid005668 (10/64), master nid005658 port 9999, GPUs 8, CUDA: True 7: Launching on nid005665 (7/64), master nid005658 port 9999, GPUs 8, CUDA: True 4: Launching on nid005662 (4/64), master nid005658 port 9999, GPUs 8, CUDA: True 54: Launching on nid005798 (54/64), master nid005658 port 9999, GPUs 8, CUDA: True 44: Launching on nid005702 (44/64), master nid005658 port 9999, GPUs 8, CUDA: True 32: Launching on nid005690 (32/64), master nid005658 port 9999, GPUs 8, CUDA: True 18: Launching on nid005676 (18/64), master nid005658 port 9999, GPUs 8, CUDA: True 55: Launching on nid005799 (55/64), master nid005658 port 9999, GPUs 8, CUDA: True 61: Launching on nid005805 (61/64), master nid005658 port 9999, GPUs 8, CUDA: True 1: Launching on nid005659 (1/64), master nid005658 port 9999, GPUs 8, CUDA: True 28: Launching on nid005686 (28/64), master nid005658 port 9999, GPUs 8, CUDA: True 59: Launching on nid005803 (59/64), master nid005658 port 9999, GPUs 8, CUDA: True 39: Launching on nid005697 (39/64), master nid005658 port 9999, GPUs 8, CUDA: True 6: Launching on nid005664 (6/64), master nid005658 port 9999, GPUs 8, CUDA: True 24: Launching on nid005682 (24/64), master nid005658 port 9999, GPUs 8, CUDA: True 5: Launching on nid005663 (5/64), master nid005658 port 9999, GPUs 8, CUDA: True 21: Launching on nid005679 (21/64), master nid005658 port 9999, GPUs 8, CUDA: True 3: Launching on nid005661 (3/64), master nid005658 port 9999, GPUs 8, CUDA: True 50: Launching on nid005794 (50/64), master nid005658 port 9999, GPUs 8, CUDA: True 17: Launching on nid005675 (17/64), master nid005658 port 9999, GPUs 8, CUDA: True 23: Launching on nid005681 (23/64), master nid005658 port 9999, GPUs 8, CUDA: True 48: Launching on nid005792 (48/64), master nid005658 port 9999, GPUs 8, CUDA: True 25: Launching on nid005683 (25/64), master nid005658 port 9999, GPUs 8, CUDA: True 51: Launching on nid005795 (51/64), master nid005658 port 9999, GPUs 8, CUDA: True 13: Launching on nid005671 (13/64), master nid005658 port 9999, GPUs 8, CUDA: True 8: Launching on nid005666 (8/64), master nid005658 port 9999, GPUs 8, CUDA: True 42: Launching on nid005700 (42/64), master nid005658 port 9999, GPUs 8, CUDA: True 45: Launching on nid005703 (45/64), master nid005658 port 9999, GPUs 8, CUDA: True 15: Launching on nid005673 (15/64), master nid005658 port 9999, GPUs 8, CUDA: True 11: Launching on nid005669 (11/64), master nid005658 port 9999, GPUs 8, CUDA: True 43: Launching on nid005701 (43/64), master nid005658 port 9999, GPUs 8, CUDA: True 36: Launching on nid005694 (36/64), master nid005658 port 9999, GPUs 8, CUDA: True 19: Launching on nid005677 (19/64), master nid005658 port 9999, GPUs 8, CUDA: True 60: Launching on nid005804 (60/64), master nid005658 port 9999, GPUs 8, CUDA: True 20: Launching on nid005678 (20/64), master nid005658 port 9999, GPUs 8, CUDA: True 26: Launching on nid005684 (26/64), master nid005658 port 9999, GPUs 8, CUDA: True 16: Launching on nid005674 (16/64), master nid005658 port 9999, GPUs 8, CUDA: True 62: Launching on nid005806 (62/64), master nid005658 port 9999, GPUs 8, CUDA: True 31: Launching on nid005689 (31/64), master nid005658 port 9999, GPUs 8, CUDA: True 14: Launching on nid005672 (14/64), master nid005658 port 9999, GPUs 8, CUDA: True 9: Launching on nid005667 (9/64), master nid005658 port 9999, GPUs 8, CUDA: True 22: Launching on nid005680 (22/64), master nid005658 port 9999, GPUs 8, CUDA: True 12: Launching on nid005670 (12/64), master nid005658 port 9999, GPUs 8, CUDA: True 46: Launching on nid005790 (46/64), master nid005658 port 9999, GPUs 8, CUDA: True 47: Launching on nid005791 (47/64), master nid005658 port 9999, GPUs 8, CUDA: True 0: using world size: 512, data-parallel-size: 128, tensor-model-parallel size: 2, pipeline-model-parallel size: 2 0: accumulate and all-reduce gradients in fp32 for bfloat16 data type. 0: using torch.bfloat16 for parameters ... 0: ------------------------ arguments ------------------------ 0: abort_on_unmet_fused_kernel_constraints ......... False 0: accumulate_allreduce_grads_in_fp32 .............. True 0: adam_beta1 ...................................... 0.9 0: adam_beta2 ...................................... 0.999 0: adam_eps ........................................ 1e-08 0: adlr_autoresume ................................. False 0: adlr_autoresume_interval ........................ 1000 0: apply_query_key_layer_scaling ................... True 0: apply_residual_connection_post_layernorm ........ False 0: attention_dropout ............................... 0.1 0: attention_softmax_in_fp32 ....................... False 0: bert_binary_head ................................ True 0: bert_load ....................................... None 0: bf16 ............................................ True 0: bias_dropout_fusion ............................. True 0: bias_gelu_fusion ................................ True 0: biencoder_projection_dim ........................ 0 0: biencoder_shared_query_context_model ............ False 0: block_data_path ................................. None 0: checkpoint_activations .......................... False 0: checkpoint_in_cpu ............................... False 0: checkpoint_num_layers ........................... 1 0: clip_grad ....................................... 1.0 0: codecarbon_dir .................................. None 0: consumed_train_samples .......................... 0 0: consumed_train_tokens ........................... 0 0: consumed_valid_samples .......................... 0 0: contigious_checkpointing ........................ False 0: cpu_optimizer ................................... False 0: cpu_torch_adam .................................. False 0: curriculum_learning ............................. False 0: data_impl ....................................... mmap 0: data_parallel_size .............................. 128 0: data_path ....................................... ['/scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document'] 0: dataloader_type ................................. single 0: DDP_impl ........................................ local 0: decoder_seq_length .............................. None 0: deepscale ....................................... False 0: deepscale_config ................................ None 0: deepspeed ....................................... True 0: deepspeed_activation_checkpointing .............. False 0: deepspeed_config ................................ ds_configs/2059276.json 0: deepspeed_mpi ................................... False 0: distribute_checkpointed_activations ............. False 0: distributed_backend ............................. nccl 0: embed_layernorm ................................. False 0: embedding_path .................................. None 0: encoder_seq_length .............................. 2048 0: eod_mask_loss ................................... False 0: eval_interval ................................... 1000 0: eval_iters ...................................... 1 0: eval_only ....................................... None 0: evidence_data_path .............................. None 0: exit_duration_in_mins ........................... None 0: exit_interval ................................... None 0: ffn_hidden_size ................................. 16384 0: finetune ........................................ False 0: fp16 ............................................ False 0: fp16_lm_cross_entropy ........................... False 0: fp32_residual_connection ........................ False 0: gigaflos_no_embeds .............................. 0 0: global_batch_size ............................... 1024 0: glu_activation .................................. None 0: hidden_dropout .................................. 0.1 0: hidden_size ..................................... 4096 0: hysteresis ...................................... 2 0: ict_head_size ................................... None 0: ict_load ........................................ None 0: img_dim ......................................... 224 0: indexer_batch_size .............................. 128 0: indexer_log_interval ............................ 1000 0: inference ....................................... False 0: init_method_std ................................. 0.02 0: init_method_xavier_uniform ...................... False 0: initial_loss_scale .............................. 4294967296 0: kill_switch_path ................................ kill-switch-8b7 0: kv_channels ..................................... 128 0: layer_norm_fusion ............................... True 0: layernorm_epsilon ............................... 1e-05 0: lazy_mpu_init ................................... None 0: load ............................................ checkpoints_8b7 0: local_rank ...................................... None 0: log_batch_size_to_tensorboard ................... True 0: log_interval .................................... 10 0: log_learning_rate_to_tensorboard ................ True 0: log_level ....................................... None 0: log_level_replica ............................... None 0: log_loss_scale_to_tensorboard ................... True 0: log_num_zeros_in_grad ........................... False 0: log_params_norm ................................. False 0: log_path ........................................ None 0: log_timers_to_tensorboard ....................... True 0: log_validation_ppl_to_tensorboard ............... True 0: loss_on_targets_only ............................ False 0: loss_scale ...................................... None 0: loss_scale_window ............................... 1000 0: lr .............................................. 0.0002 0: lr_decay_iters .................................. None 0: lr_decay_samples ................................ 5625981 0: lr_decay_style .................................. cosine 0: lr_decay_tokens ................................. None 0: lr_warmup_fraction .............................. None 0: lr_warmup_iters ................................. 0 0: lr_warmup_samples ............................... 56260 0: make_vocab_size_divisible_by .................... 128 0: mask_prob ....................................... 0.15 0: masked_softmax_fusion ........................... True 0: max_position_embeddings ......................... 2048 0: mean_noise_span_length .......................... None 0: memory_centric_tiled_linear ..................... False 0: merge_file ...................................... gpt2/merges.txt 0: micro_batch_size ................................ 2 0: min_loss_scale .................................. 1.0 0: min_lr .......................................... 2e-05 0: mmap_warmup ..................................... False 0: no_load_optim ................................... None 0: no_load_rng ..................................... None 0: no_save_optim ................................... None 0: no_save_rng ..................................... None 0: noise_density ................................... None 0: num_attention_heads ............................. 32 0: num_channels .................................... 3 0: num_classes ..................................... 1000 0: num_layers ...................................... 42 0: num_layers_per_virtual_pipeline_stage ........... None 0: num_workers ..................................... 2 0: onnx_safe ....................................... None 0: openai_gelu ..................................... False 0: optimizer ....................................... adam 0: optimizer_fusion ................................ True 0: override_lr_scheduler ........................... False 0: pad_vocab_size_to ............................... None 0: params_dtype .................................... torch.bfloat16 0: partition_activations ........................... False 0: patch_dim ....................................... 16 0: pipeline_model_parallel_size .................... 2 0: position_embedding_type ......................... PositionEmbeddingType.absolute 0: pp_partition_method ............................. None 0: profile_backward ................................ False 0: query_in_block_prob ............................. 0.1 0: rampup_batch_size ............................... None 0: rank ............................................ 0 0: remote_device ................................... none 0: reset_attention_mask ............................ False 0: reset_position_ids .............................. False 0: retriever_report_topk_accuracies ................ [] 0: retriever_score_scaling ......................... False 0: retriever_seq_length ............................ 256 0: reweight_loss_based_on_position_frequency ....... False 0: sample_rate ..................................... 1.0 0: save ............................................ checkpoints_8b7 0: save_interval ................................... 1000 0: scatter_gather_tensors_in_pipeline .............. True 0: scattered_embeddings ............................ False 0: seed ............................................ 1234 0: seq_length ...................................... 2048 0: sgd_momentum .................................... 0.9 0: short_seq_prob .................................. 0.1 0: skip_train_iteration_range ...................... None 0: split ........................................... 949,50,1 0: split_transformers .............................. False 0: sync_tp_duplicated_parameters ................... False 0: synchronize_each_layer .......................... False 0: tensor_model_parallel_size ...................... 2 0: tensorboard_dir ................................. tensorboard_8b7 0: tensorboard_log_interval ........................ 1 0: tensorboard_queue_size .......................... 5 0: test_weighted_split_names ....................... None 0: test_weighted_split_paths ....................... None 0: test_weighted_split_paths_path .................. None 0: test_weighted_split_splits ...................... None 0: test_weighted_split_weights ..................... None 0: tile_factor ..................................... 1 0: titles_data_path ................................ None 0: tokenizer_name_or_path .......................... None 0: tokenizer_type .................................. GPT2BPETokenizer 0: train_iters ..................................... None 0: train_samples ................................... 5625981 0: train_tokens .................................... None 0: train_weighted_split_paths ...................... None 0: train_weighted_split_paths_path ................. None 0: universal_checkpoint ............................ False 0: use_bnb_optimizer ............................... False 0: use_checkpoint_lr_scheduler ..................... False 0: use_contiguous_buffers_in_ddp ................... True 0: use_cpu_initialization .......................... None 0: use_one_sent_docs ............................... False 0: use_pin_memory .................................. False 0: valid_num_workers ............................... 2 0: valid_weighted_split_names ...................... None 0: valid_weighted_split_paths ...................... None 0: valid_weighted_split_paths_path ................. None 0: valid_weighted_split_splits ..................... None 0: valid_weighted_split_weights .................... None 0: virtual_pipeline_model_parallel_size ............ None 0: vocab_extra_ids ................................. 0 0: vocab_file ...................................... gpt2/vocab.json 0: weight_decay .................................... 0.1 0: world_size ...................................... 512 0: zero_allgather_bucket_size ...................... 0.0 0: zero_contigious_gradients ....................... False 0: zero_reduce_bucket_size ......................... 0.0 0: zero_reduce_scatter ............................. False 0: zero_stage ...................................... 0 0: -------------------- end of arguments --------------------- 0: setting number of micro-batches to constant 4 0: > building GPT2BPETokenizer tokenizer ... 0: > padded vocab (size: 50257) with 175 dummy tokens (new size: 50432) 0: DeepSpeed general environment info: 0: torch install path ............... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch'] 0: torch version .................... 1.13.0+rocm5.2 0: torch cuda version ............... None 0: torch hip version ................ 5.2.21151-afdc89f8 0: nvcc version ..................... None 0: deepspeed install path ........... ['/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/deepspeed'] 0: deepspeed info ................... 0.7.5, unknown, unknown 0: deepspeed wheel compiled w. ...... torch 1.13, hip 5.1 0: **** Git info for Megatron: git_hash=unknown git_branch=unknown **** 0: > initializing torch distributed ... 0: [2022-11-25 09:39:34,646] [INFO] [comm.py:633:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl 63: > setting tensorboard ... 0: > initializing tensor model parallel with size 2 0: > initializing pipeline model parallel with size 2 0: > setting random seeds to 1234 ... 0: > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234 0: > compiling dataset index builder ... 0: make: Entering directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' 0: make: Nothing to be done for 'default'. 0: make: Leaving directory '/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/data' 0: >>> done with dataset index builder. Compilation time: 0.063 seconds 0: > compiling and loading fused kernels ... 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.cpp [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.hip [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] 0: Total number of unsupported CUDA function calls: 0 0: 0: 0: Total number of replaced kernel launches: 87 0: [1/1] c++ scaled_upper_triang_masked_softmax_hip.o scaled_upper_triang_masked_softmax_hip.cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o scaled_upper_triang_masked_softmax_cuda.so 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.cpp [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_cuda.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.hip [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] 0: Total number of unsupported CUDA function calls: 0 0: 0: 0: Total number of replaced kernel launches: 63 0: ninja: no work to do. 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda.cpp [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_cuda_kernel.cu -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/layer_norm_hip_kernel.hip [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/type_shim.h [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/compat.h [skipped, no changes] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_upper_triang_masked_softmax_hip.h [skipped, already hipified] 0: /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax.h -> /pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/Megatron-DeepSpeed/megatron/fused_kernels/scaled_masked_softmax_hip.h [skipped, already hipified] 0: Total number of unsupported CUDA function calls: 0 0: 0: 0: Total number of replaced kernel launches: 67 0: [1/1] c++ layer_norm_hip_kernel.cuda.o layer_norm_cuda.o -shared -L/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/venv/lib/python3.9/site-packages/torch/lib -lc10 -lc10_hip -ltorch_cpu -ltorch_hip -ltorch -ltorch_python -L/pfs/lustrep2/projappl/project_462000125/samantao-public/rocm/rocm-5.2.3/lib -lamdhip64 -o fused_mix_prec_layer_norm_cuda.so 0: >>> done with compiling and loading fused kernels. Compilation time: 29.863 seconds 0: time to initialize megatron (seconds): 100.581 0: [after megatron is initialized] datetime: 2022-11-25 09:40:20 0: building GPT model ... 0: [2022-11-25 09:40:20,645] [INFO] [utils.py:827:see_memory_usage] Before Building Model 0: [2022-11-25 09:40:20,645] [INFO] [utils.py:828:see_memory_usage] MA 0.0 GB Max_MA 0.0 GB CA 0.0 GB Max_CA 0 GB 0: [2022-11-25 09:40:20,646] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 43.83 GB, percent = 8.7% 0: SEED_LAYERS=False BASE_SEED=1234 SEED_FN=None 0: Using topology: {ProcessCoord(pipe=0, data=0, model=0): 0, ProcessCoord(pipe=0, data=0, model=1): 1, ProcessCoord(pipe=0, data=1, model=0): 2, ProcessCoord(pipe=0, data=1, model=1): 3, ProcessCoord(pipe=0, data=2, model=0): 4, ProcessCoord(pipe=0, data=2, model=1): 5, ProcessCoord(pipe=0, data=3, model=0): 6, ProcessCoord(pipe=0, data=3, model=1): 7, ProcessCoord(pipe=0, data=4, model=0): 8, ProcessCoord(pipe=0, data=4, model=1): 9, ProcessCoord(pipe=0, data=5, model=0): 10, ProcessCoord(pipe=0, data=5, model=1): 11, ProcessCoord(pipe=0, data=6, model=0): 12, ProcessCoord(pipe=0, data=6, model=1): 13, ProcessCoord(pipe=0, data=7, model=0): 14, ProcessCoord(pipe=0, data=7, model=1): 15, ProcessCoord(pipe=0, data=8, model=0): 16, ProcessCoord(pipe=0, data=8, model=1): 17, ProcessCoord(pipe=0, data=9, model=0): 18, ProcessCoord(pipe=0, data=9, model=1): 19, ProcessCoord(pipe=0, data=10, model=0): 20, ProcessCoord(pipe=0, data=10, model=1): 21, ProcessCoord(pipe=0, data=11, model=0): 22, ProcessCoord(pipe=0, data 0: =11, model=1): 23, ProcessCoord(pipe=0, data=12, model=0): 24, ProcessCoord(pipe=0, data=12, model=1): 25, ProcessCoord(pipe=0, data=13, model=0): 26, ProcessCoord(pipe=0, data=13, model=1): 27, ProcessCoord(pipe=0, data=14, model=0): 28, ProcessCoord(pipe=0, data=14, model=1): 29, ProcessCoord(pipe=0, data=15, model=0): 30, ProcessCoord(pipe=0, data=15, model=1): 31, ProcessCoord(pipe=0, data=16, model=0): 32, ProcessCoord(pipe=0, data=16, model=1): 33, ProcessCoord(pipe=0, data=17, model=0): 34, ProcessCoord(pipe=0, data=17, model=1): 35, ProcessCoord(pipe=0, data=18, model=0): 36, ProcessCoord(pipe=0, data=18, model=1): 37, ProcessCoord(pipe=0, data=19, model=0): 38, ProcessCoord(pipe=0, data=19, model=1): 39, ProcessCoord(pipe=0, data=20, model=0): 40, ProcessCoord(pipe=0, data=20, model=1): 41, ProcessCoord(pipe=0, data=21, model=0): 42, ProcessCoord(pipe=0, data=21, model=1): 43, ProcessCoord(pipe=0, data=22, model=0): 44, ProcessCoord(pipe=0, data=22, model=1): 45, ProcessCoord(pipe=0, data=23, model=0 0: ): 46, ProcessCoord(pipe=0, data=23, model=1): 47, ProcessCoord(pipe=0, data=24, model=0): 48, ProcessCoord(pipe=0, data=24, model=1): 49, ProcessCoord(pipe=0, data=25, model=0): 50, ProcessCoord(pipe=0, data=25, model=1): 51, ProcessCoord(pipe=0, data=26, model=0): 52, ProcessCoord(pipe=0, data=26, model=1): 53, ProcessCoord(pipe=0, data=27, model=0): 54, ProcessCoord(pipe=0, data=27, model=1): 55, ProcessCoord(pipe=0, data=28, model=0): 56, ProcessCoord(pipe=0, data=28, model=1): 57, ProcessCoord(pipe=0, data=29, model=0): 58, ProcessCoord(pipe=0, data=29, model=1): 59, ProcessCoord(pipe=0, data=30, model=0): 60, ProcessCoord(pipe=0, data=30, model=1): 61, ProcessCoord(pipe=0, data=31, model=0): 62, ProcessCoord(pipe=0, data=31, model=1): 63, ProcessCoord(pipe=0, data=32, model=0): 64, ProcessCoord(pipe=0, data=32, model=1): 65, ProcessCoord(pipe=0, data=33, model=0): 66, ProcessCoord(pipe=0, data=33, model=1): 67, ProcessCoord(pipe=0, data=34, model=0): 68, ProcessCoord(pipe=0, data=34, model=1): 69, Proce 0: ssCoord(pipe=0, data=35, model=0): 70, ProcessCoord(pipe=0, data=35, model=1): 71, ProcessCoord(pipe=0, data=36, model=0): 72, ProcessCoord(pipe=0, data=36, model=1): 73, ProcessCoord(pipe=0, data=37, model=0): 74, ProcessCoord(pipe=0, data=37, model=1): 75, ProcessCoord(pipe=0, data=38, model=0): 76, ProcessCoord(pipe=0, data=38, model=1): 77, ProcessCoord(pipe=0, data=39, model=0): 78, ProcessCoord(pipe=0, data=39, model=1): 79, ProcessCoord(pipe=0, data=40, model=0): 80, ProcessCoord(pipe=0, data=40, model=1): 81, ProcessCoord(pipe=0, data=41, model=0): 82, ProcessCoord(pipe=0, data=41, model=1): 83, ProcessCoord(pipe=0, data=42, model=0): 84, ProcessCoord(pipe=0, data=42, model=1): 85, ProcessCoord(pipe=0, data=43, model=0): 86, ProcessCoord(pipe=0, data=43, model=1): 87, ProcessCoord(pipe=0, data=44, model=0): 88, ProcessCoord(pipe=0, data=44, model=1): 89, ProcessCoord(pipe=0, data=45, model=0): 90, ProcessCoord(pipe=0, data=45, model=1): 91, ProcessCoord(pipe=0, data=46, model=0): 92, ProcessCoord(pipe 0: =0, data=46, model=1): 93, ProcessCoord(pipe=0, data=47, model=0): 94, ProcessCoord(pipe=0, data=47, model=1): 95, ProcessCoord(pipe=0, data=48, model=0): 96, ProcessCoord(pipe=0, data=48, model=1): 97, ProcessCoord(pipe=0, data=49, model=0): 98, ProcessCoord(pipe=0, data=49, model=1): 99, ProcessCoord(pipe=0, data=50, model=0): 100, ProcessCoord(pipe=0, data=50, model=1): 101, ProcessCoord(pipe=0, data=51, model=0): 102, ProcessCoord(pipe=0, data=51, model=1): 103, ProcessCoord(pipe=0, data=52, model=0): 104, ProcessCoord(pipe=0, data=52, model=1): 105, ProcessCoord(pipe=0, data=53, model=0): 106, ProcessCoord(pipe=0, data=53, model=1): 107, ProcessCoord(pipe=0, data=54, model=0): 108, ProcessCoord(pipe=0, data=54, model=1): 109, ProcessCoord(pipe=0, data=55, model=0): 110, ProcessCoord(pipe=0, data=55, model=1): 111, ProcessCoord(pipe=0, data=56, model=0): 112, ProcessCoord(pipe=0, data=56, model=1): 113, ProcessCoord(pipe=0, data=57, model=0): 114, ProcessCoord(pipe=0, data=57, model=1): 115, ProcessCoord( 0: pipe=0, data=58, model=0): 116, ProcessCoord(pipe=0, data=58, model=1): 117, ProcessCoord(pipe=0, data=59, model=0): 118, ProcessCoord(pipe=0, data=59, model=1): 119, ProcessCoord(pipe=0, data=60, model=0): 120, ProcessCoord(pipe=0, data=60, model=1): 121, ProcessCoord(pipe=0, data=61, model=0): 122, ProcessCoord(pipe=0, data=61, model=1): 123, ProcessCoord(pipe=0, data=62, model=0): 124, ProcessCoord(pipe=0, data=62, model=1): 125, ProcessCoord(pipe=0, data=63, model=0): 126, ProcessCoord(pipe=0, data=63, model=1): 127, ProcessCoord(pipe=0, data=64, model=0): 128, ProcessCoord(pipe=0, data=64, model=1): 129, ProcessCoord(pipe=0, data=65, model=0): 130, ProcessCoord(pipe=0, data=65, model=1): 131, ProcessCoord(pipe=0, data=66, model=0): 132, ProcessCoord(pipe=0, data=66, model=1): 133, ProcessCoord(pipe=0, data=67, model=0): 134, ProcessCoord(pipe=0, data=67, model=1): 135, ProcessCoord(pipe=0, data=68, model=0): 136, ProcessCoord(pipe=0, data=68, model=1): 137, ProcessCoord(pipe=0, data=69, model=0): 138, Pr 0: ocessCoord(pipe=0, data=69, model=1): 139, ProcessCoord(pipe=0, data=70, model=0): 140, ProcessCoord(pipe=0, data=70, model=1): 141, ProcessCoord(pipe=0, data=71, model=0): 142, ProcessCoord(pipe=0, data=71, model=1): 143, ProcessCoord(pipe=0, data=72, model=0): 144, ProcessCoord(pipe=0, data=72, model=1): 145, ProcessCoord(pipe=0, data=73, model=0): 146, ProcessCoord(pipe=0, data=73, model=1): 147, ProcessCoord(pipe=0, data=74, model=0): 148, ProcessCoord(pipe=0, data=74, model=1): 149, ProcessCoord(pipe=0, data=75, model=0): 150, ProcessCoord(pipe=0, data=75, model=1): 151, ProcessCoord(pipe=0, data=76, model=0): 152, ProcessCoord(pipe=0, data=76, model=1): 153, ProcessCoord(pipe=0, data=77, model=0): 154, ProcessCoord(pipe=0, data=77, model=1): 155, ProcessCoord(pipe=0, data=78, model=0): 156, ProcessCoord(pipe=0, data=78, model=1): 157, ProcessCoord(pipe=0, data=79, model=0): 158, ProcessCoord(pipe=0, data=79, model=1): 159, ProcessCoord(pipe=0, data=80, model=0): 160, ProcessCoord(pipe=0, data=80, model= 0: 1): 161, ProcessCoord(pipe=0, data=81, model=0): 162, ProcessCoord(pipe=0, data=81, model=1): 163, ProcessCoord(pipe=0, data=82, model=0): 164, ProcessCoord(pipe=0, data=82, model=1): 165, ProcessCoord(pipe=0, data=83, model=0): 166, ProcessCoord(pipe=0, data=83, model=1): 167, ProcessCoord(pipe=0, data=84, model=0): 168, ProcessCoord(pipe=0, data=84, model=1): 169, ProcessCoord(pipe=0, data=85, model=0): 170, ProcessCoord(pipe=0, data=85, model=1): 171, ProcessCoord(pipe=0, data=86, model=0): 172, ProcessCoord(pipe=0, data=86, model=1): 173, ProcessCoord(pipe=0, data=87, model=0): 174, ProcessCoord(pipe=0, data=87, model=1): 175, ProcessCoord(pipe=0, data=88, model=0): 176, ProcessCoord(pipe=0, data=88, model=1): 177, ProcessCoord(pipe=0, data=89, model=0): 178, ProcessCoord(pipe=0, data=89, model=1): 179, ProcessCoord(pipe=0, data=90, model=0): 180, ProcessCoord(pipe=0, data=90, model=1): 181, ProcessCoord(pipe=0, data=91, model=0): 182, ProcessCoord(pipe=0, data=91, model=1): 183, ProcessCoord(pipe=0, data 0: =92, model=0): 184, ProcessCoord(pipe=0, data=92, model=1): 185, ProcessCoord(pipe=0, data=93, model=0): 186, ProcessCoord(pipe=0, data=93, model=1): 187, ProcessCoord(pipe=0, data=94, model=0): 188, ProcessCoord(pipe=0, data=94, model=1): 189, ProcessCoord(pipe=0, data=95, model=0): 190, ProcessCoord(pipe=0, data=95, model=1): 191, ProcessCoord(pipe=0, data=96, model=0): 192, ProcessCoord(pipe=0, data=96, model=1): 193, ProcessCoord(pipe=0, data=97, model=0): 194, ProcessCoord(pipe=0, data=97, model=1): 195, ProcessCoord(pipe=0, data=98, model=0): 196, ProcessCoord(pipe=0, data=98, model=1): 197, ProcessCoord(pipe=0, data=99, model=0): 198, ProcessCoord(pipe=0, data=99, model=1): 199, ProcessCoord(pipe=0, data=100, model=0): 200, ProcessCoord(pipe=0, data=100, model=1): 201, ProcessCoord(pipe=0, data=101, model=0): 202, ProcessCoord(pipe=0, data=101, model=1): 203, ProcessCoord(pipe=0, data=102, model=0): 204, ProcessCoord(pipe=0, data=102, model=1): 205, ProcessCoord(pipe=0, data=103, model=0): 206, Process 0: Coord(pipe=0, data=103, model=1): 207, ProcessCoord(pipe=0, data=104, model=0): 208, ProcessCoord(pipe=0, data=104, model=1): 209, ProcessCoord(pipe=0, data=105, model=0): 210, ProcessCoord(pipe=0, data=105, model=1): 211, ProcessCoord(pipe=0, data=106, model=0): 212, ProcessCoord(pipe=0, data=106, model=1): 213, ProcessCoord(pipe=0, data=107, model=0): 214, ProcessCoord(pipe=0, data=107, model=1): 215, ProcessCoord(pipe=0, data=108, model=0): 216, ProcessCoord(pipe=0, data=108, model=1): 217, ProcessCoord(pipe=0, data=109, model=0): 218, ProcessCoord(pipe=0, data=109, model=1): 219, ProcessCoord(pipe=0, data=110, model=0): 220, ProcessCoord(pipe=0, data=110, model=1): 221, ProcessCoord(pipe=0, data=111, model=0): 222, ProcessCoord(pipe=0, data=111, model=1): 223, ProcessCoord(pipe=0, data=112, model=0): 224, ProcessCoord(pipe=0, data=112, model=1): 225, ProcessCoord(pipe=0, data=113, model=0): 226, ProcessCoord(pipe=0, data=113, model=1): 227, ProcessCoord(pipe=0, data=114, model=0): 228, ProcessCoord(pipe=0 0: , data=114, model=1): 229, ProcessCoord(pipe=0, data=115, model=0): 230, ProcessCoord(pipe=0, data=115, model=1): 231, ProcessCoord(pipe=0, data=116, model=0): 232, ProcessCoord(pipe=0, data=116, model=1): 233, ProcessCoord(pipe=0, data=117, model=0): 234, ProcessCoord(pipe=0, data=117, model=1): 235, ProcessCoord(pipe=0, data=118, model=0): 236, ProcessCoord(pipe=0, data=118, model=1): 237, ProcessCoord(pipe=0, data=119, model=0): 238, ProcessCoord(pipe=0, data=119, model=1): 239, ProcessCoord(pipe=0, data=120, model=0): 240, ProcessCoord(pipe=0, data=120, model=1): 241, ProcessCoord(pipe=0, data=121, model=0): 242, ProcessCoord(pipe=0, data=121, model=1): 243, ProcessCoord(pipe=0, data=122, model=0): 244, ProcessCoord(pipe=0, data=122, model=1): 245, ProcessCoord(pipe=0, data=123, model=0): 246, ProcessCoord(pipe=0, data=123, model=1): 247, ProcessCoord(pipe=0, data=124, model=0): 248, ProcessCoord(pipe=0, data=124, model=1): 249, ProcessCoord(pipe=0, data=125, model=0): 250, ProcessCoord(pipe=0, data=125, 0: model=1): 251, ProcessCoord(pipe=0, data=126, model=0): 252, ProcessCoord(pipe=0, data=126, model=1): 253, ProcessCoord(pipe=0, data=127, model=0): 254, ProcessCoord(pipe=0, data=127, model=1): 255, ProcessCoord(pipe=1, data=0, model=0): 256, ProcessCoord(pipe=1, data=0, model=1): 257, ProcessCoord(pipe=1, data=1, model=0): 258, ProcessCoord(pipe=1, data=1, model=1): 259, ProcessCoord(pipe=1, data=2, model=0): 260, ProcessCoord(pipe=1, data=2, model=1): 261, ProcessCoord(pipe=1, data=3, model=0): 262, ProcessCoord(pipe=1, data=3, model=1): 263, ProcessCoord(pipe=1, data=4, model=0): 264, ProcessCoord(pipe=1, data=4, model=1): 265, ProcessCoord(pipe=1, data=5, model=0): 266, ProcessCoord(pipe=1, data=5, model=1): 267, ProcessCoord(pipe=1, data=6, model=0): 268, ProcessCoord(pipe=1, data=6, model=1): 269, ProcessCoord(pipe=1, data=7, model=0): 270, ProcessCoord(pipe=1, data=7, model=1): 271, ProcessCoord(pipe=1, data=8, model=0): 272, ProcessCoord(pipe=1, data=8, model=1): 273, ProcessCoord(pipe=1, data=9, mode 0: l=0): 274, ProcessCoord(pipe=1, data=9, model=1): 275, ProcessCoord(pipe=1, data=10, model=0): 276, ProcessCoord(pipe=1, data=10, model=1): 277, ProcessCoord(pipe=1, data=11, model=0): 278, ProcessCoord(pipe=1, data=11, model=1): 279, ProcessCoord(pipe=1, data=12, model=0): 280, ProcessCoord(pipe=1, data=12, model=1): 281, ProcessCoord(pipe=1, data=13, model=0): 282, ProcessCoord(pipe=1, data=13, model=1): 283, ProcessCoord(pipe=1, data=14, model=0): 284, ProcessCoord(pipe=1, data=14, model=1): 285, ProcessCoord(pipe=1, data=15, model=0): 286, ProcessCoord(pipe=1, data=15, model=1): 287, ProcessCoord(pipe=1, data=16, model=0): 288, ProcessCoord(pipe=1, data=16, model=1): 289, ProcessCoord(pipe=1, data=17, model=0): 290, ProcessCoord(pipe=1, data=17, model=1): 291, ProcessCoord(pipe=1, data=18, model=0): 292, ProcessCoord(pipe=1, data=18, model=1): 293, ProcessCoord(pipe=1, data=19, model=0): 294, ProcessCoord(pipe=1, data=19, model=1): 295, ProcessCoord(pipe=1, data=20, model=0): 296, ProcessCoord(pipe=1, dat 0: a=20, model=1): 297, ProcessCoord(pipe=1, data=21, model=0): 298, ProcessCoord(pipe=1, data=21, model=1): 299, ProcessCoord(pipe=1, data=22, model=0): 300, ProcessCoord(pipe=1, data=22, model=1): 301, ProcessCoord(pipe=1, data=23, model=0): 302, ProcessCoord(pipe=1, data=23, model=1): 303, ProcessCoord(pipe=1, data=24, model=0): 304, ProcessCoord(pipe=1, data=24, model=1): 305, ProcessCoord(pipe=1, data=25, model=0): 306, ProcessCoord(pipe=1, data=25, model=1): 307, ProcessCoord(pipe=1, data=26, model=0): 308, ProcessCoord(pipe=1, data=26, model=1): 309, ProcessCoord(pipe=1, data=27, model=0): 310, ProcessCoord(pipe=1, data=27, model=1): 311, ProcessCoord(pipe=1, data=28, model=0): 312, ProcessCoord(pipe=1, data=28, model=1): 313, ProcessCoord(pipe=1, data=29, model=0): 314, ProcessCoord(pipe=1, data=29, model=1): 315, ProcessCoord(pipe=1, data=30, model=0): 316, ProcessCoord(pipe=1, data=30, model=1): 317, ProcessCoord(pipe=1, data=31, model=0): 318, ProcessCoord(pipe=1, data=31, model=1): 319, ProcessCoord( 0: pipe=1, data=32, model=0): 320, ProcessCoord(pipe=1, data=32, model=1): 321, ProcessCoord(pipe=1, data=33, model=0): 322, ProcessCoord(pipe=1, data=33, model=1): 323, ProcessCoord(pipe=1, data=34, model=0): 324, ProcessCoord(pipe=1, data=34, model=1): 325, ProcessCoord(pipe=1, data=35, model=0): 326, ProcessCoord(pipe=1, data=35, model=1): 327, ProcessCoord(pipe=1, data=36, model=0): 328, ProcessCoord(pipe=1, data=36, model=1): 329, ProcessCoord(pipe=1, data=37, model=0): 330, ProcessCoord(pipe=1, data=37, model=1): 331, ProcessCoord(pipe=1, data=38, model=0): 332, ProcessCoord(pipe=1, data=38, model=1): 333, ProcessCoord(pipe=1, data=39, model=0): 334, ProcessCoord(pipe=1, data=39, model=1): 335, ProcessCoord(pipe=1, data=40, model=0): 336, ProcessCoord(pipe=1, data=40, model=1): 337, ProcessCoord(pipe=1, data=41, model=0): 338, ProcessCoord(pipe=1, data=41, model=1): 339, ProcessCoord(pipe=1, data=42, model=0): 340, ProcessCoord(pipe=1, data=42, model=1): 341, ProcessCoord(pipe=1, data=43, model=0): 342, Pr 0: ocessCoord(pipe=1, data=43, model=1): 343, ProcessCoord(pipe=1, data=44, model=0): 344, ProcessCoord(pipe=1, data=44, model=1): 345, ProcessCoord(pipe=1, data=45, model=0): 346, ProcessCoord(pipe=1, data=45, model=1): 347, ProcessCoord(pipe=1, data=46, model=0): 348, ProcessCoord(pipe=1, data=46, model=1): 349, ProcessCoord(pipe=1, data=47, model=0): 350, ProcessCoord(pipe=1, data=47, model=1): 351, ProcessCoord(pipe=1, data=48, model=0): 352, ProcessCoord(pipe=1, data=48, model=1): 353, ProcessCoord(pipe=1, data=49, model=0): 354, ProcessCoord(pipe=1, data=49, model=1): 355, ProcessCoord(pipe=1, data=50, model=0): 356, ProcessCoord(pipe=1, data=50, model=1): 357, ProcessCoord(pipe=1, data=51, model=0): 358, ProcessCoord(pipe=1, data=51, model=1): 359, ProcessCoord(pipe=1, data=52, model=0): 360, ProcessCoord(pipe=1, data=52, model=1): 361, ProcessCoord(pipe=1, data=53, model=0): 362, ProcessCoord(pipe=1, data=53, model=1): 363, ProcessCoord(pipe=1, data=54, model=0): 364, ProcessCoord(pipe=1, data=54, model= 0: 1): 365, ProcessCoord(pipe=1, data=55, model=0): 366, ProcessCoord(pipe=1, data=55, model=1): 367, ProcessCoord(pipe=1, data=56, model=0): 368, ProcessCoord(pipe=1, data=56, model=1): 369, ProcessCoord(pipe=1, data=57, model=0): 370, ProcessCoord(pipe=1, data=57, model=1): 371, ProcessCoord(pipe=1, data=58, model=0): 372, ProcessCoord(pipe=1, data=58, model=1): 373, ProcessCoord(pipe=1, data=59, model=0): 374, ProcessCoord(pipe=1, data=59, model=1): 375, ProcessCoord(pipe=1, data=60, model=0): 376, ProcessCoord(pipe=1, data=60, model=1): 377, ProcessCoord(pipe=1, data=61, model=0): 378, ProcessCoord(pipe=1, data=61, model=1): 379, ProcessCoord(pipe=1, data=62, model=0): 380, ProcessCoord(pipe=1, data=62, model=1): 381, ProcessCoord(pipe=1, data=63, model=0): 382, ProcessCoord(pipe=1, data=63, model=1): 383, ProcessCoord(pipe=1, data=64, model=0): 384, ProcessCoord(pipe=1, data=64, model=1): 385, ProcessCoord(pipe=1, data=65, model=0): 386, ProcessCoord(pipe=1, data=65, model=1): 387, ProcessCoord(pipe=1, data 0: =66, model=0): 388, ProcessCoord(pipe=1, data=66, model=1): 389, ProcessCoord(pipe=1, data=67, model=0): 390, ProcessCoord(pipe=1, data=67, model=1): 391, ProcessCoord(pipe=1, data=68, model=0): 392, ProcessCoord(pipe=1, data=68, model=1): 393, ProcessCoord(pipe=1, data=69, model=0): 394, ProcessCoord(pipe=1, data=69, model=1): 395, ProcessCoord(pipe=1, data=70, model=0): 396, ProcessCoord(pipe=1, data=70, model=1): 397, ProcessCoord(pipe=1, data=71, model=0): 398, ProcessCoord(pipe=1, data=71, model=1): 399, ProcessCoord(pipe=1, data=72, model=0): 400, ProcessCoord(pipe=1, data=72, model=1): 401, ProcessCoord(pipe=1, data=73, model=0): 402, ProcessCoord(pipe=1, data=73, model=1): 403, ProcessCoord(pipe=1, data=74, model=0): 404, ProcessCoord(pipe=1, data=74, model=1): 405, ProcessCoord(pipe=1, data=75, model=0): 406, ProcessCoord(pipe=1, data=75, model=1): 407, ProcessCoord(pipe=1, data=76, model=0): 408, ProcessCoord(pipe=1, data=76, model=1): 409, ProcessCoord(pipe=1, data=77, model=0): 410, ProcessCoord(p 0: ipe=1, data=77, model=1): 411, ProcessCoord(pipe=1, data=78, model=0): 412, ProcessCoord(pipe=1, data=78, model=1): 413, ProcessCoord(pipe=1, data=79, model=0): 414, ProcessCoord(pipe=1, data=79, model=1): 415, ProcessCoord(pipe=1, data=80, model=0): 416, ProcessCoord(pipe=1, data=80, model=1): 417, ProcessCoord(pipe=1, data=81, model=0): 418, ProcessCoord(pipe=1, data=81, model=1): 419, ProcessCoord(pipe=1, data=82, model=0): 420, ProcessCoord(pipe=1, data=82, model=1): 421, ProcessCoord(pipe=1, data=83, model=0): 422, ProcessCoord(pipe=1, data=83, model=1): 423, ProcessCoord(pipe=1, data=84, model=0): 424, ProcessCoord(pipe=1, data=84, model=1): 425, ProcessCoord(pipe=1, data=85, model=0): 426, ProcessCoord(pipe=1, data=85, model=1): 427, ProcessCoord(pipe=1, data=86, model=0): 428, ProcessCoord(pipe=1, data=86, model=1): 429, ProcessCoord(pipe=1, data=87, model=0): 430, ProcessCoord(pipe=1, data=87, model=1): 431, ProcessCoord(pipe=1, data=88, model=0): 432, ProcessCoord(pipe=1, data=88, model=1): 433, Pro 0: cessCoord(pipe=1, data=89, model=0): 434, ProcessCoord(pipe=1, data=89, model=1): 435, ProcessCoord(pipe=1, data=90, model=0): 436, ProcessCoord(pipe=1, data=90, model=1): 437, ProcessCoord(pipe=1, data=91, model=0): 438, ProcessCoord(pipe=1, data=91, model=1): 439, ProcessCoord(pipe=1, data=92, model=0): 440, ProcessCoord(pipe=1, data=92, model=1): 441, ProcessCoord(pipe=1, data=93, model=0): 442, ProcessCoord(pipe=1, data=93, model=1): 443, ProcessCoord(pipe=1, data=94, model=0): 444, ProcessCoord(pipe=1, data=94, model=1): 445, ProcessCoord(pipe=1, data=95, model=0): 446, ProcessCoord(pipe=1, data=95, model=1): 447, ProcessCoord(pipe=1, data=96, model=0): 448, ProcessCoord(pipe=1, data=96, model=1): 449, ProcessCoord(pipe=1, data=97, model=0): 450, ProcessCoord(pipe=1, data=97, model=1): 451, ProcessCoord(pipe=1, data=98, model=0): 452, ProcessCoord(pipe=1, data=98, model=1): 453, ProcessCoord(pipe=1, data=99, model=0): 454, ProcessCoord(pipe=1, data=99, model=1): 455, ProcessCoord(pipe=1, data=100, model= 0: 0): 456, ProcessCoord(pipe=1, data=100, model=1): 457, ProcessCoord(pipe=1, data=101, model=0): 458, ProcessCoord(pipe=1, data=101, model=1): 459, ProcessCoord(pipe=1, data=102, model=0): 460, ProcessCoord(pipe=1, data=102, model=1): 461, ProcessCoord(pipe=1, data=103, model=0): 462, ProcessCoord(pipe=1, data=103, model=1): 463, ProcessCoord(pipe=1, data=104, model=0): 464, ProcessCoord(pipe=1, data=104, model=1): 465, ProcessCoord(pipe=1, data=105, model=0): 466, ProcessCoord(pipe=1, data=105, model=1): 467, ProcessCoord(pipe=1, data=106, model=0): 468, ProcessCoord(pipe=1, data=106, model=1): 469, ProcessCoord(pipe=1, data=107, model=0): 470, ProcessCoord(pipe=1, data=107, model=1): 471, ProcessCoord(pipe=1, data=108, model=0): 472, ProcessCoord(pipe=1, data=108, model=1): 473, ProcessCoord(pipe=1, data=109, model=0): 474, ProcessCoord(pipe=1, data=109, model=1): 475, ProcessCoord(pipe=1, data=110, model=0): 476, ProcessCoord(pipe=1, data=110, model=1): 477, ProcessCoord(pipe=1, data=111, model=0): 478, Pro 0: cessCoord(pipe=1, data=111, model=1): 479, ProcessCoord(pipe=1, data=112, model=0): 480, ProcessCoord(pipe=1, data=112, model=1): 481, ProcessCoord(pipe=1, data=113, model=0): 482, ProcessCoord(pipe=1, data=113, model=1): 483, ProcessCoord(pipe=1, data=114, model=0): 484, ProcessCoord(pipe=1, data=114, model=1): 485, ProcessCoord(pipe=1, data=115, model=0): 486, ProcessCoord(pipe=1, data=115, model=1): 487, ProcessCoord(pipe=1, data=116, model=0): 488, ProcessCoord(pipe=1, data=116, model=1): 489, ProcessCoord(pipe=1, data=117, model=0): 490, ProcessCoord(pipe=1, data=117, model=1): 491, ProcessCoord(pipe=1, data=118, model=0): 492, ProcessCoord(pipe=1, data=118, model=1): 493, ProcessCoord(pipe=1, data=119, model=0): 494, ProcessCoord(pipe=1, data=119, model=1): 495, ProcessCoord(pipe=1, data=120, model=0): 496, ProcessCoord(pipe=1, data=120, model=1): 497, ProcessCoord(pipe=1, data=121, model=0): 498, ProcessCoord(pipe=1, data=121, model=1): 499, ProcessCoord(pipe=1, data=122, model=0): 500, ProcessCoord(pi 0: pe=1, data=122, model=1): 501, ProcessCoord(pipe=1, data=123, model=0): 502, ProcessCoord(pipe=1, data=123, model=1): 503, ProcessCoord(pipe=1, data=124, model=0): 504, ProcessCoord(pipe=1, data=124, model=1): 505, ProcessCoord(pipe=1, data=125, model=0): 506, ProcessCoord(pipe=1, data=125, model=1): 507, ProcessCoord(pipe=1, data=126, model=0): 508, ProcessCoord(pipe=1, data=126, model=1): 509, ProcessCoord(pipe=1, data=127, model=0): 510, ProcessCoord(pipe=1, data=127, model=1): 511} 0: [2022-11-25 09:40:32,073] [INFO] [module.py:366:_partition_layers] Partitioning pipeline stages with method type:transformer 0: stage=0 layers=24 0: 0: _to_float16 0: 1: EmbeddingPipe 0: 2: 0: 3: ParallelTransformerLayerPipe 0: 4: ParallelTransformerLayerPipe 0: 5: ParallelTransformerLayerPipe 0: 6: ParallelTransformerLayerPipe 0: 7: ParallelTransformerLayerPipe 0: 8: ParallelTransformerLayerPipe 0: 9: ParallelTransformerLayerPipe 0: 10: ParallelTransformerLayerPipe 0: 11: ParallelTransformerLayerPipe 0: 12: ParallelTransformerLayerPipe 0: 13: ParallelTransformerLayerPipe 0: 14: ParallelTransformerLayerPipe 0: 15: ParallelTransformerLayerPipe 0: 16: ParallelTransformerLayerPipe 0: 17: ParallelTransformerLayerPipe 0: 18: ParallelTransformerLayerPipe 0: 19: ParallelTransformerLayerPipe 0: 20: ParallelTransformerLayerPipe 0: 21: ParallelTransformerLayerPipe 0: 22: ParallelTransformerLayerPipe 0: 23: ParallelTransformerLayerPipe 0: stage=1 layers=25 0: 24: ParallelTransformerLayerPipe 0: 25: ParallelTransformerLayerPipe 0: 26: ParallelTransformerLayerPipe 0: 27: ParallelTransformerLayerPipe 0: 28: ParallelTransformerLayerPipe 0: 29: ParallelTransformerLayerPipe 0: 30: ParallelTransformerLayerPipe 0: 31: ParallelTransformerLayerPipe 0: 32: ParallelTransformerLayerPipe 0: 33: ParallelTransformerLayerPipe 0: 34: ParallelTransformerLayerPipe 0: 35: ParallelTransformerLayerPipe 0: 36: ParallelTransformerLayerPipe 0: 37: ParallelTransformerLayerPipe 0: 38: ParallelTransformerLayerPipe 0: 39: ParallelTransformerLayerPipe 0: 40: ParallelTransformerLayerPipe 0: 41: ParallelTransformerLayerPipe 0: 42: ParallelTransformerLayerPipe 0: 43: ParallelTransformerLayerPipe 0: 44: ParallelTransformerLayerPipe 0: 45: undo 0: 46: MixedFusedLayerNorm 0: 47: EmbeddingPipe 0: 48: float16_to_fp32 0: loss: CrossEntropy 0: [2022-11-25 09:40:37,878] [INFO] [utils.py:827:see_memory_usage] After Building Model 0: [2022-11-25 09:40:37,879] [INFO] [utils.py:828:see_memory_usage] MA 4.16 GB Max_MA 4.16 GB CA 4.17 GB Max_CA 4 GB 0: [2022-11-25 09:40:37,879] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 44.37 GB, percent = 8.8% 0: setting training iterations to 5494 0: > learning rate decay style: cosine 0: DeepSpeed is enabled. 0: [2022-11-25 09:40:37,881] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.7.5, git-hash=unknown, git-branch=unknown 40: ninja: no work to do. 11: Time to load utils op: 0.003289461135864258 seconds 40: Time to load utils op: 0.3481590747833252 seconds 40: Time to load utils op: 0.0005748271942138672 seconds 20: ninja: no work to do. 47: Time to load utils op: 0.2024707794189453 seconds 20: Time to load utils op: 0.17470908164978027 seconds 50: Time to load utils op: 0.30288028717041016 seconds 41: Time to load utils op: 0.302654504776001 seconds 49: Time to load utils op: 0.3044393062591553 seconds 53: Time to load utils op: 0.30236363410949707 seconds 51: Time to load utils op: 0.30266547203063965 seconds 50: Time to load utils op: 0.20204901695251465 seconds 51: Time to load utils op: 0.30274367332458496 seconds 48: Time to load utils op: 0.2028357982635498 secondsTime to load utils op: 0.20318913459777832 seconds 48: 51: Time to load utils op: 0.3034372329711914 seconds 49: Time to load utils op: 0.3043479919433594 seconds 53: Time to load utils op: 0.30209803581237793 seconds 48: Time to load utils op: 0.20255565643310547 seconds 49: Time to load utils op: 0.2021031379699707 secondsTime to load utils op: 0.20220065116882324 seconds 49: 48: Time to load utils op: 0.20266485214233398 seconds 50: Time to load utils op: 0.2024827003479004 seconds 49: Time to load utils op: 0.20230627059936523 seconds 50: Time to load utils op: 0.20212268829345703 seconds 53: Time to load utils op: 0.30218935012817383 seconds 49: Time to load utils op: 0.30452513694763184 seconds 50: Time to load utils op: 0.20168256759643555 seconds 49: Time to load utils op: 0.20274138450622559 seconds 49: Time to load utils op: 0.30489420890808105 seconds 52: Time to load utils op: 0.30364251136779785 seconds 52: Time to load utils op: 0.30367183685302734 seconds 51: Time to load utils op: 0.20232772827148438 secondsTime to load utils op: 0.20232486724853516 seconds 51: 52: Time to load utils op: 0.30293822288513184 secondsTime to load utils op: 0.3034684658050537 seconds 52: 51: Time to load utils op: 0.2018430233001709 seconds 51: Time to load utils op: 0.2020714282989502 seconds 54: Time to load utils op: 0.3030710220336914 seconds 54: Time to load utils op: 0.3026101589202881 seconds 52: Time to load utils op: 0.2017676830291748 seconds 54: Time to load utils op: 0.30256104469299316 seconds 52: Time to load utils op: 0.20196294784545898 seconds 52: Time to load utils op: 0.20176124572753906 seconds 54: Time to load utils op: 0.20198297500610352 seconds 52: Time to load utils op: 0.20262718200683594 seconds 54: Time to load utils op: 0.201704740524292 seconds 54: Time to load utils op: 0.20185017585754395 seconds 53: Time to load utils op: 0.20264172554016113 seconds 56: Time to load utils op: 0.3031470775604248 secondsTime to load utils op: 0.30306363105773926 seconds 56: 56: Time to load utils op: 0.30327773094177246 seconds 53: Time to load utils op: 0.202836275100708 seconds 53: Time to load utils op: 0.20283794403076172 seconds 58: Time to load utils op: 0.30245399475097656 secondsTime to load utils op: 0.3025646209716797 secondsTime to load utils op: 0.3024272918701172 seconds 58: 58: 57: Time to load utils op: 0.3028419017791748 seconds 53: Time to load utils op: 0.2033998966217041 seconds 57: Time to load utils op: 0.30263519287109375 seconds 57: Time to load utils op: 0.3032870292663574 seconds 56: Time to load utils op: 0.202012300491333 seconds 56: Time to load utils op: 0.20218253135681152 seconds 55: Time to load utils op: 0.30499792098999023 secondsTime to load utils op: 0.3046681880950928 secondsTime to load utils op: 0.20268797874450684 seconds 55: 55: 59: Time to load utils op: 0.30272579193115234 secondsTime to load utils op: 0.3028268814086914 seconds 59: 57: Time to load utils op: 0.20195245742797852 seconds 56: Time to load utils op: 0.20224881172180176 seconds 55: Time to load utils op: 0.20294404029846191 seconds 55: Time to load utils op: 0.2027876377105713 seconds 55: Time to load utils op: 0.30524492263793945 seconds 56: Time to load utils op: 0.20246458053588867 seconds 55: Time to load utils op: 0.30367040634155273 seconds 59: Time to load utils op: 0.20192408561706543 seconds 59: Time to load utils op: 0.30301427841186523 seconds 57: Time to load utils op: 0.20238852500915527 seconds 58: Time to load utils op: 0.2020277976989746 seconds 60: Time to load utils op: 0.3029639720916748 seconds 58: Time to load utils op: 0.20244288444519043 seconds 57: Time to load utils op: 0.20261263847351074 seconds 55: Time to load utils op: 0.20357656478881836 seconds 58: Time to load utils op: 0.20207619667053223 seconds 58: Time to load utils op: 0.20231223106384277 seconds 59: Time to load utils op: 0.20178937911987305 seconds 59: Time to load utils op: 0.2021636962890625 seconds 57: Time to load utils op: 0.20314908027648926 seconds 59: Time to load utils op: 0.20225238800048828 seconds 60: Time to load utils op: 0.303189754486084 seconds 60: Time to load utils op: 0.3027000427246094 seconds 60: Time to load utils op: 0.20259571075439453 seconds 60: Time to load utils op: 0.20239996910095215 seconds 61: Time to load utils op: 0.2020869255065918 secondsTime to load utils op: 0.30380797386169434 seconds 61: 63: Time to load utils op: 0.3032419681549072 seconds 61: Time to load utils op: 0.20267891883850098 seconds 60: Time to load utils op: 0.20291399955749512 seconds 63: Time to load utils op: 0.30297422409057617 seconds 61: Time to load utils op: 0.3045971393585205 seconds 60: Time to load utils op: 0.2026059627532959 seconds 62: Time to load utils op: 0.3037993907928467 seconds 63: Time to load utils op: 0.303600549697876 seconds 62: Time to load utils op: 0.3033623695373535 seconds 63: Time to load utils op: 0.2017204761505127 seconds 61: Time to load utils op: 0.3042876720428467 seconds 61: Time to load utils op: 0.20253896713256836 seconds 63: Time to load utils op: 0.2021629810333252 seconds 61: Time to load utils op: 0.20252203941345215 seconds 62: Time to load utils op: 0.2023146152496338 secondsTime to load utils op: 0.20227813720703125 seconds 62: 62: Time to load utils op: 0.20196270942687988 seconds 61: Time to load utils op: 0.30492711067199707 seconds 63: Time to load utils op: 0.20213985443115234 seconds 62: Time to load utils op: 0.3038349151611328 seconds 62: Time to load utils op: 0.3040621280670166 seconds 62: Time to load utils op: 0.20252418518066406 seconds 63: Time to load utils op: 0.20209407806396484 seconds 0: Time to load utils op: 0.10194873809814453 secondsTime to load utils op: 0.10187220573425293 seconds 0: 1: Time to load utils op: 0.10245656967163086 seconds 1: Time to load utils op: 0.10224390029907227 seconds 1: Time to load utils op: 0.1023554801940918 seconds 2: Time to load utils op: 0.1023099422454834 seconds 2: Time to load utils op: 0.1028280258178711 seconds 2: Time to load utils op: 0.1024484634399414 seconds 2: Time to load utils op: 0.10258102416992188 seconds 4: Time to load utils op: 0.10306859016418457 seconds 4: Time to load utils op: 0.10279345512390137 seconds 3: Time to load utils op: 0.10350775718688965 seconds 3: Time to load utils op: 0.10355973243713379 seconds 4: Time to load utils op: 0.1026914119720459 seconds 3: Time to load utils op: 0.10384440422058105 seconds 4: Time to load utils op: 0.10312390327453613 seconds 3: Time to load utils op: 0.10335969924926758 seconds 7: Time to load utils op: 0.10222220420837402 seconds 6: Time to load utils op: 0.1022336483001709 secondsTime to load utils op: 0.1022195816040039 seconds 6: 6: Time to load utils op: 0.10291790962219238 seconds 7: Time to load utils op: 0.10209369659423828 seconds 7: Time to load utils op: 0.10224223136901855 seconds 6: Time to load utils op: 0.10244941711425781 seconds 5: Time to load utils op: 0.10370850563049316 secondsTime to load utils op: 0.10362720489501953 seconds 5: 5: Time to load utils op: 0.10328435897827148 seconds 5: Time to load utils op: 0.10337400436401367 seconds 2: Time to load utils op: 0.20321321487426758 seconds 8: Time to load utils op: 0.10265088081359863 seconds 8: Time to load utils op: 0.10256505012512207 secondsTime to load utils op: 0.10276126861572266 seconds 8: 41: Time to load utils op: 0.00046706199645996094 seconds 53: Time to load utils op: 0.0004980564117431641 seconds 10: Time to load utils op: 0.10329604148864746 seconds 10: Time to load utils op: 0.10326337814331055 seconds 48: Time to load utils op: 0.0004918575286865234 seconds 47: Time to load utils op: 0.00043010711669921875 seconds 10: Time to load utils op: 0.10353708267211914 seconds 10: Time to load utils op: 0.10312366485595703 seconds 9: Time to load utils op: 0.10390806198120117 secondsTime to load utils op: 0.1039731502532959 seconds 9: 9: Time to load utils op: 0.1041862964630127 seconds 53: Time to load utils op: 0.0003452301025390625 seconds 50: Time to load utils op: 0.0004360675811767578 seconds 53: Time to load utils op: 0.0003921985626220703 seconds 9: Time to load utils op: 0.10430288314819336 seconds 48: Time to load utils op: 0.00034499168395996094 seconds 13: Time to load utils op: 0.1025857925415039 seconds 48: Time to load utils op: 0.00038886070251464844 seconds 48: Time to load utils op: 0.0003733634948730469 seconds 13: Time to load utils op: 0.10250973701477051 seconds 11: Time to load utils op: 0.10352444648742676 secondsTime to load utils op: 0.10355424880981445 seconds 11: 50: Time to load utils op: 0.00038504600524902344 seconds 13: Time to load utils op: 0.10239458084106445 seconds 53: Time to load utils op: 0.0003542900085449219 seconds 11: Time to load utils op: 0.10383296012878418 seconds 51: Time to load utils op: 0.0005366802215576172 seconds 50: Time to load utils op: 0.00035500526428222656 seconds 12: Time to load utils op: 0.10331034660339355 seconds 51: Time to load utils op: 0.0005078315734863281 secondsTime to load utils op: 0.0004856586456298828 seconds 51: 12: Time to load utils op: 0.10339617729187012 seconds 11: Time to load utils op: 0.10407447814941406 seconds 12: Time to load utils op: 0.10350370407104492 seconds 12: Time to load utils op: 0.10376191139221191 seconds 51: Time to load utils op: 0.0003643035888671875 seconds 14: Time to load utils op: 0.10319685935974121 secondsTime to load utils op: 0.10271167755126953 seconds 14: 10: Time to load utils op: 0.2023165225982666 seconds 50: Time to load utils op: 0.0004088878631591797 seconds 51: Time to load utils op: 0.0004146099090576172 seconds 50: Time to load utils op: 0.00036334991455078125 seconds 14: Time to load utils op: 0.10282468795776367 seconds 14: Time to load utils op: 0.10286903381347656 seconds 54: Time to load utils op: 0.0004863739013671875 secondsTime to load utils op: 0.0004742145538330078 seconds 54: 51: Time to load utils op: 0.0004000663757324219 seconds 53: Time to load utils op: 0.0003383159637451172 seconds 51: Time to load utils op: 0.0003523826599121094 seconds 54: Time to load utils op: 0.0003407001495361328 seconds 52: Time to load utils op: 0.0005290508270263672 seconds 54: Time to load utils op: 0.00036787986755371094 secondsTime to load utils op: 0.0003559589385986328 seconds 54: 53: Time to load utils op: 0.0003802776336669922 seconds 15: Time to load utils op: 0.1031789779663086 seconds 49: Time to load utils op: 0.0004379749298095703 seconds 15: Time to load utils op: 0.10323739051818848 seconds 53: Time to load utils op: 0.00033283233642578125 seconds 52: Time to load utils op: 0.0003986358642578125 seconds 54: Time to load utils op: 0.0003638267517089844 seconds 49: Time to load utils op: 0.0005271434783935547 seconds 15: Time to load utils op: 0.1029047966003418 secondsTime to load utils op: 0.10290408134460449 seconds 15: 52: Time to load utils op: 0.00048089027404785156 seconds 17: Time to load utils op: 0.1025094985961914 seconds 49: Time to load utils op: 0.0004258155822753906 secondsTime to load utils op: 0.0005371570587158203 secondsTime to load utils op: 0.00042128562927246094 seconds 49: 49: 18: Time to load utils op: 0.10235023498535156 seconds 17: Time to load utils op: 0.10251712799072266 seconds 17: Time to load utils op: 0.1024174690246582 seconds 18: Time to load utils op: 0.10198354721069336 secondsTime to load utils op: 0.10198116302490234 seconds 18: 58: Time to load utils op: 0.0004608631134033203 seconds 52: Time to load utils op: 0.0004048347473144531 seconds 49: Time to load utils op: 0.00057220458984375 seconds 49: Time to load utils op: 0.000408172607421875 seconds 58: Time to load utils op: 0.0003399848937988281 seconds 49: Time to load utils op: 0.0004253387451171875 seconds 52: Time to load utils op: 0.00037980079650878906 seconds 16: Time to load utils op: 0.10367488861083984 seconds 52: Time to load utils op: 0.0003769397735595703 seconds 16: Time to load utils op: 0.10373449325561523 seconds 57: Time to load utils op: 0.0004680156707763672 seconds 19: Time to load utils op: 0.10170793533325195 seconds 16: Time to load utils op: 0.10377979278564453 seconds 52: Time to load utils op: 0.00037741661071777344 seconds 16: Time to load utils op: 0.10364556312561035 seconds 52: Time to load utils op: 0.0003933906555175781 seconds 19: Time to load utils op: 0.1021881103515625 seconds 20: Time to load utils op: 0.1016688346862793 seconds 19: Time to load utils op: 0.10213446617126465 seconds 57: Time to load utils op: 0.0003917217254638672 seconds 20: Time to load utils op: 0.10202836990356445 seconds 20: Time to load utils op: 0.10233783721923828 seconds 58: Time to load utils op: 0.0003769397735595703 secondsTime to load utils op: 0.00040841102600097656 seconds 58: 56: Time to load utils op: 0.00035262107849121094 seconds 58: Time to load utils op: 0.00034618377685546875 seconds 56: Time to load utils op: 0.0005195140838623047 seconds 20: Time to load utils op: 0.10226726531982422 seconds 57: Time to load utils op: 0.00035858154296875 seconds 59: Time to load utils op: 0.00046896934509277344 seconds 56: Time to load utils op: 0.0005440711975097656 seconds 21: Time to load utils op: 0.10188961029052734 seconds 21: Time to load utils op: 0.10216856002807617 seconds 21: Time to load utils op: 0.1018514633178711 seconds 55: Time to load utils op: 0.0005598068237304688 seconds 59: Time to load utils op: 0.0005254745483398438 seconds 56: Time to load utils op: 0.0003123283386230469 seconds 21: Time to load utils op: 0.10192656517028809 seconds 57: Time to load utils op: 0.00037789344787597656 seconds 59: Time to load utils op: 0.0003459453582763672 seconds 58: Time to load utils op: 0.0003924369812011719 seconds 56: Time to load utils op: 0.0003185272216796875 seconds 22: Time to load utils op: 0.1022484302520752 seconds 57: Time to load utils op: 0.00039768218994140625 seconds 57: Time to load utils op: 0.0004382133483886719 seconds 59: Time to load utils op: 0.0003833770751953125 seconds 60: Time to load utils op: 0.00043487548828125 seconds 22: Time to load utils op: 0.10212278366088867 seconds 58: Time to load utils op: 0.0003345012664794922 seconds 22: Time to load utils op: 0.10214972496032715 seconds 57: Time to load utils op: 0.0003514289855957031 seconds 59: Time to load utils op: 0.00032329559326171875 seconds 56: Time to load utils op: 0.0003619194030761719 seconds 56: Time to load utils op: 0.00037670135498046875 seconds 60: Time to load utils op: 0.0003795623779296875 secondsTime to load utils op: 0.0004074573516845703 seconds 60: 55: Time to load utils op: 0.0003674030303955078 seconds 60: Time to load utils op: 0.00036334991455078125 seconds 59: Time to load utils op: 0.0003619194030761719 seconds 24: Time to load utils op: 0.10203027725219727 seconds 23: Time to load utils op: 0.10251665115356445 seconds 59: Time to load utils op: 0.0003669261932373047 seconds 60: Time to load utils op: 0.0003795623779296875 seconds 24: Time to load utils op: 0.10162568092346191 seconds 55: Time to load utils op: 0.00036978721618652344 seconds 23: Time to load utils op: 0.10226893424987793 seconds 55: Time to load utils op: 0.0003821849822998047 seconds 23: Time to load utils op: 0.10194063186645508 seconds 60: Time to load utils op: 0.0003247261047363281 seconds 24: Time to load utils op: 0.10174417495727539 seconds 60: Time to load utils op: 0.00040435791015625 seconds 24: Time to load utils op: 0.10167765617370605 seconds 55: Time to load utils op: 0.0003705024719238281 seconds 55: Time to load utils op: 0.0004546642303466797 secondsTime to load utils op: 0.0004432201385498047 seconds 55: 55: Time to load utils op: 0.00040221214294433594 seconds 63: Time to load utils op: 0.00045013427734375 seconds 63: Time to load utils op: 0.00039577484130859375 seconds 63: Time to load utils op: 0.0003330707550048828 seconds 62: Time to load utils op: 0.0004634857177734375 seconds 61: Time to load utils op: 0.0005419254302978516 seconds 21: Time to load utils op: 0.2025461196899414 seconds 61: Time to load utils op: 0.00045943260192871094 seconds 26: Time to load utils op: 0.10286283493041992 seconds 63: Time to load utils op: 0.000354766845703125 seconds 61: Time to load utils op: 0.0005066394805908203 seconds 26: Time to load utils op: 0.10297703742980957 seconds 63: Time to load utils op: 0.0003504753112792969 seconds 63: Time to load utils op: 0.0003848075866699219 seconds 25: Time to load utils op: 0.1036677360534668 seconds 26: Time to load utils op: 0.1026160717010498 seconds 26: Time to load utils op: 0.10279059410095215 seconds 62: Time to load utils op: 0.0003666877746582031 seconds 25: Time to load utils op: 0.10399866104125977 seconds 63: Time to load utils op: 0.0003688335418701172 seconds 25: Time to load utils op: 0.10406255722045898 secondsTime to load utils op: 0.10402035713195801 seconds 25: 62: Time to load utils op: 0.0004711151123046875 seconds 61: Time to load utils op: 0.0004279613494873047 seconds 61: Time to load utils op: 0.0004353523254394531 seconds 62: Time to load utils op: 0.0003542900085449219 seconds 61: Time to load utils op: 0.00041174888610839844 seconds 61: Time to load utils op: 0.0003998279571533203 seconds 61: Time to load utils op: 0.0004303455352783203 seconds 62: Time to load utils op: 0.00038814544677734375 seconds 27: Time to load utils op: 0.10332226753234863 seconds 27: Time to load utils op: 0.1035161018371582 seconds 62: Time to load utils op: 0.0004146099090576172 seconds 27: Time to load utils op: 0.10378742218017578 secondsTime to load utils op: 0.10352873802185059 seconds 27: 30: Time to load utils op: 0.1018838882446289 secondsTime to load utils op: 0.10251522064208984 seconds 30: 62: Time to load utils op: 0.00039577484130859375 seconds 30: Time to load utils op: 0.10199379920959473 seconds 62: Time to load utils op: 0.00042891502380371094 seconds 28: Time to load utils op: 0.1033334732055664 seconds 30: Time to load utils op: 0.10225176811218262 seconds 28: Time to load utils op: 0.10337519645690918 seconds 31: Time to load utils op: 0.10192704200744629 seconds 28: Time to load utils op: 0.1033470630645752 seconds 28: Time to load utils op: 0.10362744331359863 seconds 24: Time to load utils op: 0.20236587524414062 seconds 31: Time to load utils op: 0.10219192504882812 seconds 31: Time to load utils op: 0.10223984718322754 seconds 29: Time to load utils op: 0.1032249927520752 seconds 29: Time to load utils op: 0.10356760025024414 seconds 29: Time to load utils op: 0.10372447967529297 secondsTime to load utils op: 0.10382795333862305 seconds 29: 0: Time to load utils op: 0.00046706199645996094 seconds 0: Time to load utils op: 0.00046515464782714844 seconds 1: Time to load utils op: 0.0004177093505859375 seconds 1: Time to load utils op: 0.00032401084899902344 seconds 1: Time to load utils op: 0.0003216266632080078 seconds 33: Time to load utils op: 0.4037971496582031 seconds 2: Time to load utils op: 0.0004394054412841797 seconds 2: Time to load utils op: 0.00043702125549316406 seconds 2: Time to load utils op: 0.00037741661071777344 seconds 37: Time to load utils op: 0.40297627449035645 seconds 36: Time to load utils op: 0.4037749767303467 seconds 3: Time to load utils op: 0.000431060791015625 seconds 7: Time to load utils op: 0.0003509521484375 secondsTime to load utils op: 0.00045680999755859375 seconds 7: 38: Time to load utils op: 0.40319061279296875 seconds 2: Time to load utils op: 0.00035381317138671875 seconds 4: Time to load utils op: 0.000453948974609375 seconds 3: Time to load utils op: 0.0003173351287841797 seconds 7: Time to load utils op: 0.00031948089599609375 seconds 3: Time to load utils op: 0.000347137451171875 seconds 4: Time to load utils op: 0.000331878662109375 seconds 6: Time to load utils op: 0.0004324913024902344 seconds 4: Time to load utils op: 0.00033855438232421875 seconds 4: Time to load utils op: 0.0003445148468017578 seconds 6: Time to load utils op: 0.0003559589385986328 seconds 3: Time to load utils op: 0.00033593177795410156 seconds 6: Time to load utils op: 0.00033783912658691406 seconds 5: Time to load utils op: 0.00044989585876464844 seconds 5: Time to load utils op: 0.00033211708068847656 seconds 6: Time to load utils op: 0.0003364086151123047 seconds 5: Time to load utils op: 0.0003409385681152344 seconds 41: Time to load utils op: 0.4029521942138672 seconds 40: Time to load utils op: 0.4031844139099121 seconds 5: Time to load utils op: 0.0003230571746826172 seconds 8: Time to load utils op: 0.00047516822814941406 seconds 8: Time to load utils op: 0.0004930496215820312 seconds 8: Time to load utils op: 0.0003421306610107422 seconds 43: Time to load utils op: 0.40363025665283203 seconds 11: Time to load utils op: 0.0004754066467285156 seconds 11: Time to load utils op: 0.000331878662109375 seconds 11: Time to load utils op: 0.0003197193145751953 seconds 10: Time to load utils op: 0.00033974647521972656 secondsTime to load utils op: 0.0004963874816894531 seconds 10: 10: Time to load utils op: 0.0003764629364013672 seconds 10: Time to load utils op: 0.00032711029052734375 seconds 11: Time to load utils op: 0.00030612945556640625 seconds 45: Time to load utils op: 0.40342140197753906 seconds 13: Time to load utils op: 0.00044918060302734375 seconds 13: Time to load utils op: 0.00046944618225097656 seconds 9: Time to load utils op: 0.0004532337188720703 seconds 13: Time to load utils op: 0.0003299713134765625 seconds 46: Time to load utils op: 0.40354132652282715 seconds 12: Time to load utils op: 0.00045371055603027344 seconds 12: Time to load utils op: 0.0005059242248535156 seconds 9: Time to load utils op: 0.0003559589385986328 secondsTime to load utils op: 0.0003399848937988281 seconds 9: 12: Time to load utils op: 0.0003287792205810547 seconds 9: Time to load utils op: 0.0003352165222167969 seconds 12: Time to load utils op: 0.0003809928894042969 seconds 15: Time to load utils op: 0.00044083595275878906 seconds 47: Time to load utils op: 0.40328431129455566 seconds 14: Time to load utils op: 0.0005142688751220703 seconds 14: Time to load utils op: 0.0004544258117675781 seconds 14: Time to load utils op: 0.00036716461181640625 seconds 14: Time to load utils op: 0.0004887580871582031 seconds 15: Time to load utils op: 0.0003638267517089844 secondsTime to load utils op: 0.00036644935607910156 seconds 15: 15: Time to load utils op: 0.00035262107849121094 seconds 17: Time to load utils op: 0.0004582405090332031 seconds 19: Time to load utils op: 0.0004634857177734375 seconds 18: Time to load utils op: 0.00048351287841796875 seconds 48: Time to load utils op: 0.4041738510131836 seconds 17: Time to load utils op: 0.0003311634063720703 seconds 19: Time to load utils op: 0.0003333091735839844 seconds 18: Time to load utils op: 0.00034356117248535156 seconds 18: Time to load utils op: 0.0003447532653808594 seconds 19: Time to load utils op: 0.000324249267578125 seconds 17: Time to load utils op: 0.00032210350036621094 seconds 16: Time to load utils op: 0.0004582405090332031 seconds 51: Time to load utils op: 0.403491735458374 seconds 16: Time to load utils op: 0.0003273487091064453 seconds 20: Time to load utils op: 0.0004744529724121094 seconds 16: Time to load utils op: 0.0003292560577392578 seconds 20: Time to load utils op: 0.0003993511199951172 seconds 20: Time to load utils op: 0.0003762245178222656 seconds 21: Time to load utils op: 0.0004885196685791016 seconds 53: Time to load utils op: 0.40323734283447266 seconds 16: Time to load utils op: 0.0003237724304199219 seconds 20: Time to load utils op: 0.00037169456481933594 seconds 50: Time to load utils op: 0.40481114387512207 seconds 21: Time to load utils op: 0.00035381317138671875 seconds 54: Time to load utils op: 0.40296435356140137 seconds 21: Time to load utils op: 0.0003533363342285156 seconds 22: Time to load utils op: 0.0004432201385498047 seconds 22: Time to load utils op: 0.00032639503479003906 seconds 24: Time to load utils op: 0.0005054473876953125 secondsTime to load utils op: 0.0003616809844970703 seconds 24: 21: Time to load utils op: 0.00037550926208496094 seconds 22: Time to load utils op: 0.0003249645233154297 seconds 24: Time to load utils op: 0.0003464221954345703 seconds 23: Time to load utils op: 0.00042629241943359375 seconds 24: Time to load utils op: 0.00030517578125 seconds 23: Time to load utils op: 0.00032711029052734375 seconds 23: Time to load utils op: 0.00033283233642578125 seconds 56: Time to load utils op: 0.4035966396331787 seconds 25: Time to load utils op: 0.00048470497131347656 seconds 25: Time to load utils op: 0.0003752708435058594 seconds 25: Time to load utils op: 0.00041365623474121094 seconds 25: Time to load utils op: 0.00034356117248535156 seconds 57: Time to load utils op: 0.4034700393676758 seconds 58: Time to load utils op: 0.4032266139984131 seconds 26: Time to load utils op: 0.0004286766052246094 seconds 26: Time to load utils op: 0.0003719329833984375 seconds 26: Time to load utils op: 0.0003457069396972656 seconds 26: Time to load utils op: 0.0003333091735839844 seconds 27: Time to load utils op: 0.0004513263702392578 secondsTime to load utils op: 0.0003457069396972656 seconds 27: 27: Time to load utils op: 0.0003421306610107422 seconds 60: Time to load utils op: 0.4037294387817383 seconds 30: Time to load utils op: 0.0004589557647705078 seconds 27: Time to load utils op: 0.00033402442932128906 seconds 29: Time to load utils op: 0.0004134178161621094 seconds 28: Time to load utils op: 0.00043272972106933594 seconds 30: Time to load utils op: 0.00034356117248535156 seconds 59: Time to load utils op: 0.4040663242340088 seconds 28: Time to load utils op: 0.0003371238708496094 seconds 30: Time to load utils op: 0.0003485679626464844 seconds 30: Time to load utils op: 0.00032329559326171875 seconds 28: Time to load utils op: 0.0003345012664794922 seconds 29: Time to load utils op: 0.0003333091735839844 seconds 28: Time to load utils op: 0.0003151893615722656 seconds 31: Time to load utils op: 0.0004496574401855469 seconds 29: Time to load utils op: 0.00033402442932128906 seconds 31: Time to load utils op: 0.00034999847412109375 seconds 31: Time to load utils op: 0.00034165382385253906 seconds 29: Time to load utils op: 0.00034499168395996094 seconds 63: Time to load utils op: 0.40377378463745117 seconds 33: Time to load utils op: 0.0004220008850097656 seconds 37: Time to load utils op: 0.0005121231079101562 seconds 36: Time to load utils op: 0.0005011558532714844 seconds 38: Time to load utils op: 0.0004143714904785156 seconds 40: Time to load utils op: 0.0005292892456054688 seconds 32: Time to load utils op: 0.4039945602416992 seconds 0: Time to load utils op: 0.20255208015441895 seconds 1: Time to load utils op: 0.20257067680358887 seconds 41: Time to load utils op: 0.0003230571746826172 seconds 43: Time to load utils op: 0.000396728515625 seconds 45: Time to load utils op: 0.00048732757568359375 seconds 33: Time to load utils op: 0.402207612991333 secondsTime to load utils op: 0.4027731418609619 seconds 33: 33: Time to load utils op: 0.4022407531738281 seconds 33: Time to load utils op: 0.4025845527648926 seconds 7: Time to load utils op: 0.2024683952331543 seconds 47: Time to load utils op: 0.00034046173095703125 seconds 32: Time to load utils op: 0.4035365581512451 seconds 32: Time to load utils op: 0.40314793586730957 secondsTime to load utils op: 0.4032108783721924 seconds 32: 8: Time to load utils op: 0.20258641242980957 seconds 36: Time to load utils op: 0.40250205993652344 secondsTime to load utils op: 0.40210843086242676 seconds 36: 46: Time to load utils op: 0.0005142688751220703 seconds 36: Time to load utils op: 0.4024322032928467 seconds 48: Time to load utils op: 0.0003185272216796875 seconds 36: Time to load utils op: 0.40234994888305664 seconds 34: Time to load utils op: 0.40433526039123535 secondsTime to load utils op: 0.4035766124725342 secondsTime to load utils op: 0.40343236923217773 seconds 34: 34: 34: Time to load utils op: 0.40368032455444336 seconds 51: Time to load utils op: 0.0003523826599121094 seconds 38: Time to load utils op: 0.4028654098510742 secondsTime to load utils op: 0.4028191566467285 secondsTime to load utils op: 0.4027705192565918 seconds 38: 38: 53: Time to load utils op: 0.0003933906555175781 seconds 37: Time to load utils op: 0.40324831008911133 seconds 37: Time to load utils op: 0.40352535247802734 seconds 35: Time to load utils op: 0.40399813652038574 seconds 13: Time to load utils op: 0.20277166366577148 seconds 37: Time to load utils op: 0.4038200378417969 secondsTime to load utils op: 0.4032268524169922 seconds 37: 38: Time to load utils op: 0.40245819091796875 seconds 35: Time to load utils op: 0.40477538108825684 seconds 35: Time to load utils op: 0.4047532081604004 seconds 54: Time to load utils op: 0.0003921985626220703 seconds 35: Time to load utils op: 0.4055614471435547 seconds 40: Time to load utils op: 0.40280985832214355 seconds 40: Time to load utils op: 0.4028623104095459 seconds 50: Time to load utils op: 0.0003523826599121094 seconds 39: Time to load utils op: 0.40445828437805176 seconds 39: Time to load utils op: 0.40465664863586426 secondsTime to load utils op: 0.4045848846435547 seconds 39: Time to load utils op: 0.40497255325317383 seconds 39: 56: Time to load utils op: 0.00034689903259277344 seconds 17: Time to load utils op: 0.20233988761901855 seconds 18: Time to load utils op: 0.20270395278930664 seconds 19: Time to load utils op: 0.2023460865020752 seconds 0: [2022-11-25 09:40:46,320] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False 0: [2022-11-25 09:40:46,320] [INFO] [logging.py:68:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer 57: Time to load utils op: 0.0003304481506347656 seconds 0: [2022-11-25 09:40:46,320] [INFO] [logging.py:68:log_dist] [Rank 0] Using client Optimizer as basic optimizer 58: Time to load utils op: 0.00033211708068847656 seconds 22: Time to load utils op: 0.20239949226379395 seconds 59: Time to load utils op: 0.0003523826599121094 seconds 23: Time to load utils op: 0.2023022174835205 seconds 60: Time to load utils op: 0.00032830238342285156 seconds 63: Time to load utils op: 0.0003371238708496094 seconds 31: Time to load utils op: 0.2024991512298584 seconds 1: Time to load utils op: 0.0003802776336669922 seconds 0: Time to load utils op: 0.00035500526428222656 seconds 0: [2022-11-25 09:40:46,328] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam 0: [2022-11-25 09:40:46,328] [INFO] [logging.py:68:log_dist] [Rank 0] Creating BF16 optimizer 32: Time to load utils op: 0.00045371055603027344 seconds 8: Time to load utils op: 0.00031566619873046875 seconds 7: Time to load utils op: 0.00036025047302246094 seconds 32: Time to load utils op: 0.00032591819763183594 seconds 32: Time to load utils op: 0.0003542900085449219 seconds 32: Time to load utils op: 0.00034618377685546875 seconds 33: Time to load utils op: 0.00038361549377441406 seconds 33: Time to load utils op: 0.0003674030303955078 seconds 33: Time to load utils op: 0.0003647804260253906 seconds 33: Time to load utils op: 0.0003612041473388672 seconds 13: Time to load utils op: 0.0003495216369628906 seconds 36: Time to load utils op: 0.0003654956817626953 seconds 36: Time to load utils op: 0.00038886070251464844 seconds 36: Time to load utils op: 0.0003464221954345703 seconds 36: Time to load utils op: 0.00034356117248535156 seconds 34: Time to load utils op: 0.00048613548278808594 seconds 37: Time to load utils op: 0.0003445148468017578 seconds 34: Time to load utils op: 0.0003871917724609375 seconds 38: Time to load utils op: 0.0003407001495361328 seconds 37: Time to load utils op: 0.0003933906555175781 seconds 37: Time to load utils op: 0.00038909912109375 seconds 34: Time to load utils op: 0.00036215782165527344 seconds 37: Time to load utils op: 0.0003325939178466797 seconds 40: Time to load utils op: 0.0003209114074707031 seconds 38: Time to load utils op: 0.00038743019104003906 seconds 34: Time to load utils op: 0.0004012584686279297 seconds 35: Time to load utils op: 0.0004260540008544922 seconds 38: Time to load utils op: 0.00036644935607910156 seconds 38: Time to load utils op: 0.00033783912658691406 seconds 19: Time to load utils op: 0.0003578662872314453 seconds 35: Time to load utils op: 0.00033473968505859375 seconds 40: Time to load utils op: 0.0003368854522705078 seconds 35: Time to load utils op: 0.0003566741943359375 seconds 35: Time to load utils op: 0.0003552436828613281 seconds 17: Time to load utils op: 0.0003743171691894531 seconds 18: Time to load utils op: 0.00034427642822265625 seconds 39: Time to load utils op: 0.0004374980926513672 seconds 22: Time to load utils op: 0.00034332275390625 seconds 39: Time to load utils op: 0.00037384033203125 seconds 39: Time to load utils op: 0.0003502368927001953 seconds 39: Time to load utils op: 0.0003478527069091797 seconds 23: Time to load utils op: 0.00033354759216308594 seconds 54: Time to load utils op: 0.603273868560791 seconds 32: Time to load utils op: 0.4021644592285156 seconds 32: Time to load utils op: 0.4021878242492676 seconds 32: Time to load utils op: 0.4027879238128662 seconds 31: Time to load utils op: 0.0003418922424316406 seconds 33: Time to load utils op: 0.4023888111114502 secondsTime to load utils op: 0.40273451805114746 seconds 33: 33: Time to load utils op: 0.40255212783813477 seconds 34: Time to load utils op: 0.40225720405578613 seconds 34: Time to load utils op: 0.4029347896575928 seconds 35: Time to load utils op: 0.4023442268371582 seconds 34: Time to load utils op: 0.40346240997314453 seconds 36: Time to load utils op: 0.40229129791259766 seconds 34: Time to load utils op: 0.4032433032989502 seconds 35: Time to load utils op: 0.4024636745452881 seconds 35: Time to load utils op: 0.402296781539917 seconds 35: Time to load utils op: 0.4024546146392822 seconds 36: Time to load utils op: 0.4022040367126465 seconds 36: Time to load utils op: 0.40236473083496094 seconds 37: Time to load utils op: 0.4019491672515869 seconds 37: Time to load utils op: 0.40222692489624023 seconds 37: Time to load utils op: 0.4020805358886719 seconds 38: Time to load utils op: 0.4026954174041748 seconds 38: Time to load utils op: 0.40290307998657227 seconds 39: Time to load utils op: 0.4023013114929199 seconds 39: Time to load utils op: 0.4024221897125244 seconds 38: Time to load utils op: 0.40328049659729004 seconds 39: Time to load utils op: 0.4025254249572754 secondsTime to load utils op: 0.4028894901275635 seconds 39: 40: Time to load utils op: 0.40202784538269043 seconds 40: Time to load utils op: 0.4025614261627197 seconds 40: Time to load utils op: 0.4021439552307129 seconds 41: Time to load utils op: 0.4028322696685791 seconds 41: Time to load utils op: 0.40268468856811523 seconds 32: Time to load utils op: 0.6030888557434082 seconds 41: Time to load utils op: 0.3019578456878662 seconds 40: Time to load utils op: 0.3024423122406006 seconds 41: Time to load utils op: 0.30201101303100586 seconds 43: Time to load utils op: 0.40257930755615234 seconds 45: Time to load utils op: 0.40258097648620605 seconds 43: Time to load utils op: 0.40339112281799316 seconds 54: Time to load utils op: 0.0006020069122314453 seconds 32: Time to load utils op: 0.0004634857177734375 seconds 32: Time to load utils op: 0.0006561279296875 seconds 32: Time to load utils op: 0.0006330013275146484 seconds 33: Time to load utils op: 0.0004534721374511719 seconds 34: Time to load utils op: 0.0006589889526367188 seconds 33: Time to load utils op: 0.0005116462707519531 seconds 33: Time to load utils op: 0.0005834102630615234 seconds 34: Time to load utils op: 0.0005736351013183594 seconds 34: Time to load utils op: 0.0005238056182861328 seconds 34: Time to load utils op: 0.0003809928894042969 seconds 35: Time to load utils op: 0.0005614757537841797 seconds 35: Time to load utils op: 0.0005605220794677734 seconds 35: Time to load utils op: 0.0005705356597900391 seconds 37: Time to load utils op: 0.0003643035888671875 secondsTime to load utils op: 0.0005559921264648438 seconds 37: 36: Time to load utils op: 0.0005652904510498047 seconds 36: Time to load utils op: 0.0005435943603515625 secondsTime to load utils op: 0.0004875659942626953 seconds 36: 35: Time to load utils op: 0.0004851818084716797 seconds 37: Time to load utils op: 0.00048661231994628906 seconds 39: Time to load utils op: 0.0004477500915527344 seconds 38: Time to load utils op: 0.0004994869232177734 seconds 38: Time to load utils op: 0.0004031658172607422 seconds 39: Time to load utils op: 0.0003483295440673828 seconds 38: Time to load utils op: 0.0003426074981689453 seconds 40: Time to load utils op: 0.0004665851593017578 seconds 40: Time to load utils op: 0.00033593177795410156 seconds 39: Time to load utils op: 0.0003383159637451172 seconds 39: Time to load utils op: 0.0003299713134765625 seconds 40: Time to load utils op: 0.00035953521728515625 seconds 40: Time to load utils op: 0.0003414154052734375 seconds 32: Time to load utils op: 0.0004520416259765625 seconds 41: Time to load utils op: 0.0005462169647216797 seconds 41: Time to load utils op: 0.0003802776336669922 seconds 41: Time to load utils op: 0.00036454200744628906 seconds 41: Time to load utils op: 0.0004019737243652344 seconds 45: Time to load utils op: 0.00042700767517089844 seconds 43: Time to load utils op: 0.00042700767517089844 seconds 43: Time to load utils op: 0.0004191398620605469 seconds 0: [2022-11-25 09:40:46,373] [INFO] [utils.py:827:see_memory_usage] begin bf16_optimizer 0: [2022-11-25 09:40:46,374] [INFO] [utils.py:828:see_memory_usage] MA 4.15 GB Max_MA 4.18 GB CA 4.18 GB Max_CA 4 GB 0: [2022-11-25 09:40:46,374] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 45.28 GB, percent = 9.0% 0: ninja: no work to do. 0: Time to load utils op: 0.1774001121520996 seconds 41: Time to load utils op: 0.5025560855865479 seconds 1: Time to load utils op: 0.20377135276794434 seconds 0: Time to load utils op: 0.20407342910766602 seconds 0: Time to load utils op: 0.20392942428588867 seconds 41: Time to load utils op: 0.5031650066375732 seconds 1: Time to load utils op: 0.2038097381591797 seconds 0: Time to load utils op: 0.40438246726989746 seconds 2: Time to load utils op: 0.20395135879516602 seconds 1: Time to load utils op: 0.20451593399047852 seconds 43: Time to load utils op: 0.6045563220977783 seconds 43: Time to load utils op: 0.5029113292694092 seconds 1: Time to load utils op: 0.20445466041564941 seconds 2: Time to load utils op: 0.20404553413391113 seconds 45: Time to load utils op: 0.6034750938415527 seconds 43: Time to load utils op: 0.5030322074890137 seconds 42: Time to load utils op: 0.6055235862731934 seconds 45: Time to load utils op: 0.6035482883453369 seconds 43: Time to load utils op: 0.5028383731842041 seconds 46: Time to load utils op: 0.6031513214111328 seconds 2: Time to load utils op: 0.20430588722229004 seconds 42: Time to load utils op: 0.6053345203399658 seconds 43: Time to load utils op: 0.5027360916137695 seconds 42: Time to load utils op: 0.5039992332458496 seconds 3: Time to load utils op: 0.20429158210754395 seconds 44: Time to load utils op: 0.604834794998169 seconds 42: Time to load utils op: 0.5031054019927979 seconds 42: Time to load utils op: 0.5044615268707275 seconds 42: Time to load utils op: 0.6057329177856445 seconds 3: Time to load utils op: 0.2044529914855957 seconds 42: Time to load utils op: 0.605647087097168 seconds 45: Time to load utils op: 0.5030539035797119 seconds 44: Time to load utils op: 0.6047689914703369 secondsTime to load utils op: 0.6050319671630859 seconds 44: 44: Time to load utils op: 0.5032401084899902 seconds 3: Time to load utils op: 0.2039642333984375 secondsTime to load utils op: 0.20415592193603516 seconds 3: 46: Time to load utils op: 0.6033556461334229 seconds 6: Time to load utils op: 0.20254278182983398 seconds 44: Time to load utils op: 0.502798318862915 seconds 47: Time to load utils op: 0.6034536361694336 seconds 46: Time to load utils op: 0.6038551330566406 seconds 6: Time to load utils op: 0.2029578685760498 seconds 6: Time to load utils op: 0.2033090591430664 seconds 42: Time to load utils op: 0.5037941932678223 seconds 44: Time to load utils op: 0.5029048919677734 seconds 47: Time to load utils op: 0.603203296661377 seconds 44: Time to load utils op: 0.5026121139526367 seconds 6: Time to load utils op: 0.20291376113891602 seconds 45: Time to load utils op: 0.5024874210357666 seconds 44: Time to load utils op: 0.6056504249572754 seconds 4: Time to load utils op: 0.20368313789367676 seconds 47: Time to load utils op: 0.6033203601837158 seconds 45: Time to load utils op: 0.5025758743286133 seconds 45: Time to load utils op: 0.5032634735107422 seconds 4: Time to load utils op: 0.20482373237609863 seconds 46: Time to load utils op: 0.5025324821472168 seconds 5: Time to load utils op: 0.2039802074432373 seconds 7: Time to load utils op: 0.20325231552124023 seconds 4: Time to load utils op: 0.20440077781677246 seconds 5: Time to load utils op: 0.20401620864868164 seconds 47: Time to load utils op: 0.5025463104248047 seconds 7: Time to load utils op: 0.20310521125793457 seconds 5: Time to load utils op: 0.20442676544189453 seconds 47: Time to load utils op: 0.5025486946105957 seconds 48: Time to load utils op: 0.6036684513092041 seconds 48: Time to load utils op: 0.603985071182251 seconds 4: Time to load utils op: 0.20516180992126465 seconds 47: Time to load utils op: 0.5027666091918945 seconds 48: Time to load utils op: 0.6044659614562988 seconds 5: Time to load utils op: 0.20489263534545898 seconds 7: Time to load utils op: 0.2036428451538086 seconds 46: Time to load utils op: 0.5031201839447021 seconds 8: Time to load utils op: 0.20273137092590332 seconds 46: Time to load utils op: 0.5033555030822754 seconds 46: Time to load utils op: 0.5032792091369629 seconds 50: Time to load utils op: 0.6036319732666016 seconds 7: Time to load utils op: 0.20381641387939453 seconds 50: Time to load utils op: 0.6032984256744385 seconds 9: Time to load utils op: 0.20290279388427734 seconds 10: Time to load utils op: 0.20267915725708008 seconds 8: Time to load utils op: 0.20279908180236816 seconds 8: Time to load utils op: 0.20356297492980957 seconds 10: Time to load utils op: 0.20258378982543945 seconds 8: Time to load utils op: 0.20352983474731445 seconds 9: Time to load utils op: 0.20322918891906738 seconds 10: Time to load utils op: 0.2025604248046875 seconds 12: Time to load utils op: 0.20224666595458984 seconds 9: Time to load utils op: 0.20314979553222656 seconds 9: Time to load utils op: 0.20364594459533691 seconds 11: Time to load utils op: 0.20271706581115723 seconds 11: Time to load utils op: 0.2029409408569336 seconds 12: Time to load utils op: 0.20235633850097656 seconds 12: Time to load utils op: 0.20252299308776855 secondsTime to load utils op: 0.20252442359924316 seconds 12: 13: Time to load utils op: 0.2024552822113037 seconds 13: Time to load utils op: 0.20198893547058105 seconds 11: Time to load utils op: 0.20309805870056152 seconds 13: Time to load utils op: 0.20234441757202148 seconds 13: Time to load utils op: 0.20219779014587402 seconds 14: Time to load utils op: 0.2031261920928955 seconds 14: Time to load utils op: 0.2034015655517578 seconds 14: Time to load utils op: 0.20331025123596191 seconds 15: Time to load utils op: 0.2030959129333496 seconds 16: Time to load utils op: 0.20282554626464844 seconds 16: Time to load utils op: 0.20212626457214355 seconds 15: Time to load utils op: 0.20305562019348145 seconds 14: Time to load utils op: 0.2037642002105713 seconds 16: Time to load utils op: 0.20226693153381348 seconds 15: Time to load utils op: 0.2028353214263916 seconds 15: Time to load utils op: 0.20301103591918945 seconds 16: Time to load utils op: 0.20244526863098145 seconds 17: Time to load utils op: 0.20324254035949707 seconds 18: Time to load utils op: 0.20321130752563477 seconds 17: Time to load utils op: 0.20345592498779297 seconds 17: Time to load utils op: 0.20364594459533691 seconds 18: Time to load utils op: 0.2032017707824707 seconds 17: Time to load utils op: 0.20350408554077148 seconds 19: Time to load utils op: 0.20318078994750977 seconds 19: Time to load utils op: 0.20314311981201172 seconds 19: Time to load utils op: 0.20283150672912598 seconds 18: Time to load utils op: 0.2033863067626953 seconds 20: Time to load utils op: 0.2021479606628418 secondsTime to load utils op: 0.20263051986694336 seconds 20: 18: Time to load utils op: 0.20371150970458984 seconds 20: Time to load utils op: 0.2025308609008789 seconds 19: Time to load utils op: 0.202805757522583 seconds 21: Time to load utils op: 0.20297026634216309 seconds 21: Time to load utils op: 0.20299863815307617 seconds 21: Time to load utils op: 0.20327186584472656 seconds 22: Time to load utils op: 0.20322227478027344 seconds 22: Time to load utils op: 0.20330023765563965 seconds 23: Time to load utils op: 0.20279407501220703 seconds 23: Time to load utils op: 0.20306658744812012 seconds 24: Time to load utils op: 0.2026207447052002 seconds 23: Time to load utils op: 0.20253777503967285 seconds 24: Time to load utils op: 0.20210671424865723 seconds 23: Time to load utils op: 0.20283079147338867 seconds 22: Time to load utils op: 0.20375633239746094 secondsTime to load utils op: 0.20376038551330566 seconds 22: 24: Time to load utils op: 0.2024364471435547 seconds 26: Time to load utils op: 0.2029860019683838 seconds 26: Time to load utils op: 0.20285320281982422 seconds 25: Time to load utils op: 0.20388460159301758 seconds 26: Time to load utils op: 0.20310425758361816 seconds 25: Time to load utils op: 0.20390653610229492 seconds 26: Time to load utils op: 0.20264840126037598 seconds 25: Time to load utils op: 0.2041482925415039 seconds 25: Time to load utils op: 0.20421814918518066 seconds 29: Time to load utils op: 0.2027735710144043 seconds 27: Time to load utils op: 0.2038278579711914 seconds 29: Time to load utils op: 0.20252704620361328 seconds 27: Time to load utils op: 0.20383310317993164 seconds 29: Time to load utils op: 0.20304012298583984 seconds 28: Time to load utils op: 0.2034902572631836 seconds 28: Time to load utils op: 0.2033369541168213 seconds 29: Time to load utils op: 0.20273995399475098 seconds 27: Time to load utils op: 0.20410680770874023 seconds 28: Time to load utils op: 0.20407962799072266 seconds 27: Time to load utils op: 0.20412063598632812 seconds 28: Time to load utils op: 0.20399785041809082 seconds 30: Time to load utils op: 0.20303559303283691 seconds 30: Time to load utils op: 0.2030792236328125 seconds 30: Time to load utils op: 0.20284461975097656 seconds 31: Time to load utils op: 0.20287346839904785 seconds 30: Time to load utils op: 0.20345807075500488 seconds 31: Time to load utils op: 0.20254898071289062 seconds 31: Time to load utils op: 0.20288610458374023 seconds 31: Time to load utils op: 0.20296692848205566 seconds 41: Time to load utils op: 0.00029158592224121094 seconds 41: Time to load utils op: 0.0004425048828125 seconds 0: Time to load utils op: 0.0005376338958740234 seconds 43: Time to load utils op: 0.00047206878662109375 seconds 43: Time to load utils op: 0.00036525726318359375 seconds 45: Time to load utils op: 0.0004711151123046875 seconds 45: Time to load utils op: 0.00037026405334472656 seconds 45: Time to load utils op: 0.0003540515899658203 seconds 46: Time to load utils op: 0.00033783912658691406 seconds 46: Time to load utils op: 0.0005502700805664062 seconds 50: Time to load utils op: 0.0004220008850097656 seconds 48: Time to load utils op: 0.0004067420959472656 seconds 45: Time to load utils op: 0.00034332275390625 seconds 46: Time to load utils op: 0.00041556358337402344 seconds 47: Time to load utils op: 0.0004677772521972656 seconds 43: Time to load utils op: 0.0003871917724609375 seconds 43: Time to load utils op: 0.00037026405334472656 seconds 50: Time to load utils op: 0.0003609657287597656 seconds 43: Time to load utils op: 0.0003573894500732422 seconds 46: Time to load utils op: 0.0003905296325683594 seconds 45: Time to load utils op: 0.00032806396484375 seconds 48: Time to load utils op: 0.00045228004455566406 seconds 47: Time to load utils op: 0.00037932395935058594 seconds 48: Time to load utils op: 0.000331878662109375 seconds 47: Time to load utils op: 0.00035381317138671875 seconds 45: Time to load utils op: 0.0003299713134765625 seconds 46: Time to load utils op: 0.0003554821014404297 seconds 42: Time to load utils op: 0.0004696846008300781 seconds 42: Time to load utils op: 0.0004818439483642578 seconds 47: Time to load utils op: 0.00036907196044921875 seconds 46: Time to load utils op: 0.00036072731018066406 seconds 47: Time to load utils op: 0.0003674030303955078 seconds 42: Time to load utils op: 0.00032711029052734375 seconds 42: Time to load utils op: 0.00043320655822753906 seconds 47: Time to load utils op: 0.00034618377685546875 seconds 42: Time to load utils op: 0.00036978721618652344 seconds 44: Time to load utils op: 0.0005497932434082031 seconds 44: Time to load utils op: 0.00038313865661621094 seconds 42: Time to load utils op: 0.0003745555877685547 seconds 42: Time to load utils op: 0.00036334991455078125 seconds 42: Time to load utils op: 0.0003604888916015625 seconds 46: Time to load utils op: 0.0003991127014160156 seconds 44: Time to load utils op: 0.0004203319549560547 seconds 44: Time to load utils op: 0.00042057037353515625 seconds 44: Time to load utils op: 0.0004229545593261719 secondsTime to load utils op: 0.0004181861877441406 seconds 44: 44: Time to load utils op: 0.00037288665771484375 seconds 44: Time to load utils op: 0.000431060791015625 seconds 0: Time to load utils op: 0.20203757286071777 seconds 0: [2022-11-25 09:40:46,618] [INFO] [utils.py:827:see_memory_usage] before initializing group 0 0: [2022-11-25 09:40:46,618] [INFO] [utils.py:828:see_memory_usage] MA 4.15 GB Max_MA 4.15 GB CA 4.18 GB Max_CA 4 GB 0: [2022-11-25 09:40:46,619] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 45.28 GB, percent = 9.0% 1: Time to load utils op: 0.0004801750183105469 seconds 1: Time to load utils op: 0.00044727325439453125 seconds 2: Time to load utils op: 0.0005483627319335938 seconds 0: Time to load utils op: 0.0004706382751464844 seconds 0: Time to load utils op: 0.0004177093505859375 seconds 1: Time to load utils op: 0.0003662109375 secondsTime to load utils op: 0.0003428459167480469 seconds 1: 0: Time to load utils op: 0.0005133152008056641 seconds 2: Time to load utils op: 0.0003139972686767578 seconds 2: Time to load utils op: 0.0003170967102050781 seconds 3: Time to load utils op: 0.0005044937133789062 seconds 2: Time to load utils op: 0.0003218650817871094 seconds 3: Time to load utils op: 0.0003440380096435547 seconds 3: Time to load utils op: 0.00036334991455078125 seconds 4: Time to load utils op: 0.0004317760467529297 seconds 4: Time to load utils op: 0.0003514289855957031 seconds 3: Time to load utils op: 0.0003542900085449219 seconds 4: Time to load utils op: 0.0003228187561035156 seconds 4: Time to load utils op: 0.00032901763916015625 seconds 5: Time to load utils op: 0.00043129920959472656 seconds 5: Time to load utils op: 0.0003120899200439453 seconds 6: Time to load utils op: 0.0004589557647705078 seconds 5: Time to load utils op: 0.0003535747528076172 seconds 5: Time to load utils op: 0.0003390312194824219 seconds 7: Time to load utils op: 0.0004892349243164062 seconds 6: Time to load utils op: 0.00041294097900390625 seconds 8: Time to load utils op: 0.00044655799865722656 seconds 7: Time to load utils op: 0.00033402442932128906 seconds 6: Time to load utils op: 0.00034618377685546875 seconds 6: Time to load utils op: 0.0003495216369628906 seconds 7: Time to load utils op: 0.0003268718719482422 seconds 10: Time to load utils op: 0.0005631446838378906 seconds 7: Time to load utils op: 0.00035881996154785156 seconds 9: Time to load utils op: 0.0004837512969970703 seconds 8: Time to load utils op: 0.0003323554992675781 seconds 9: Time to load utils op: 0.00035691261291503906 seconds 8: Time to load utils op: 0.0003440380096435547 seconds 9: Time to load utils op: 0.0003497600555419922 seconds 8: Time to load utils op: 0.0003592967987060547 seconds 9: Time to load utils op: 0.00032258033752441406 seconds 11: Time to load utils op: 0.00047850608825683594 seconds 10: Time to load utils op: 0.00034809112548828125 seconds 11: Time to load utils op: 0.00031280517578125 seconds 10: Time to load utils op: 0.0003578662872314453 seconds 11: Time to load utils op: 0.00032210350036621094 seconds 10: Time to load utils op: 0.0003502368927001953 seconds 11: Time to load utils op: 0.00030612945556640625 seconds 12: Time to load utils op: 0.0004227161407470703 seconds 13: Time to load utils op: 0.00043702125549316406 seconds 13: Time to load utils op: 0.0003516674041748047 seconds 12: Time to load utils op: 0.00034236907958984375 seconds 13: Time to load utils op: 0.0003445148468017578 seconds 12: Time to load utils op: 0.0003445148468017578 seconds 12: Time to load utils op: 0.00034499168395996094 seconds 13: Time to load utils op: 0.00033664703369140625 seconds 14: Time to load utils op: 0.00043010711669921875 seconds 14: Time to load utils op: 0.0003209114074707031 seconds 14: Time to load utils op: 0.00034737586975097656 seconds 14: Time to load utils op: 0.00033283233642578125 seconds 15: Time to load utils op: 0.0004513263702392578 seconds 15: Time to load utils op: 0.000431060791015625 seconds 15: Time to load utils op: 0.00034546852111816406 seconds 15: Time to load utils op: 0.0003266334533691406 seconds 16: Time to load utils op: 0.0004374980926513672 seconds 16: Time to load utils op: 0.00030994415283203125 seconds 16: Time to load utils op: 0.00033211708068847656 seconds 16: Time to load utils op: 0.00032973289489746094 seconds 17: Time to load utils op: 0.0004622936248779297 seconds 17: Time to load utils op: 0.0003495216369628906 seconds 19: Time to load utils op: 0.00048804283142089844 seconds 19: Time to load utils op: 0.0003790855407714844 seconds 18: Time to load utils op: 0.0004563331604003906 seconds 18: Time to load utils op: 0.0003094673156738281 seconds 21: Time to load utils op: 0.00046563148498535156 seconds 17: Time to load utils op: 0.0003476142883300781 seconds 19: Time to load utils op: 0.0003371238708496094 seconds 17: Time to load utils op: 0.0003523826599121094 seconds 18: Time to load utils op: 0.0003311634063720703 seconds 18: Time to load utils op: 0.00036978721618652344 seconds 19: Time to load utils op: 0.00031638145446777344 seconds 20: Time to load utils op: 0.0006451606750488281 seconds 20: Time to load utils op: 0.00036597251892089844 seconds 20: Time to load utils op: 0.0003275871276855469 seconds 20: Time to load utils op: 0.0003371238708496094 seconds 21: Time to load utils op: 0.0003380775451660156 seconds 21: Time to load utils op: 0.0003247261047363281 seconds 22: Time to load utils op: 0.0003399848937988281 secondsTime to load utils op: 0.00046706199645996094 seconds 22: 23: Time to load utils op: 0.0004630088806152344 seconds 22: Time to load utils op: 0.0003523826599121094 seconds 22: Time to load utils op: 0.0003371238708496094 seconds 23: Time to load utils op: 0.00034236907958984375 seconds 24: Time to load utils op: 0.0005352497100830078 seconds 21: Time to load utils op: 0.00032806396484375 seconds 24: Time to load utils op: 0.0005664825439453125 seconds 23: Time to load utils op: 0.00030922889709472656 seconds 25: Time to load utils op: 0.00046896934509277344 seconds 24: Time to load utils op: 0.00033664703369140625 seconds 25: Time to load utils op: 0.00044846534729003906 seconds 23: Time to load utils op: 0.0003561973571777344 seconds 24: Time to load utils op: 0.0003409385681152344 seconds 25: Time to load utils op: 0.00034046173095703125 seconds 25: Time to load utils op: 0.0003287792205810547 seconds 26: Time to load utils op: 0.00040721893310546875 seconds 27: Time to load utils op: 0.00045299530029296875 seconds 27: Time to load utils op: 0.0003294944763183594 seconds 27: Time to load utils op: 0.0003101825714111328 seconds 26: Time to load utils op: 0.0003142356872558594 seconds 27: Time to load utils op: 0.0003230571746826172 seconds 26: Time to load utils op: 0.000347137451171875 seconds 26: Time to load utils op: 0.00033354759216308594 seconds 28: Time to load utils op: 0.00046706199645996094 seconds 28: Time to load utils op: 0.00045943260192871094 seconds 28: Time to load utils op: 0.0003390312194824219 seconds 29: Time to load utils op: 0.00046539306640625 seconds 28: Time to load utils op: 0.0003180503845214844 seconds 29: Time to load utils op: 0.0003657341003417969 seconds 30: Time to load utils op: 0.00046634674072265625 seconds 29: Time to load utils op: 0.0003666877746582031 seconds 30: Time to load utils op: 0.0003304481506347656 seconds 29: Time to load utils op: 0.000308990478515625 seconds 30: Time to load utils op: 0.000354766845703125 seconds 30: Time to load utils op: 0.00034427642822265625 seconds 31: Time to load utils op: 0.0004215240478515625 seconds 31: Time to load utils op: 0.00036406517028808594 seconds 31: Time to load utils op: 0.0003376007080078125 seconds 31: Time to load utils op: 0.00034499168395996094 seconds 0: [2022-11-25 09:40:46,666] [INFO] [utils.py:827:see_memory_usage] after initializing group 0 0: [2022-11-25 09:40:46,666] [INFO] [utils.py:828:see_memory_usage] MA 8.44 GB Max_MA 8.44 GB CA 10.57 GB Max_CA 11 GB 0: [2022-11-25 09:40:46,666] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 45.31 GB, percent = 9.0% 0: [2022-11-25 09:40:46,710] [INFO] [utils.py:827:see_memory_usage] before initializing group 1 0: [2022-11-25 09:40:46,711] [INFO] [utils.py:828:see_memory_usage] MA 8.44 GB Max_MA 8.44 GB CA 10.57 GB Max_CA 11 GB 0: [2022-11-25 09:40:46,711] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 45.39 GB, percent = 9.0% 0: [2022-11-25 09:40:46,746] [INFO] [utils.py:827:see_memory_usage] after initializing group 1 0: [2022-11-25 09:40:46,746] [INFO] [utils.py:828:see_memory_usage] MA 12.5 GB Max_MA 12.5 GB CA 16.62 GB Max_CA 17 GB 0: [2022-11-25 09:40:46,747] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 45.42 GB, percent = 9.0% 0: [2022-11-25 09:40:46,787] [INFO] [utils.py:827:see_memory_usage] before initializing group 2 0: [2022-11-25 09:40:46,788] [INFO] [utils.py:828:see_memory_usage] MA 12.5 GB Max_MA 12.5 GB CA 16.62 GB Max_CA 17 GB 0: [2022-11-25 09:40:46,788] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 45.47 GB, percent = 9.0% 0: [2022-11-25 09:40:46,840] [INFO] [utils.py:827:see_memory_usage] after initializing group 2 0: [2022-11-25 09:40:46,841] [INFO] [utils.py:828:see_memory_usage] MA 12.51 GB Max_MA 12.51 GB CA 16.62 GB Max_CA 17 GB 0: [2022-11-25 09:40:46,841] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 45.63 GB, percent = 9.1% 0: [2022-11-25 09:40:46,893] [INFO] [utils.py:827:see_memory_usage] before initialize_optimizer 0: [2022-11-25 09:40:46,894] [INFO] [utils.py:828:see_memory_usage] MA 12.51 GB Max_MA 12.51 GB CA 16.62 GB Max_CA 17 GB 0: [2022-11-25 09:40:46,894] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 45.75 GB, percent = 9.1% 0: [2022-11-25 09:40:46,943] [INFO] [utils.py:827:see_memory_usage] end initialize_optimizer 0: [2022-11-25 09:40:46,944] [INFO] [utils.py:828:see_memory_usage] MA 12.64 GB Max_MA 12.64 GB CA 16.62 GB Max_CA 17 GB 0: [2022-11-25 09:40:46,944] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 45.84 GB, percent = 9.1% 0: [2022-11-25 09:40:46,982] [INFO] [utils.py:827:see_memory_usage] end bf16_optimizer 0: [2022-11-25 09:40:46,982] [INFO] [utils.py:828:see_memory_usage] MA 12.64 GB Max_MA 12.64 GB CA 16.62 GB Max_CA 17 GB 0: [2022-11-25 09:40:46,982] [INFO] [utils.py:836:see_memory_usage] CPU Virtual Memory: used = 45.84 GB, percent = 9.1% 0: [2022-11-25 09:40:46,982] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam 0: [2022-11-25 09:40:46,983] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using client LR scheduler 0: [2022-11-25 09:40:46,983] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = 0: [2022-11-25 09:40:46,983] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0, 0.0, 0.0], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] 0: [2022-11-25 09:40:46,983] [INFO] [config.py:1007:print] DeepSpeedEngine configuration: 0: [2022-11-25 09:40:46,983] [INFO] [config.py:1011:print] activation_checkpointing_config { 0: "partition_activations": false, 0: "contiguous_memory_optimization": false, 0: "cpu_checkpointing": false, 0: "number_checkpoints": null, 0: "synchronize_checkpoint_boundary": false, 0: "profile": false 0: } 0: [2022-11-25 09:40:46,983] [INFO] [config.py:1011:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} 0: [2022-11-25 09:40:46,983] [INFO] [config.py:1011:print] amp_enabled .................. False 0: [2022-11-25 09:40:46,983] [INFO] [config.py:1011:print] amp_params ................... False 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] autotuning_config ............ { 0: "enabled": false, 0: "start_step": null, 0: "end_step": null, 0: "metric_path": null, 0: "arg_mappings": null, 0: "metric": "throughput", 0: "model_info": null, 0: "results_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_results", 0: "exps_dir": "/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/autotuning_exps", 0: "overwrite": true, 0: "fast": true, 0: "start_profile_step": 3, 0: "end_profile_step": 5, 0: "tuner_type": "gridsearch", 0: "tuner_early_stopping": 5, 0: "tuner_num_trials": 50, 0: "model_info_path": null, 0: "mp_size": 1, 0: "max_train_batch_size": null, 0: "min_train_batch_size": 1, 0: "max_train_micro_batch_size_per_gpu": 1.024000e+03, 0: "min_train_micro_batch_size_per_gpu": 1, 0: "num_tuning_micro_batch_sizes": 3 0: } 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] bfloat16_enabled ............. True 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] checkpoint_parallel_write_pipeline False 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] checkpoint_tag_validation_enabled True 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] checkpoint_tag_validation_fail False 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] comms_config ................. 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] communication_data_type ...... None 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_pa 0: rameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] curriculum_enabled ........... False 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] curriculum_params ............ False 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] dataloader_drop_last ......... False 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] disable_allgather ............ False 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] dump_state ................... False 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] dynamic_loss_scale_args ...... None 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] eigenvalue_enabled ........... False 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] eigenvalue_gas_boundary_resolution 1 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] eigenvalue_layer_name ........ bert.encoder.layer 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] eigenvalue_layer_num ......... 0 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] eigenvalue_max_iter .......... 100 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] eigenvalue_stability ......... 1e-06 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] eigenvalue_tol ............... 0.01 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] eigenvalue_verbose ........... False 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] elasticity_enabled ........... False 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] flops_profiler_config ........ { 0: "enabled": false, 0: "profile_step": 1, 0: "module_depth": -1, 0: "top_modules": 1, 0: "detailed": true, 0: "output_file": null 0: } 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] fp16_auto_cast ............... None 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] fp16_enabled ................. False 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] fp16_master_weights_and_gradients False 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] global_rank .................. 0 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] gradient_accumulation_steps .. 4 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] gradient_clipping ............ 1.0 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] gradient_predivide_factor .... 1.0 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] initial_dynamic_scale ........ 1 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] load_universal_checkpoint .... False 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] loss_scale ................... 1.0 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] memory_breakdown ............. False 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] monitor_config ............... 0: [2022-11-25 09:40:46,984] [INFO] [config.py:1011:print] nebula_config ................ { 0: "enabled": false, 0: "persistent_storage_path": null, 0: "persistent_time_interval": 100, 0: "num_of_version_in_retention": 2, 0: "enable_nebula_load": true, 0: "load_path": null 0: } 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] optimizer_legacy_fusion ...... False 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] optimizer_name ............... None 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] optimizer_params ............. None 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] pld_enabled .................. False 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] pld_params ................... False 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] prescale_gradients ........... False 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] scheduler_name ............... None 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] scheduler_params ............. None 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] sparse_attention ............. None 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] sparse_gradients_enabled ..... False 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] steps_per_print .............. 2000 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] train_batch_size ............. 1024 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] train_micro_batch_size_per_gpu 2 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] use_node_local_storage ....... False 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] wall_clock_breakdown ......... False 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] world_size ................... 128 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] zero_allow_untested_optimizer False 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500000000 allgather_partitions=True allgather_bucket_size=500000000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50000000 param_persistence_threshold=100000 model_persistence_threshold=9223372036854775807 max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] zero_enabled ................. False 0: [2022-11-25 09:40:46,985] [INFO] [config.py:1011:print] zero_optimization_stage ...... 0 0: [2022-11-25 09:40:46,985] [INFO] [config.py:996:print_user_config] json = { 0: "train_micro_batch_size_per_gpu": 2, 0: "train_batch_size": 1.024000e+03, 0: "gradient_clipping": 1.0, 0: "zero_optimization": { 0: "stage": 0 0: }, 0: "bf16": { 0: "enabled": true 0: }, 0: "steps_per_print": 2.000000e+03, 0: "wall_clock_breakdown": false 0: } 0: Time to load utils op: 0.00041103363037109375 seconds 0: [2022-11-25 09:40:46,986] [INFO] [engine.py:87:__init__] CONFIG: micro_batches=4 micro_batch_size=2 0: [2022-11-25 09:40:47,132] [INFO] [engine.py:145:__init__] RANK=0 STAGE=0 LAYERS=24 [0, 24) STAGE_PARAMS=2226419712 (2226.420M) TOTAL_PARAMS=8905695232 (8905.695M) UNIQUE_PARAMS=8682348544 (8682.349M) 32: [2022-11-25 09:40:47,132] [INFO] [engine.py:145:__init__] RANK=256 STAGE=1 LAYERS=25 [24, 49) STAGE_PARAMS=2226427904 (2226.428M) TOTAL_PARAMS=8905695232 (8905.695M) UNIQUE_PARAMS=8682348544 (8682.349M) 32: [2022-11-25 09:40:47,132] [INFO] [engine.py:145:__init__] RANK=257 STAGE=1 LAYERS=25 [24, 49) STAGE_PARAMS=2226427904 (2226.428M) TOTAL_PARAMS=8905695232 (8905.695M) UNIQUE_PARAMS=8682348544 (8682.349M) 0: [2022-11-25 09:40:47,132] [INFO] [engine.py:145:__init__] RANK=1 STAGE=0 LAYERS=24 [0, 24) STAGE_PARAMS=2226419712 (2226.420M) TOTAL_PARAMS=8905695232 (8905.695M) UNIQUE_PARAMS=8682348544 (8682.349M) 0: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 0: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 0: WARNING: could not find the metadata file checkpoints_8b7 0: will not load any checkpoints and will start from random 63: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 63: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 60: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 60: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 56: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 56: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 56: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 56: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 62: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 62: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 0: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 48: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 48: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 32: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 32: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 63: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 63: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 55: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 55: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 60: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 56: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 62: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 62: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 0: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 48: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 32: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 39: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 55: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 61: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 61: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 59: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 59: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 58: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 58: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 58: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 58: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 40: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 40: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 60: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 52: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 52: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 52: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 52: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 30: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 30: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 56: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 56: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 54: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 54: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 54: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 62: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 28: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 0: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 24: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 24: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 24: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 47: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 47: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 31: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 31: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 46: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 46: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 44: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 44: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 16: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 16: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 48: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 57: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 57: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 57: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 57: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 63: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 39: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 53: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 53: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 53: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 51: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 51: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 61: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 59: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 35: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 35: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 29: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 29: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 58: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 40: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 40: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 60: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 52: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 52: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 30: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 4: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 4: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 4: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 56: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 54: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 62: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 36: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 36: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 36: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 36: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 28: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 38: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 38: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 0: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 24: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 49: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 49: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 47: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 41: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 41: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 45: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 45: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 43: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 43: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 43: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 43: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 27: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 27: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 23: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 23: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 23: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 23: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 11: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 11: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 31: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 31: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 19: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 19: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 34: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 34: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 46: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 44: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 44: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 16: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 16: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 18: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 10: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 10: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 20: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 20: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 8: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 8: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 8: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 48: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 50: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 50: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 42: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 42: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 42: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 42: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 32: [2022-11-25 09:40:50,239] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 12: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 12: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 26: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 26: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 26: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 26: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 14: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 14: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 15: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 15: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 22: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 22: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 57: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 63: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 39: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 53: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 55: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 51: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 51: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 33: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 1: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 1: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 61: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 59: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 13: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 13: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 35: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 35: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 5: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 5: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 29: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 29: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 37: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 37: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 58: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 40: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 60: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 52: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 52: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 30: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 4: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 54: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 54: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 62: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 36: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 28: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 38: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 0: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 2: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 2: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 24: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 49: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 49: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 47: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 41: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 45: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 45: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 9: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 9: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 43: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 27: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 25: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 3: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 7: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 7: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 17: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 17: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 23: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 23: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 11: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 11: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 31: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 31: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 19: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 19: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 34: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 34: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 46: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 44: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 44: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 6: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 6: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 16: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 16: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 18: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 18: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 18: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 10: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 20: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 8: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 48: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 50: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 42: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 42: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 32: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 32: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 12: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 26: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 14: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 14: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 15: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 22: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 21: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 21: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 57: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 63: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 63: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 39: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 53: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 53: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 55: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 51: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 33: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 1: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 61: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 59: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 59: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 13: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 35: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 5: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 29: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 29: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 37: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 58: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 58: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 40: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 60: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 60: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 30: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 4: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 4: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 54: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 62: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 36: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 28: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 38: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 0: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 2: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 2: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 24: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 49: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 47: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 41: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 45: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 9: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 43: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 27: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 25: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 3: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 7: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 17: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 17: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 23: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 11: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 11: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 31: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 31: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 19: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 19: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 34: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 34: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 46: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 46: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 44: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 44: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 6: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 16: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 18: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 10: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 20: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 8: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 48: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 50: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 42: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 42: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 12: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 26: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 15: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 21: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 57: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 57: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 39: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 39: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 53: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 53: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 55: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 51: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 33: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 1: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 61: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 59: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 13: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 35: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 5: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 29: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 29: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 37: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 40: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 40: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 30: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 4: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 4: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 54: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 36: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 36: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 28: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 28: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 38: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 38: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 2: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 24: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 24: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 49: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 47: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 41: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 45: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 9: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 43: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 27: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 25: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 3: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 7: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 17: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 17: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 23: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 11: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 19: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 34: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 34: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 46: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 46: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 6: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 16: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 18: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 10: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 8: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 48: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 50: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 50: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 32: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 26: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 14: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 22: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 22: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 21: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 39: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 39: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 55: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 55: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 51: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 51: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 33: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 1: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 61: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 61: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 59: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 13: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 13: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 35: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 35: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 5: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 37: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 37: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 30: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 30: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 28: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 28: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 38: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 38: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 2: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 49: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 49: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 47: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 41: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 45: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 9: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 9: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 43: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 27: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 25: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 3: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 7: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 17: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 17: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 11: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 19: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 6: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 6: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 18: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 10: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 20: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 8: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 8: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 50: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 50: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 32: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 12: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 12: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 26: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 14: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 15: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 22: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 33: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 33: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 1: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 13: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 5: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 37: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 37: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 2: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 47: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 41: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 45: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 9: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 27: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 27: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 25: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 25: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 3: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 7: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 6: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 6: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 18: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 10: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 10: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 20: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 12: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 14: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 14: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 15: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 22: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 22: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 21: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 33: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 33: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 1: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 1: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 13: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 5: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 2: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 41: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 9: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 25: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 25: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 3: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 7: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 7: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 20: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 20: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 12: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 15: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 15: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 21: [2022-11-25 09:40:50,240] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 5: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 3: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 3: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 21: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 21: [2022-11-25 09:40:50,241] [WARNING] [engine.py:2581:load_checkpoint] Unable to find latest file at checkpoints_8b7/latest, if trying to load latest checkpoint please ensure this file exists or pass an explicit checkpoint tag when loading a checkpoint. 63: time (ms) | load-checkpoint: 11.24 0: estimated model parameters: 8.905678848 0: estimated model parameters without embeddings: 8.458985472 0: [after model, optimizer, and learning rate scheduler are built] datetime: 2022-11-25 09:40:50 0: > building train, validation, and test datasets ... 0: > datasets target sizes (minimum size): 0: train: 5625981 0: validation: 6144 0: test: 1024 0: > building train, validation, and test datasets for GPT ... 0: > building dataset index ... 0: reading sizes... 0: reading pointers... 0: reading document index... 0: creating numpy buffer of mmap... 0: creating memory view of numpy buffer... 0: > finished creating indexed dataset in 0.001253 seconds 0: number of documents: 210604984 0: > dataset split: 0: train: 0: document indices in [0, 199864130) total of 199864130 documents 0: validation: 0: document indices in [199864130, 210394379) total of 10530249 documents 0: test: 0: document indices in [210394379, 210604984) total of 210605 documents 0: > loading doc-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_train_indexmap_5625981ns_2048sl_1234s_doc_idx.npy 0: > loading sample-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_train_indexmap_5625981ns_2048sl_1234s_sample_idx.npy 0: > loading shuffle-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_train_indexmap_5625981ns_2048sl_1234s_shuffle_idx.npy 0: loaded indexed file in 0.004 seconds 0: total number of samples: 173377817 0: total number of epochs: 1 0: > loading doc-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_valid_indexmap_6144ns_2048sl_1234s_doc_idx.npy 0: > loading sample-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_valid_indexmap_6144ns_2048sl_1234s_sample_idx.npy 0: > loading shuffle-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_valid_indexmap_6144ns_2048sl_1234s_shuffle_idx.npy 0: loaded indexed file in 0.007 seconds 0: total number of samples: 9118345 0: total number of epochs: 1 0: > loading doc-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_test_indexmap_1024ns_2048sl_1234s_doc_idx.npy 0: > loading sample-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_test_indexmap_1024ns_2048sl_1234s_sample_idx.npy 0: > loading shuffle-idx mapping from /scratch/project_462000119/data/pile/megatron_data/meg-gpt2_pile_text_document_test_indexmap_1024ns_2048sl_1234s_shuffle_idx.npy 0: loaded indexed file in 0.003 seconds 0: total number of samples: 182928 0: total number of epochs: 1 0: > finished creating GPT datasets ... 0: [after dataloaders are built] datetime: 2022-11-25 09:41:03 0: done with setup ... 0: training ... 0: Number of parameters: [tensor rank - pipeline rank] w/ and w/o embeddings: 63: time (ms) | model-and-optimizer-setup: 29775.93 | train/valid/test-data-iterators-setup: 11866.08 0: [001-000] 8.9057B / 8.4590B 32: [001-001] 8.9057B / 8.4590B 32: [000-001] 8.9057B / 8.4590B 0: [000-000] 8.9057B / 8.4590B 0: [before the start of training step] datetime: 2022-11-25 09:41:04 0: [Rank 0] (after 10 iterations) memory (MB) | allocated: 17215.4833984375 | max allocated: 59062.27294921875 | reserved: 63330.0 | max reserved: 63394.0 32: [Rank 256] (after 10 iterations) memory (MB) | allocated: 18102.45556640625 | max allocated: 39591.79736328125 | reserved: 47074.0 | max reserved: 47074.0 0: [Rank 1] (after 10 iterations) memory (MB) | allocated: 17215.4833984375 | max allocated: 59062.27294921875 | reserved: 62386.0 | max reserved: 62386.0 32: [Rank 257] (after 10 iterations) memory (MB) | allocated: 18102.45556640625 | max allocated: 39591.79736328125 | reserved: 46066.0 | max reserved: 46066.0 63: iteration 10/ 5494 | consumed samples: 10240 | consumed tokens: 20971520 | elapsed time per iteration (s): 11.15 | learning rate: 3.640E-05 | global batch size: 1024 | lm loss: 1.425811E+01 | grad norm: 7.449 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 91.809 | TFLOPs: 20.53 | 63: iteration 20/ 5494 | consumed samples: 20480 | consumed tokens: 41943040 | elapsed time per iteration (s): 6.86 | learning rate: 7.280E-05 | global batch size: 1024 | lm loss: 9.124741E+00 | grad norm: 6.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 149.286 | TFLOPs: 33.38 | 63: iteration 30/ 5494 | consumed samples: 30720 | consumed tokens: 62914560 | elapsed time per iteration (s): 6.70 | learning rate: 1.092E-04 | global batch size: 1024 | lm loss: 7.498499E+00 | grad norm: 3.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 152.855 | TFLOPs: 34.17 | 63: iteration 40/ 5494 | consumed samples: 40960 | consumed tokens: 83886080 | elapsed time per iteration (s): 6.24 | learning rate: 1.456E-04 | global batch size: 1024 | lm loss: 7.268354E+00 | grad norm: 1.506 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 164.149 | TFLOPs: 36.70 | 63: iteration 50/ 5494 | consumed samples: 51200 | consumed tokens: 104857600 | elapsed time per iteration (s): 6.53 | learning rate: 1.820E-04 | global batch size: 1024 | lm loss: 7.204363E+00 | grad norm: 3.740 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 156.915 | TFLOPs: 35.08 | 63: iteration 60/ 5494 | consumed samples: 61440 | consumed tokens: 125829120 | elapsed time per iteration (s): 6.27 | learning rate: 2.000E-04 | global batch size: 1024 | lm loss: 7.129514E+00 | grad norm: 3.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 163.435 | TFLOPs: 36.54 | 63: iteration 70/ 5494 | consumed samples: 71680 | consumed tokens: 146800640 | elapsed time per iteration (s): 6.36 | learning rate: 2.000E-04 | global batch size: 1024 | lm loss: 6.949097E+00 | grad norm: 2.410 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 160.882 | TFLOPs: 35.97 | 63: iteration 80/ 5494 | consumed samples: 81920 | consumed tokens: 167772160 | elapsed time per iteration (s): 5.92 | learning rate: 2.000E-04 | global batch size: 1024 | lm loss: 6.782018E+00 | grad norm: 2.313 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.828 | TFLOPs: 38.64 | 63: iteration 90/ 5494 | consumed samples: 92160 | consumed tokens: 188743680 | elapsed time per iteration (s): 6.09 | learning rate: 2.000E-04 | global batch size: 1024 | lm loss: 6.744170E+00 | grad norm: 3.822 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.014 | TFLOPs: 37.56 | 63: iteration 100/ 5494 | consumed samples: 102400 | consumed tokens: 209715200 | elapsed time per iteration (s): 6.20 | learning rate: 2.000E-04 | global batch size: 1024 | lm loss: 6.639218E+00 | grad norm: 2.386 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 165.262 | TFLOPs: 36.95 | 63: iteration 110/ 5494 | consumed samples: 112640 | consumed tokens: 230686720 | elapsed time per iteration (s): 6.54 | learning rate: 2.000E-04 | global batch size: 1024 | lm loss: 6.536300E+00 | grad norm: 2.054 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 156.638 | TFLOPs: 35.02 | 63: iteration 120/ 5494 | consumed samples: 122880 | consumed tokens: 251658240 | elapsed time per iteration (s): 6.29 | learning rate: 1.999E-04 | global batch size: 1024 | lm loss: 6.427072E+00 | grad norm: 1.552 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 162.913 | TFLOPs: 36.42 | 63: iteration 130/ 5494 | consumed samples: 133120 | consumed tokens: 272629760 | elapsed time per iteration (s): 5.55 | learning rate: 1.999E-04 | global batch size: 1024 | lm loss: 6.322859E+00 | grad norm: 1.864 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.459 | TFLOPs: 41.24 | 63: iteration 140/ 5494 | consumed samples: 143360 | consumed tokens: 293601280 | elapsed time per iteration (s): 6.13 | learning rate: 1.999E-04 | global batch size: 1024 | lm loss: 6.248438E+00 | grad norm: 1.884 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 167.108 | TFLOPs: 37.36 | 63: iteration 150/ 5494 | consumed samples: 153600 | consumed tokens: 314572800 | elapsed time per iteration (s): 6.02 | learning rate: 1.999E-04 | global batch size: 1024 | lm loss: 6.100858E+00 | grad norm: 2.602 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 170.206 | TFLOPs: 38.05 | 63: iteration 160/ 5494 | consumed samples: 163840 | consumed tokens: 335544320 | elapsed time per iteration (s): 6.13 | learning rate: 1.998E-04 | global batch size: 1024 | lm loss: 6.012361E+00 | grad norm: 2.117 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 166.935 | TFLOPs: 37.32 | 63: iteration 170/ 5494 | consumed samples: 174080 | consumed tokens: 356515840 | elapsed time per iteration (s): 5.96 | learning rate: 1.998E-04 | global batch size: 1024 | lm loss: 5.936864E+00 | grad norm: 1.911 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.789 | TFLOPs: 38.41 | 63: iteration 180/ 5494 | consumed samples: 184320 | consumed tokens: 377487360 | elapsed time per iteration (s): 9.53 | learning rate: 1.998E-04 | global batch size: 1024 | lm loss: 5.858692E+00 | grad norm: 2.088 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 107.397 | TFLOPs: 24.01 | 63: iteration 190/ 5494 | consumed samples: 194560 | consumed tokens: 398458880 | elapsed time per iteration (s): 8.46 | learning rate: 1.997E-04 | global batch size: 1024 | lm loss: 5.794482E+00 | grad norm: 1.078 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 120.973 | TFLOPs: 27.05 | 63: iteration 200/ 5494 | consumed samples: 204800 | consumed tokens: 419430400 | elapsed time per iteration (s): 6.25 | learning rate: 1.997E-04 | global batch size: 1024 | lm loss: 5.722625E+00 | grad norm: 1.331 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 163.945 | TFLOPs: 36.65 | 63: iteration 210/ 5494 | consumed samples: 215040 | consumed tokens: 440401920 | elapsed time per iteration (s): 5.84 | learning rate: 1.996E-04 | global batch size: 1024 | lm loss: 5.652419E+00 | grad norm: 2.292 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.297 | TFLOPs: 39.19 | 63: iteration 220/ 5494 | consumed samples: 225280 | consumed tokens: 461373440 | elapsed time per iteration (s): 6.10 | learning rate: 1.996E-04 | global batch size: 1024 | lm loss: 5.593320E+00 | grad norm: 1.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 167.952 | TFLOPs: 37.55 | 63: iteration 230/ 5494 | consumed samples: 235520 | consumed tokens: 482344960 | elapsed time per iteration (s): 5.72 | learning rate: 1.995E-04 | global batch size: 1024 | lm loss: 5.553791E+00 | grad norm: 2.084 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.956 | TFLOPs: 40.01 | 63: iteration 240/ 5494 | consumed samples: 245760 | consumed tokens: 503316480 | elapsed time per iteration (s): 7.23 | learning rate: 1.995E-04 | global batch size: 1024 | lm loss: 5.506816E+00 | grad norm: 2.014 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 141.696 | TFLOPs: 31.68 | 63: iteration 250/ 5494 | consumed samples: 256000 | consumed tokens: 524288000 | elapsed time per iteration (s): 7.04 | learning rate: 1.994E-04 | global batch size: 1024 | lm loss: 5.467096E+00 | grad norm: 1.285 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 145.527 | TFLOPs: 32.54 | 63: iteration 260/ 5494 | consumed samples: 266240 | consumed tokens: 545259520 | elapsed time per iteration (s): 6.11 | learning rate: 1.994E-04 | global batch size: 1024 | lm loss: 5.428356E+00 | grad norm: 1.040 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 167.687 | TFLOPs: 37.49 | 63: iteration 270/ 5494 | consumed samples: 276480 | consumed tokens: 566231040 | elapsed time per iteration (s): 7.06 | learning rate: 1.993E-04 | global batch size: 1024 | lm loss: 5.379630E+00 | grad norm: 1.216 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 145.036 | TFLOPs: 32.43 | 63: iteration 280/ 5494 | consumed samples: 286720 | consumed tokens: 587202560 | elapsed time per iteration (s): 5.55 | learning rate: 1.992E-04 | global batch size: 1024 | lm loss: 5.354424E+00 | grad norm: 1.532 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.629 | TFLOPs: 41.28 | 63: iteration 290/ 5494 | consumed samples: 296960 | consumed tokens: 608174080 | elapsed time per iteration (s): 5.87 | learning rate: 1.992E-04 | global batch size: 1024 | lm loss: 5.309001E+00 | grad norm: 1.924 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.320 | TFLOPs: 38.97 | 63: iteration 300/ 5494 | consumed samples: 307200 | consumed tokens: 629145600 | elapsed time per iteration (s): 6.27 | learning rate: 1.991E-04 | global batch size: 1024 | lm loss: 5.298676E+00 | grad norm: 1.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 163.379 | TFLOPs: 36.53 | 63: iteration 310/ 5494 | consumed samples: 317440 | consumed tokens: 650117120 | elapsed time per iteration (s): 5.97 | learning rate: 1.990E-04 | global batch size: 1024 | lm loss: 5.212383E+00 | grad norm: 0.610 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.595 | TFLOPs: 38.36 | 63: iteration 320/ 5494 | consumed samples: 327680 | consumed tokens: 671088640 | elapsed time per iteration (s): 6.02 | learning rate: 1.989E-04 | global batch size: 1024 | lm loss: 5.143223E+00 | grad norm: 2.430 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 170.014 | TFLOPs: 38.01 | 63: iteration 330/ 5494 | consumed samples: 337920 | consumed tokens: 692060160 | elapsed time per iteration (s): 5.83 | learning rate: 1.989E-04 | global batch size: 1024 | lm loss: 5.109738E+00 | grad norm: 1.256 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.639 | TFLOPs: 39.27 | 63: iteration 340/ 5494 | consumed samples: 348160 | consumed tokens: 713031680 | elapsed time per iteration (s): 5.97 | learning rate: 1.988E-04 | global batch size: 1024 | lm loss: 5.063797E+00 | grad norm: 1.111 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.526 | TFLOPs: 38.35 | 63: iteration 350/ 5494 | consumed samples: 358400 | consumed tokens: 734003200 | elapsed time per iteration (s): 5.81 | learning rate: 1.987E-04 | global batch size: 1024 | lm loss: 5.040391E+00 | grad norm: 0.994 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.116 | TFLOPs: 39.37 | 63: iteration 360/ 5494 | consumed samples: 368640 | consumed tokens: 754974720 | elapsed time per iteration (s): 5.70 | learning rate: 1.986E-04 | global batch size: 1024 | lm loss: 4.981058E+00 | grad norm: 0.786 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.648 | TFLOPs: 40.16 | 63: iteration 370/ 5494 | consumed samples: 378880 | consumed tokens: 775946240 | elapsed time per iteration (s): 5.96 | learning rate: 1.985E-04 | global batch size: 1024 | lm loss: 4.953582E+00 | grad norm: 1.093 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.907 | TFLOPs: 38.43 | 63: iteration 380/ 5494 | consumed samples: 389120 | consumed tokens: 796917760 | elapsed time per iteration (s): 5.94 | learning rate: 1.984E-04 | global batch size: 1024 | lm loss: 4.925888E+00 | grad norm: 1.045 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.469 | TFLOPs: 38.56 | 63: iteration 390/ 5494 | consumed samples: 399360 | consumed tokens: 817889280 | elapsed time per iteration (s): 6.91 | learning rate: 1.983E-04 | global batch size: 1024 | lm loss: 4.877906E+00 | grad norm: 1.284 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 148.251 | TFLOPs: 33.14 | 63: iteration 400/ 5494 | consumed samples: 409600 | consumed tokens: 838860800 | elapsed time per iteration (s): 7.86 | learning rate: 1.982E-04 | global batch size: 1024 | lm loss: 4.852319E+00 | grad norm: 0.840 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 130.224 | TFLOPs: 29.11 | 63: iteration 410/ 5494 | consumed samples: 419840 | consumed tokens: 859832320 | elapsed time per iteration (s): 10.67 | learning rate: 1.981E-04 | global batch size: 1024 | lm loss: 4.779604E+00 | grad norm: 0.999 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 95.935 | TFLOPs: 21.45 | 63: iteration 420/ 5494 | consumed samples: 430080 | consumed tokens: 880803840 | elapsed time per iteration (s): 6.06 | learning rate: 1.980E-04 | global batch size: 1024 | lm loss: 4.765812E+00 | grad norm: 0.814 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.920 | TFLOPs: 37.76 | 63: iteration 430/ 5494 | consumed samples: 440320 | consumed tokens: 901775360 | elapsed time per iteration (s): 5.79 | learning rate: 1.979E-04 | global batch size: 1024 | lm loss: 4.732708E+00 | grad norm: 1.040 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.828 | TFLOPs: 39.53 | 63: iteration 440/ 5494 | consumed samples: 450560 | consumed tokens: 922746880 | elapsed time per iteration (s): 5.96 | learning rate: 1.978E-04 | global batch size: 1024 | lm loss: 4.699644E+00 | grad norm: 0.781 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.674 | TFLOPs: 38.38 | 63: iteration 450/ 5494 | consumed samples: 460800 | consumed tokens: 943718400 | elapsed time per iteration (s): 5.81 | learning rate: 1.977E-04 | global batch size: 1024 | lm loss: 4.622292E+00 | grad norm: 1.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.303 | TFLOPs: 39.42 | 63: iteration 460/ 5494 | consumed samples: 471040 | consumed tokens: 964689920 | elapsed time per iteration (s): 5.85 | learning rate: 1.975E-04 | global batch size: 1024 | lm loss: 4.599409E+00 | grad norm: 1.022 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.051 | TFLOPs: 39.14 | 63: iteration 470/ 5494 | consumed samples: 481280 | consumed tokens: 985661440 | elapsed time per iteration (s): 5.82 | learning rate: 1.974E-04 | global batch size: 1024 | lm loss: 4.529493E+00 | grad norm: 1.288 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.900 | TFLOPs: 39.33 | 63: iteration 480/ 5494 | consumed samples: 491520 | consumed tokens: 1006632960 | elapsed time per iteration (s): 5.95 | learning rate: 1.973E-04 | global batch size: 1024 | lm loss: 4.507388E+00 | grad norm: 0.819 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.209 | TFLOPs: 38.50 | 63: iteration 490/ 5494 | consumed samples: 501760 | consumed tokens: 1027604480 | elapsed time per iteration (s): 5.67 | learning rate: 1.972E-04 | global batch size: 1024 | lm loss: 4.464867E+00 | grad norm: 0.957 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.456 | TFLOPs: 40.34 | 63: iteration 500/ 5494 | consumed samples: 512000 | consumed tokens: 1048576000 | elapsed time per iteration (s): 5.66 | learning rate: 1.970E-04 | global batch size: 1024 | lm loss: 4.442460E+00 | grad norm: 0.718 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.811 | TFLOPs: 40.42 | 63: iteration 510/ 5494 | consumed samples: 522240 | consumed tokens: 1069547520 | elapsed time per iteration (s): 5.69 | learning rate: 1.969E-04 | global batch size: 1024 | lm loss: 4.377776E+00 | grad norm: 0.921 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.927 | TFLOPs: 40.23 | 63: iteration 520/ 5494 | consumed samples: 532480 | consumed tokens: 1090519040 | elapsed time per iteration (s): 5.80 | learning rate: 1.968E-04 | global batch size: 1024 | lm loss: 4.298764E+00 | grad norm: 1.056 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.540 | TFLOPs: 39.47 | 63: iteration 530/ 5494 | consumed samples: 542720 | consumed tokens: 1111490560 | elapsed time per iteration (s): 5.99 | learning rate: 1.966E-04 | global batch size: 1024 | lm loss: 4.227806E+00 | grad norm: 0.852 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 170.831 | TFLOPs: 38.19 | 63: iteration 540/ 5494 | consumed samples: 552960 | consumed tokens: 1132462080 | elapsed time per iteration (s): 5.55 | learning rate: 1.965E-04 | global batch size: 1024 | lm loss: 4.190243E+00 | grad norm: 0.532 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.586 | TFLOPs: 41.27 | 63: iteration 550/ 5494 | consumed samples: 563200 | consumed tokens: 1153433600 | elapsed time per iteration (s): 5.97 | learning rate: 1.963E-04 | global batch size: 1024 | lm loss: 4.143665E+00 | grad norm: 0.769 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.585 | TFLOPs: 38.36 | 63: iteration 560/ 5494 | consumed samples: 573440 | consumed tokens: 1174405120 | elapsed time per iteration (s): 6.00 | learning rate: 1.962E-04 | global batch size: 1024 | lm loss: 4.062264E+00 | grad norm: 0.570 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 170.596 | TFLOPs: 38.14 | 63: iteration 570/ 5494 | consumed samples: 583680 | consumed tokens: 1195376640 | elapsed time per iteration (s): 5.83 | learning rate: 1.960E-04 | global batch size: 1024 | lm loss: 3.954076E+00 | grad norm: 0.694 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.539 | TFLOPs: 39.24 | 63: iteration 580/ 5494 | consumed samples: 593920 | consumed tokens: 1216348160 | elapsed time per iteration (s): 6.10 | learning rate: 1.959E-04 | global batch size: 1024 | lm loss: 3.930460E+00 | grad norm: 0.712 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 167.852 | TFLOPs: 37.53 | 63: iteration 590/ 5494 | consumed samples: 604160 | consumed tokens: 1237319680 | elapsed time per iteration (s): 5.42 | learning rate: 1.957E-04 | global batch size: 1024 | lm loss: 3.824711E+00 | grad norm: 0.701 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 188.960 | TFLOPs: 42.25 | 63: iteration 600/ 5494 | consumed samples: 614400 | consumed tokens: 1258291200 | elapsed time per iteration (s): 5.55 | learning rate: 1.956E-04 | global batch size: 1024 | lm loss: 3.815291E+00 | grad norm: 1.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.362 | TFLOPs: 41.22 | 63: iteration 610/ 5494 | consumed samples: 624640 | consumed tokens: 1279262720 | elapsed time per iteration (s): 5.94 | learning rate: 1.954E-04 | global batch size: 1024 | lm loss: 3.710627E+00 | grad norm: 0.714 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.419 | TFLOPs: 38.55 | 63: iteration 620/ 5494 | consumed samples: 634880 | consumed tokens: 1300234240 | elapsed time per iteration (s): 6.16 | learning rate: 1.952E-04 | global batch size: 1024 | lm loss: 3.673405E+00 | grad norm: 0.666 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 166.262 | TFLOPs: 37.17 | 63: iteration 630/ 5494 | consumed samples: 645120 | consumed tokens: 1321205760 | elapsed time per iteration (s): 5.80 | learning rate: 1.951E-04 | global batch size: 1024 | lm loss: 3.599456E+00 | grad norm: 0.456 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.493 | TFLOPs: 39.46 | 63: iteration 640/ 5494 | consumed samples: 655360 | consumed tokens: 1342177280 | elapsed time per iteration (s): 6.09 | learning rate: 1.949E-04 | global batch size: 1024 | lm loss: 3.522515E+00 | grad norm: 0.464 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.256 | TFLOPs: 37.62 | 63: iteration 650/ 5494 | consumed samples: 665600 | consumed tokens: 1363148800 | elapsed time per iteration (s): 5.67 | learning rate: 1.947E-04 | global batch size: 1024 | lm loss: 3.506511E+00 | grad norm: 0.428 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.678 | TFLOPs: 40.39 | 63: iteration 660/ 5494 | consumed samples: 675840 | consumed tokens: 1384120320 | elapsed time per iteration (s): 5.92 | learning rate: 1.946E-04 | global batch size: 1024 | lm loss: 3.491068E+00 | grad norm: 0.352 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.021 | TFLOPs: 38.68 | 63: iteration 670/ 5494 | consumed samples: 686080 | consumed tokens: 1405091840 | elapsed time per iteration (s): 5.41 | learning rate: 1.944E-04 | global batch size: 1024 | lm loss: 3.428897E+00 | grad norm: 0.436 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.263 | TFLOPs: 42.31 | 63: iteration 680/ 5494 | consumed samples: 696320 | consumed tokens: 1426063360 | elapsed time per iteration (s): 5.85 | learning rate: 1.942E-04 | global batch size: 1024 | lm loss: 3.392717E+00 | grad norm: 0.405 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.004 | TFLOPs: 39.13 | 63: iteration 690/ 5494 | consumed samples: 706560 | consumed tokens: 1447034880 | elapsed time per iteration (s): 5.94 | learning rate: 1.940E-04 | global batch size: 1024 | lm loss: 3.344958E+00 | grad norm: 0.457 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.421 | TFLOPs: 38.55 | 63: iteration 700/ 5494 | consumed samples: 716800 | consumed tokens: 1468006400 | elapsed time per iteration (s): 5.67 | learning rate: 1.938E-04 | global batch size: 1024 | lm loss: 3.388753E+00 | grad norm: 0.591 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.566 | TFLOPs: 40.37 | 63: iteration 710/ 5494 | consumed samples: 727040 | consumed tokens: 1488977920 | elapsed time per iteration (s): 5.95 | learning rate: 1.936E-04 | global batch size: 1024 | lm loss: 3.336888E+00 | grad norm: 0.413 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.170 | TFLOPs: 38.49 | 63: iteration 720/ 5494 | consumed samples: 737280 | consumed tokens: 1509949440 | elapsed time per iteration (s): 5.68 | learning rate: 1.934E-04 | global batch size: 1024 | lm loss: 3.304044E+00 | grad norm: 0.462 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.131 | TFLOPs: 40.27 | 63: iteration 730/ 5494 | consumed samples: 747520 | consumed tokens: 1530920960 | elapsed time per iteration (s): 5.67 | learning rate: 1.932E-04 | global batch size: 1024 | lm loss: 3.262853E+00 | grad norm: 0.431 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.464 | TFLOPs: 40.35 | 63: iteration 740/ 5494 | consumed samples: 757760 | consumed tokens: 1551892480 | elapsed time per iteration (s): 5.80 | learning rate: 1.930E-04 | global batch size: 1024 | lm loss: 3.215022E+00 | grad norm: 0.474 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.545 | TFLOPs: 39.47 | 63: iteration 750/ 5494 | consumed samples: 768000 | consumed tokens: 1572864000 | elapsed time per iteration (s): 6.01 | learning rate: 1.928E-04 | global batch size: 1024 | lm loss: 3.238243E+00 | grad norm: 0.491 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 170.418 | TFLOPs: 38.10 | 63: iteration 760/ 5494 | consumed samples: 778240 | consumed tokens: 1593835520 | elapsed time per iteration (s): 5.55 | learning rate: 1.926E-04 | global batch size: 1024 | lm loss: 3.182378E+00 | grad norm: 0.332 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.338 | TFLOPs: 41.21 | 63: iteration 770/ 5494 | consumed samples: 788480 | consumed tokens: 1614807040 | elapsed time per iteration (s): 5.79 | learning rate: 1.924E-04 | global batch size: 1024 | lm loss: 3.186862E+00 | grad norm: 0.316 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.834 | TFLOPs: 39.53 | 63: iteration 780/ 5494 | consumed samples: 798720 | consumed tokens: 1635778560 | elapsed time per iteration (s): 5.95 | learning rate: 1.922E-04 | global batch size: 1024 | lm loss: 3.146424E+00 | grad norm: 0.394 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.161 | TFLOPs: 38.49 | 63: iteration 790/ 5494 | consumed samples: 808960 | consumed tokens: 1656750080 | elapsed time per iteration (s): 5.67 | learning rate: 1.920E-04 | global batch size: 1024 | lm loss: 3.128597E+00 | grad norm: 0.371 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.734 | TFLOPs: 40.41 | 63: iteration 800/ 5494 | consumed samples: 819200 | consumed tokens: 1677721600 | elapsed time per iteration (s): 5.55 | learning rate: 1.918E-04 | global batch size: 1024 | lm loss: 3.105857E+00 | grad norm: 0.452 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.447 | TFLOPs: 41.24 | 63: iteration 810/ 5494 | consumed samples: 829440 | consumed tokens: 1698693120 | elapsed time per iteration (s): 5.55 | learning rate: 1.916E-04 | global batch size: 1024 | lm loss: 3.121742E+00 | grad norm: 0.385 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.362 | TFLOPs: 41.22 | 63: iteration 820/ 5494 | consumed samples: 839680 | consumed tokens: 1719664640 | elapsed time per iteration (s): 5.67 | learning rate: 1.914E-04 | global batch size: 1024 | lm loss: 3.078194E+00 | grad norm: 0.523 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.507 | TFLOPs: 40.36 | 63: iteration 830/ 5494 | consumed samples: 849920 | consumed tokens: 1740636160 | elapsed time per iteration (s): 5.68 | learning rate: 1.911E-04 | global batch size: 1024 | lm loss: 3.100348E+00 | grad norm: 0.655 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.243 | TFLOPs: 40.30 | 63: iteration 840/ 5494 | consumed samples: 860160 | consumed tokens: 1761607680 | elapsed time per iteration (s): 5.83 | learning rate: 1.909E-04 | global batch size: 1024 | lm loss: 3.512693E+00 | grad norm: 4.793 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.778 | TFLOPs: 39.30 | 63: iteration 850/ 5494 | consumed samples: 870400 | consumed tokens: 1782579200 | elapsed time per iteration (s): 5.79 | learning rate: 1.907E-04 | global batch size: 1024 | lm loss: 5.991812E+00 | grad norm: 3.781 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.793 | TFLOPs: 39.53 | 63: iteration 860/ 5494 | consumed samples: 880640 | consumed tokens: 1803550720 | elapsed time per iteration (s): 5.69 | learning rate: 1.904E-04 | global batch size: 1024 | lm loss: 5.560976E+00 | grad norm: 1.532 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.020 | TFLOPs: 40.25 | 63: iteration 870/ 5494 | consumed samples: 890880 | consumed tokens: 1824522240 | elapsed time per iteration (s): 5.82 | learning rate: 1.902E-04 | global batch size: 1024 | lm loss: 4.848498E+00 | grad norm: 1.953 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.915 | TFLOPs: 39.33 | 63: iteration 880/ 5494 | consumed samples: 901120 | consumed tokens: 1845493760 | elapsed time per iteration (s): 5.56 | learning rate: 1.900E-04 | global batch size: 1024 | lm loss: 4.549866E+00 | grad norm: 2.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.049 | TFLOPs: 41.15 | 63: iteration 890/ 5494 | consumed samples: 911360 | consumed tokens: 1866465280 | elapsed time per iteration (s): 5.70 | learning rate: 1.897E-04 | global batch size: 1024 | lm loss: 4.181594E+00 | grad norm: 1.424 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.617 | TFLOPs: 40.16 | 63: iteration 900/ 5494 | consumed samples: 921600 | consumed tokens: 1887436800 | elapsed time per iteration (s): 5.59 | learning rate: 1.895E-04 | global batch size: 1024 | lm loss: 3.888839E+00 | grad norm: 0.641 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.326 | TFLOPs: 40.99 | 63: iteration 910/ 5494 | consumed samples: 931840 | consumed tokens: 1908408320 | elapsed time per iteration (s): 5.84 | learning rate: 1.892E-04 | global batch size: 1024 | lm loss: 3.636053E+00 | grad norm: 0.723 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.252 | TFLOPs: 39.18 | 63: iteration 920/ 5494 | consumed samples: 942080 | consumed tokens: 1929379840 | elapsed time per iteration (s): 5.69 | learning rate: 1.890E-04 | global batch size: 1024 | lm loss: 3.434283E+00 | grad norm: 0.824 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.933 | TFLOPs: 40.23 | 63: iteration 930/ 5494 | consumed samples: 952320 | consumed tokens: 1950351360 | elapsed time per iteration (s): 5.67 | learning rate: 1.887E-04 | global batch size: 1024 | lm loss: 3.379798E+00 | grad norm: 0.289 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.734 | TFLOPs: 40.41 | 63: iteration 940/ 5494 | consumed samples: 962560 | consumed tokens: 1971322880 | elapsed time per iteration (s): 5.65 | learning rate: 1.885E-04 | global batch size: 1024 | lm loss: 3.250529E+00 | grad norm: 0.378 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.226 | TFLOPs: 40.52 | 63: iteration 950/ 5494 | consumed samples: 972800 | consumed tokens: 1992294400 | elapsed time per iteration (s): 5.94 | learning rate: 1.882E-04 | global batch size: 1024 | lm loss: 3.259420E+00 | grad norm: 0.945 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.515 | TFLOPs: 38.57 | 63: iteration 960/ 5494 | consumed samples: 983040 | consumed tokens: 2013265920 | elapsed time per iteration (s): 5.98 | learning rate: 1.880E-04 | global batch size: 1024 | lm loss: 3.296971E+00 | grad norm: 0.286 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.347 | TFLOPs: 38.31 | 63: iteration 970/ 5494 | consumed samples: 993280 | consumed tokens: 2034237440 | elapsed time per iteration (s): 5.77 | learning rate: 1.877E-04 | global batch size: 1024 | lm loss: 3.165143E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.553 | TFLOPs: 39.69 | 63: iteration 980/ 5494 | consumed samples: 1003520 | consumed tokens: 2055208960 | elapsed time per iteration (s): 5.80 | learning rate: 1.875E-04 | global batch size: 1024 | lm loss: 3.146221E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.502 | TFLOPs: 39.46 | 63: iteration 990/ 5494 | consumed samples: 1013760 | consumed tokens: 2076180480 | elapsed time per iteration (s): 5.41 | learning rate: 1.872E-04 | global batch size: 1024 | lm loss: 3.089372E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.334 | TFLOPs: 42.33 | 63: iteration 1000/ 5494 | consumed samples: 1024000 | consumed tokens: 2097152000 | elapsed time per iteration (s): 5.80 | learning rate: 1.869E-04 | global batch size: 1024 | lm loss: 3.079670E+00 | grad norm: 0.300 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.448 | TFLOPs: 39.45 | 63: ------------------------------------------------------------------------------------------ 63: valid loss at iteration 1000 | lm loss value: 3.028083E+00 | lm loss PPL: 2.065760E+01 | 63: ------------------------------------------------------------------------------------------ 0: saving checkpoint at iteration 1000 to checkpoints_8b7 0: [2022-11-25 11:23:02,988] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is begin to save! 32: [2022-11-25 11:23:03,415] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_24-model_01-model_states.pt... 32: [2022-11-25 11:23:03,415] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_24-model_00-model_states.pt... 0: [2022-11-25 11:23:03,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_01-model_01-model_states.pt... 0: [2022-11-25 11:23:03,435] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_01-model_00-model_states.pt... 32: [2022-11-25 11:23:03,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_24-model_00-model_states.pt. 32: [2022-11-25 11:23:03,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_24-model_01-model_states.pt. 32: [2022-11-25 11:23:03,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_25-model_01-model_states.pt... 32: [2022-11-25 11:23:03,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_25-model_00-model_states.pt... 0: [2022-11-25 11:23:03,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_01-model_01-model_states.pt. 0: [2022-11-25 11:23:03,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_01-model_00-model_states.pt. 0: [2022-11-25 11:23:03,873] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_03-model_01-model_states.pt... 0: [2022-11-25 11:23:03,873] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_03-model_00-model_states.pt... 32: [2022-11-25 11:23:04,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_25-model_00-model_states.pt. 32: [2022-11-25 11:23:04,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_26-model_00-model_states.pt... 0: [2022-11-25 11:23:04,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_03-model_00-model_states.pt. 0: [2022-11-25 11:23:04,098] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_04-model_00-model_states.pt... 32: [2022-11-25 11:23:04,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_25-model_01-model_states.pt. 32: [2022-11-25 11:23:04,118] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_26-model_01-model_states.pt... 0: [2022-11-25 11:23:04,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_03-model_01-model_states.pt. 0: [2022-11-25 11:23:04,125] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_04-model_01-model_states.pt... 0: [2022-11-25 11:23:04,355] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_04-model_00-model_states.pt. 0: [2022-11-25 11:23:04,355] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_05-model_00-model_states.pt... 32: [2022-11-25 11:23:04,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_26-model_00-model_states.pt. 32: [2022-11-25 11:23:04,356] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_26-model_01-model_states.pt. 32: [2022-11-25 11:23:04,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_27-model_00-model_states.pt... 32: [2022-11-25 11:23:04,356] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_27-model_01-model_states.pt... 0: [2022-11-25 11:23:04,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_04-model_01-model_states.pt. 0: [2022-11-25 11:23:04,369] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_05-model_01-model_states.pt... 32: [2022-11-25 11:23:04,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_27-model_00-model_states.pt. 32: [2022-11-25 11:23:04,585] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_28-model_00-model_states.pt... 0: [2022-11-25 11:23:04,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_05-model_00-model_states.pt. 0: [2022-11-25 11:23:04,587] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_06-model_00-model_states.pt... 32: [2022-11-25 11:23:04,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_27-model_01-model_states.pt. 32: [2022-11-25 11:23:04,595] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_28-model_01-model_states.pt... 0: [2022-11-25 11:23:04,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_05-model_01-model_states.pt. 0: [2022-11-25 11:23:04,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_06-model_01-model_states.pt... 0: [2022-11-25 11:23:04,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_06-model_00-model_states.pt. 0: [2022-11-25 11:23:04,812] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_07-model_00-model_states.pt... 32: [2022-11-25 11:23:04,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_28-model_01-model_states.pt. 32: [2022-11-25 11:23:04,826] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_29-model_01-model_states.pt... 32: [2022-11-25 11:23:04,826] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_28-model_00-model_states.pt. 32: [2022-11-25 11:23:04,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_29-model_00-model_states.pt... 0: [2022-11-25 11:23:04,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_06-model_01-model_states.pt. 0: [2022-11-25 11:23:04,828] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_07-model_01-model_states.pt... 0: [2022-11-25 11:23:05,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_07-model_01-model_states.pt. 0: [2022-11-25 11:23:05,055] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_08-model_01-model_states.pt... 0: [2022-11-25 11:23:05,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_07-model_00-model_states.pt. 0: [2022-11-25 11:23:05,055] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_08-model_00-model_states.pt... 32: [2022-11-25 11:23:05,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_29-model_00-model_states.pt. 32: [2022-11-25 11:23:05,065] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_30-model_00-model_states.pt... 32: [2022-11-25 11:23:05,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_29-model_01-model_states.pt. 32: [2022-11-25 11:23:05,077] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_30-model_01-model_states.pt... 0: [2022-11-25 11:23:05,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_08-model_01-model_states.pt. 0: [2022-11-25 11:23:05,291] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_09-model_01-model_states.pt... 0: [2022-11-25 11:23:05,292] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_08-model_00-model_states.pt. 0: [2022-11-25 11:23:05,292] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_09-model_00-model_states.pt... 32: [2022-11-25 11:23:05,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_30-model_01-model_states.pt. 32: [2022-11-25 11:23:05,307] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_30-model_00-model_states.pt. 32: [2022-11-25 11:23:05,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_31-model_01-model_states.pt... 32: [2022-11-25 11:23:05,307] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_31-model_00-model_states.pt... 0: [2022-11-25 11:23:05,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_09-model_01-model_states.pt. 0: [2022-11-25 11:23:05,508] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_10-model_01-model_states.pt... 32: [2022-11-25 11:23:05,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_31-model_00-model_states.pt. 32: [2022-11-25 11:23:05,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_31-model_01-model_states.pt. 32: [2022-11-25 11:23:05,532] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_32-model_00-model_states.pt... 32: [2022-11-25 11:23:05,532] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_32-model_01-model_states.pt... 0: [2022-11-25 11:23:05,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_09-model_00-model_states.pt. 0: [2022-11-25 11:23:05,537] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_10-model_00-model_states.pt... 0: [2022-11-25 11:23:05,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_10-model_01-model_states.pt. 0: [2022-11-25 11:23:05,729] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_11-model_01-model_states.pt... 0: [2022-11-25 11:23:05,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_10-model_00-model_states.pt. 0: [2022-11-25 11:23:05,759] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_11-model_00-model_states.pt... 32: [2022-11-25 11:23:05,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_32-model_01-model_states.pt. 32: [2022-11-25 11:23:05,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_33-model_01-model_states.pt... 32: [2022-11-25 11:23:05,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_32-model_00-model_states.pt. 32: [2022-11-25 11:23:05,780] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_33-model_00-model_states.pt... 0: [2022-11-25 11:23:05,948] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_11-model_01-model_states.pt. 0: [2022-11-25 11:23:05,949] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_12-model_01-model_states.pt... 0: [2022-11-25 11:23:05,985] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_11-model_00-model_states.pt. 0: [2022-11-25 11:23:05,986] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_12-model_00-model_states.pt... 32: [2022-11-25 11:23:06,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_33-model_00-model_states.pt. 32: [2022-11-25 11:23:06,020] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_33-model_01-model_states.pt. 32: [2022-11-25 11:23:06,020] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_34-model_01-model_states.pt... 32: [2022-11-25 11:23:06,020] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_34-model_00-model_states.pt... 0: [2022-11-25 11:23:06,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_12-model_01-model_states.pt. 0: [2022-11-25 11:23:06,171] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_13-model_01-model_states.pt... 0: [2022-11-25 11:23:06,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_12-model_00-model_states.pt. 0: [2022-11-25 11:23:06,207] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_13-model_00-model_states.pt... 32: [2022-11-25 11:23:06,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_34-model_00-model_states.pt. 32: [2022-11-25 11:23:06,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_34-model_01-model_states.pt. 32: [2022-11-25 11:23:06,255] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_35-model_00-model_states.pt... 32: [2022-11-25 11:23:06,255] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_35-model_01-model_states.pt... 0: [2022-11-25 11:23:06,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_13-model_01-model_states.pt. 0: [2022-11-25 11:23:06,392] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_14-model_01-model_states.pt... 0: [2022-11-25 11:23:06,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_13-model_00-model_states.pt. 0: [2022-11-25 11:23:06,427] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_14-model_00-model_states.pt... 32: [2022-11-25 11:23:06,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_35-model_00-model_states.pt. 32: [2022-11-25 11:23:06,493] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_35-model_01-model_states.pt. 32: [2022-11-25 11:23:06,494] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_36-model_00-model_states.pt... 32: [2022-11-25 11:23:06,494] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_36-model_01-model_states.pt... 0: [2022-11-25 11:23:06,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_14-model_01-model_states.pt. 0: [2022-11-25 11:23:06,652] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_15-model_01-model_states.pt... 0: [2022-11-25 11:23:06,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_14-model_00-model_states.pt. 0: [2022-11-25 11:23:06,673] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_15-model_00-model_states.pt... 32: [2022-11-25 11:23:06,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_36-model_01-model_states.pt. 32: [2022-11-25 11:23:06,743] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_37-model_01-model_states.pt... 32: [2022-11-25 11:23:06,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_36-model_00-model_states.pt. 32: [2022-11-25 11:23:06,758] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_37-model_00-model_states.pt... 0: [2022-11-25 11:23:06,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_15-model_00-model_states.pt. 0: [2022-11-25 11:23:06,904] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_16-model_00-model_states.pt... 0: [2022-11-25 11:23:06,923] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_15-model_01-model_states.pt. 0: [2022-11-25 11:23:06,924] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_16-model_01-model_states.pt... 32: [2022-11-25 11:23:06,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_37-model_00-model_states.pt. 32: [2022-11-25 11:23:06,981] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_38-model_00-model_states.pt... 32: [2022-11-25 11:23:07,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_37-model_01-model_states.pt. 32: [2022-11-25 11:23:07,008] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_38-model_01-model_states.pt... 0: [2022-11-25 11:23:07,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_16-model_01-model_states.pt. 0: [2022-11-25 11:23:07,154] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_17-model_01-model_states.pt... 0: [2022-11-25 11:23:07,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_16-model_00-model_states.pt. 0: [2022-11-25 11:23:07,172] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_17-model_00-model_states.pt... 32: [2022-11-25 11:23:07,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_38-model_01-model_states.pt. 32: [2022-11-25 11:23:07,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_38-model_00-model_states.pt. 32: [2022-11-25 11:23:07,239] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_39-model_01-model_states.pt... 32: [2022-11-25 11:23:07,239] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_39-model_00-model_states.pt... 0: [2022-11-25 11:23:07,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_17-model_00-model_states.pt. 0: [2022-11-25 11:23:07,392] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_18-model_00-model_states.pt... 0: [2022-11-25 11:23:07,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_17-model_01-model_states.pt. 0: [2022-11-25 11:23:07,393] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_18-model_01-model_states.pt... 32: [2022-11-25 11:23:07,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_39-model_00-model_states.pt. 32: [2022-11-25 11:23:07,469] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_40-model_00-model_states.pt... 32: [2022-11-25 11:23:07,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_39-model_01-model_states.pt. 32: [2022-11-25 11:23:07,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_40-model_01-model_states.pt... 0: [2022-11-25 11:23:07,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_18-model_01-model_states.pt. 0: [2022-11-25 11:23:07,635] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_19-model_01-model_states.pt... 0: [2022-11-25 11:23:07,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_18-model_00-model_states.pt. 0: [2022-11-25 11:23:07,640] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_19-model_00-model_states.pt... 32: [2022-11-25 11:23:07,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_40-model_01-model_states.pt. 32: [2022-11-25 11:23:07,704] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_41-model_01-model_states.pt... 32: [2022-11-25 11:23:07,728] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_40-model_00-model_states.pt. 32: [2022-11-25 11:23:07,729] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_41-model_00-model_states.pt... 0: [2022-11-25 11:23:07,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_19-model_00-model_states.pt. 0: [2022-11-25 11:23:07,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_20-model_00-model_states.pt... 0: [2022-11-25 11:23:07,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_19-model_01-model_states.pt. 0: [2022-11-25 11:23:07,879] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_20-model_01-model_states.pt... 32: [2022-11-25 11:23:07,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_41-model_00-model_states.pt. 32: [2022-11-25 11:23:07,951] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_42-model_00-model_states.pt... 32: [2022-11-25 11:23:07,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_41-model_01-model_states.pt. 32: [2022-11-25 11:23:07,968] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_42-model_01-model_states.pt... 0: [2022-11-25 11:23:08,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_20-model_01-model_states.pt. 0: [2022-11-25 11:23:08,107] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_21-model_01-model_states.pt... 0: [2022-11-25 11:23:08,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_20-model_00-model_states.pt. 0: [2022-11-25 11:23:08,108] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_21-model_00-model_states.pt... 32: [2022-11-25 11:23:08,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_42-model_00-model_states.pt. 32: [2022-11-25 11:23:08,202] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_43-model_00-model_states.pt... 32: [2022-11-25 11:23:08,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_42-model_01-model_states.pt. 32: [2022-11-25 11:23:08,202] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_43-model_01-model_states.pt... 0: [2022-11-25 11:23:08,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_21-model_01-model_states.pt. 0: [2022-11-25 11:23:08,332] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_22-model_01-model_states.pt... 0: [2022-11-25 11:23:08,343] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_21-model_00-model_states.pt. 0: [2022-11-25 11:23:08,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_22-model_00-model_states.pt... 32: [2022-11-25 11:23:08,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_43-model_01-model_states.pt. 32: [2022-11-25 11:23:08,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_43-model_00-model_states.pt. 32: [2022-11-25 11:23:08,450] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_44-model_01-model_states.pt... 32: [2022-11-25 11:23:08,451] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_44-model_00-model_states.pt... 0: [2022-11-25 11:23:08,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_22-model_00-model_states.pt. 0: [2022-11-25 11:23:08,567] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_23-model_00-model_states.pt... 0: [2022-11-25 11:23:08,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_22-model_01-model_states.pt. 0: [2022-11-25 11:23:08,589] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_23-model_01-model_states.pt... 32: [2022-11-25 11:23:08,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_44-model_01-model_states.pt. 32: [2022-11-25 11:23:08,689] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_46-model_01-model_states.pt... 32: [2022-11-25 11:23:08,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_44-model_00-model_states.pt. 32: [2022-11-25 11:23:08,689] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/layer_46-model_00-model_states.pt... 32: [2022-11-25 11:23:08,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_46-model_01-model_states.pt. 32: [2022-11-25 11:23:08,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_46-model_00-model_states.pt. 32: [2022-11-25 11:23:08,719] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/mp_rank_03_model_states.pt... 32: [2022-11-25 11:23:08,719] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/mp_rank_02_model_states.pt... 32: [2022-11-25 11:23:08,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/mp_rank_03_model_states.pt. 32: [2022-11-25 11:23:08,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/mp_rank_02_model_states.pt. 0: [2022-11-25 11:23:08,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_23-model_01-model_states.pt. 0: [2022-11-25 11:23:08,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/layer_23-model_00-model_states.pt. 0: [2022-11-25 11:23:08,795] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: checkpoints_8b7/global_step1000/mp_rank_01_model_states.pt 0: [2022-11-25 11:23:08,795] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/mp_rank_01_model_states.pt... 0: [2022-11-25 11:23:08,796] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_8b7/global_step1000/mp_rank_00_model_states.pt 0: [2022-11-25 11:23:08,796] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/mp_rank_00_model_states.pt... 0: [2022-11-25 11:23:08,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/mp_rank_00_model_states.pt. 0: [2022-11-25 11:23:08,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/mp_rank_01_model_states.pt. 0: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 0: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 32: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt... 63: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt... 63: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt... 63: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt... 63: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt... 39: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt... 39: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt... 39: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt... 39: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt... 53: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt... 53: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt... 53: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt... 53: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt... 51: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt... 51: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt... 51: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt... 51: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt... 61: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt... 61: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt... 61: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt... 61: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt... 40: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt... 40: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt... 60: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt... 60: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt... 60: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt... 56: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt... 56: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt... 56: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt... 56: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt... 0: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 41: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt... 41: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt... 41: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt... 41: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt... 45: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt... 45: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt... 45: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt... 45: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt... 44: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt... 44: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt... 44: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt... 44: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt... 48: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt... 48: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt... 48: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt... 48: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt... 50: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt... 50: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt... 50: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt... 50: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt... 42: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt... 42: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt... 42: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt... 42: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt... 32: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt... 12: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 12: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 12: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 57: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt... 57: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt... 57: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt... 57: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt... 63: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt... 63: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt... 39: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt... 39: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt... 39: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt... 39: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt... 53: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt... 53: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt... 53: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt... 53: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt... 55: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt... 55: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt... 55: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt... 55: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt... 51: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt... 51: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt... 33: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt... 33: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt... 33: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt... 33: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt... 1: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 1: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 1: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 61: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt... 59: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt... 59: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt... 59: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt... 59: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt... 13: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 13: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 13: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 13: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 35: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt... 35: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt... 35: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt... 35: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt... 5: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 5: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 5: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 5: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 29: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 29: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 29: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 29: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 37: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt... 37: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt... 37: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt... 58: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt... 58: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt... 58: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt... 58: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt... 40: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt... 40: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt... 60: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt... 52: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt... 52: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt... 52: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt... 30: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 30: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 30: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 30: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 4: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 4: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 56: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt... 54: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt... 54: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt... 54: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt... 54: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt... 62: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt... 62: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt... 62: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt... 62: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt... 36: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt... 36: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt... 36: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt... 36: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt... 28: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 28: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 28: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 28: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 38: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt... 38: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt... 38: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt... 38: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt... 0: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 2: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 2: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 2: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 2: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 24: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 24: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 24: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 24: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 49: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt... 49: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt... 49: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt... 47: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt... 47: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt... 47: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt... 47: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt... 41: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt... 41: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt... 41: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt... 41: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt... 45: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt... 9: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 9: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 9: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 9: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 43: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt... 43: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt... 43: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt... 43: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt... 27: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 27: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 27: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 27: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 25: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 25: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 25: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 25: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 3: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 3: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 3: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 3: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 7: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 7: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 7: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 7: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 17: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 17: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 17: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 23: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 23: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 23: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 23: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 11: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 11: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 11: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 11: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 31: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 31: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 31: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 31: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 19: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 19: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 19: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 19: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 34: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt... 34: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt... 34: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt... 34: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt... 46: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt... 46: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt... 46: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt... 46: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt... 44: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt... 44: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt... 44: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt... 44: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt... 6: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 6: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 6: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 6: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 16: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 16: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 16: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 16: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 18: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 18: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 18: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 18: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 10: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 10: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 10: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 10: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 20: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 20: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 20: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 20: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 8: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 8: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 8: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 8: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 48: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt... 50: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt... 50: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt... 42: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt... 32: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt... 32: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt... 12: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 26: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 26: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 26: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 26: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 14: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 14: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 14: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 14: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 15: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 15: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 15: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 15: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 22: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 22: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 22: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 22: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 21: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 21: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 21: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 21: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 57: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt... 63: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt... 63: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt... 55: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt... 55: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt... 55: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt... 55: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt... 51: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt... 33: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt... 33: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt... 1: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 61: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt... 61: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt... 59: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt... 59: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt... 59: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt... 13: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 13: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 35: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt... 35: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt... 5: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 5: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 5: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 29: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 29: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 37: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt... 58: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt... 58: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt... 40: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt... 60: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt... 60: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt... 60: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt... 52: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt... 30: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 30: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 4: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 56: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt... 56: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt... 56: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt... 54: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt... 54: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt... 54: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt... 62: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt... 62: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt... 62: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt... 36: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt... 36: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt... 28: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 28: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 38: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt... 38: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt... 38: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt... 38: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt... 0: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 2: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 24: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 49: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt... 47: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt... 47: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt... 45: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt... 9: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 9: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 43: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt... 43: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt... 43: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt... 27: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 27: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 27: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 25: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 3: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 3: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 3: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 7: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 7: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 17: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 17: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 17: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 23: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 23: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 23: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 23: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 11: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 11: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 31: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 19: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 34: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt... 34: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt... 46: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt... 46: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt... 46: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt... 46: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt... 6: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 16: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 16: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 16: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 18: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 18: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 18: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 10: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 10: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 20: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 20: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 8: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 8: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 48: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt... 50: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt... 50: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt... 42: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt... 42: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt... 32: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt... 26: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 14: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 14: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 14: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 14: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 15: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 15: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 22: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 21: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 21: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 21: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 21: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 57: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt... 57: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt... 57: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt... 51: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt... 33: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt... 33: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt... 1: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 1: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 1: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 61: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt... 59: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt... 13: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 13: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 35: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt... 35: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt... 5: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 29: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 37: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt... 58: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt... 58: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt... 40: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt... 40: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt... 40: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt... 60: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt... 52: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt... 52: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt... 30: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 30: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 4: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 4: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 4: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 4: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 54: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt... 62: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt... 36: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt... 36: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt... 28: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 28: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 0: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 2: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 2: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 2: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 24: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 24: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 49: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt... 49: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt... 49: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt... 47: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt... 47: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt... 45: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt... 45: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt... 9: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 43: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt... 27: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 25: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 3: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 7: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 17: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 11: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 11: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 31: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 19: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 19: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 19: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 34: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt... 34: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt... 6: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 6: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 16: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 18: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 10: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 10: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 20: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 20: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 48: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt... 42: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt... 12: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 12: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 12: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 26: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 15: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 22: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 1: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 29: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 37: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt... 37: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt... 52: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt... 52: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt... 4: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 0: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 24: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 49: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt... 9: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 25: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 25: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 7: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 17: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 31: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 31: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 6: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 8: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 32: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt... 12: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 26: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 26: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 15: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 22: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 22: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 37: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt... 0: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 48: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt... 32: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt... 32: [2022-11-25 11:23:08,982] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step1000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt... 0: [2022-11-25 11:23:09,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 0: [2022-11-25 11:23:09,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 0: [2022-11-25 11:23:09,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-25 11:23:09,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: [2022-11-25 11:23:09,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-25 11:23:09,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: [2022-11-25 11:23:09,253] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-25 11:23:09,253] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-25 11:23:09,253] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 32: [2022-11-25 11:23:09,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt. 32: [2022-11-25 11:23:09,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt 32: [2022-11-25 11:23:09,258] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 32: [2022-11-25 11:23:09,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt. 32: [2022-11-25 11:23:09,261] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt 32: [2022-11-25 11:23:09,261] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: [2022-11-25 11:23:09,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 0: [2022-11-25 11:23:09,279] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 0: [2022-11-25 11:23:09,279] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: [2022-11-25 11:23:09,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 0: [2022-11-25 11:23:09,280] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 0: [2022-11-25 11:23:09,280] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 6: [2022-11-25 11:23:09,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 6: [2022-11-25 11:23:09,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 6: [2022-11-25 11:23:09,288] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 6: [2022-11-25 11:23:09,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 6: [2022-11-25 11:23:09,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 6: [2022-11-25 11:23:09,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 6: [2022-11-25 11:23:09,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 6: [2022-11-25 11:23:09,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 6: [2022-11-25 11:23:09,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 32: [2022-11-25 11:23:09,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt. 32: [2022-11-25 11:23:09,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt 32: [2022-11-25 11:23:09,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 32: [2022-11-25 11:23:09,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt. 32: [2022-11-25 11:23:09,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt 32: [2022-11-25 11:23:09,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: [2022-11-25 11:23:09,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-25 11:23:09,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-25 11:23:09,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: [2022-11-25 11:23:09,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 6: [2022-11-25 11:23:09,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 6: [2022-11-25 11:23:09,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 6: [2022-11-25 11:23:09,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 49: [2022-11-25 11:23:09,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt. 49: [2022-11-25 11:23:09,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt. 49: [2022-11-25 11:23:09,324] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt. 49: [2022-11-25 11:23:09,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt 49: [2022-11-25 11:23:09,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt 49: [2022-11-25 11:23:09,324] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt 49: [2022-11-25 11:23:09,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 49: [2022-11-25 11:23:09,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 49: [2022-11-25 11:23:09,324] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: [2022-11-25 11:23:09,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 0: [2022-11-25 11:23:09,340] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 0: [2022-11-25 11:23:09,340] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 49: [2022-11-25 11:23:09,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt. 49: [2022-11-25 11:23:09,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt 49: [2022-11-25 11:23:09,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 49: [2022-11-25 11:23:09,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt. 49: [2022-11-25 11:23:09,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt 49: [2022-11-25 11:23:09,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 49: [2022-11-25 11:23:09,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt. 49: [2022-11-25 11:23:09,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt 49: [2022-11-25 11:23:09,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 6: [2022-11-25 11:23:09,353] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 6: [2022-11-25 11:23:09,353] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 6: [2022-11-25 11:23:09,353] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 57: [2022-11-25 11:23:09,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt. 57: [2022-11-25 11:23:09,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt. 57: [2022-11-25 11:23:09,379] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt. 57: [2022-11-25 11:23:09,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt. 57: [2022-11-25 11:23:09,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt 57: [2022-11-25 11:23:09,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt 57: [2022-11-25 11:23:09,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt 57: [2022-11-25 11:23:09,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt 57: [2022-11-25 11:23:09,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 57: [2022-11-25 11:23:09,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 57: [2022-11-25 11:23:09,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 57: [2022-11-25 11:23:09,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 57: [2022-11-25 11:23:09,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt. 57: [2022-11-25 11:23:09,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt 57: [2022-11-25 11:23:09,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 57: [2022-11-25 11:23:09,381] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt. 57: [2022-11-25 11:23:09,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt 57: [2022-11-25 11:23:09,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 32: [2022-11-25 11:23:09,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt. 32: [2022-11-25 11:23:09,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt 32: [2022-11-25 11:23:09,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 32: [2022-11-25 11:23:09,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt. 32: [2022-11-25 11:23:09,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt 32: [2022-11-25 11:23:09,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 32: [2022-11-25 11:23:09,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt. 32: [2022-11-25 11:23:09,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt 32: [2022-11-25 11:23:09,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 57: [2022-11-25 11:23:09,386] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt. 57: [2022-11-25 11:23:09,386] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt 57: [2022-11-25 11:23:09,386] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 29: [2022-11-25 11:23:09,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 29: [2022-11-25 11:23:09,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 29: [2022-11-25 11:23:09,390] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 29: [2022-11-25 11:23:09,390] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 29: [2022-11-25 11:23:09,390] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 29: [2022-11-25 11:23:09,390] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 29: [2022-11-25 11:23:09,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 29: [2022-11-25 11:23:09,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 29: [2022-11-25 11:23:09,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 29: [2022-11-25 11:23:09,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 29: [2022-11-25 11:23:09,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 29: [2022-11-25 11:23:09,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 63: [2022-11-25 11:23:09,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt. 63: [2022-11-25 11:23:09,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt. 63: [2022-11-25 11:23:09,393] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt. 63: [2022-11-25 11:23:09,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt 63: [2022-11-25 11:23:09,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 63: [2022-11-25 11:23:09,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt 63: [2022-11-25 11:23:09,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt 63: [2022-11-25 11:23:09,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 63: [2022-11-25 11:23:09,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 29: [2022-11-25 11:23:09,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 32: [2022-11-25 11:23:09,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt. 19: [2022-11-25 11:23:09,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 29: [2022-11-25 11:23:09,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 29: [2022-11-25 11:23:09,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 32: [2022-11-25 11:23:09,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt 32: [2022-11-25 11:23:09,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 19: [2022-11-25 11:23:09,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 19: [2022-11-25 11:23:09,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 19: [2022-11-25 11:23:09,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 19: [2022-11-25 11:23:09,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 19: [2022-11-25 11:23:09,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 19: [2022-11-25 11:23:09,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 19: [2022-11-25 11:23:09,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 19: [2022-11-25 11:23:09,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 19: [2022-11-25 11:23:09,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 19: [2022-11-25 11:23:09,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 19: [2022-11-25 11:23:09,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 19: [2022-11-25 11:23:09,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 19: [2022-11-25 11:23:09,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 19: [2022-11-25 11:23:09,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 19: [2022-11-25 11:23:09,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 19: [2022-11-25 11:23:09,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 19: [2022-11-25 11:23:09,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 63: [2022-11-25 11:23:09,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt. 30: [2022-11-25 11:23:09,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 30: [2022-11-25 11:23:09,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 30: [2022-11-25 11:23:09,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 30: [2022-11-25 11:23:09,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 30: [2022-11-25 11:23:09,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 30: [2022-11-25 11:23:09,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 63: [2022-11-25 11:23:09,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt 63: [2022-11-25 11:23:09,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 11:23:09,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 30: [2022-11-25 11:23:09,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 30: [2022-11-25 11:23:09,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 30: [2022-11-25 11:23:09,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 30: [2022-11-25 11:23:09,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 30: [2022-11-25 11:23:09,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 30: [2022-11-25 11:23:09,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 30: [2022-11-25 11:23:09,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 30: [2022-11-25 11:23:09,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 11:23:09,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 11:23:09,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 11:23:09,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 11:23:09,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 11:23:09,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 11:23:09,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 20: [2022-11-25 11:23:09,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 20: [2022-11-25 11:23:09,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 20: [2022-11-25 11:23:09,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 20: [2022-11-25 11:23:09,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 20: [2022-11-25 11:23:09,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 20: [2022-11-25 11:23:09,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 20: [2022-11-25 11:23:09,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 20: [2022-11-25 11:23:09,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 20: [2022-11-25 11:23:09,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 20: [2022-11-25 11:23:09,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 20: [2022-11-25 11:23:09,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 20: [2022-11-25 11:23:09,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 20: [2022-11-25 11:23:09,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 20: [2022-11-25 11:23:09,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 20: [2022-11-25 11:23:09,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 20: [2022-11-25 11:23:09,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 20: [2022-11-25 11:23:09,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 20: [2022-11-25 11:23:09,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 20: [2022-11-25 11:23:09,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 20: [2022-11-25 11:23:09,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 20: [2022-11-25 11:23:09,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 55: [2022-11-25 11:23:09,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt. 55: [2022-11-25 11:23:09,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt. 55: [2022-11-25 11:23:09,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt. 55: [2022-11-25 11:23:09,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt. 25: [2022-11-25 11:23:09,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 25: [2022-11-25 11:23:09,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 25: [2022-11-25 11:23:09,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 25: [2022-11-25 11:23:09,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 25: [2022-11-25 11:23:09,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 55: [2022-11-25 11:23:09,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt. 55: [2022-11-25 11:23:09,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt 55: [2022-11-25 11:23:09,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt 55: [2022-11-25 11:23:09,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt 25: [2022-11-25 11:23:09,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 25: [2022-11-25 11:23:09,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 55: [2022-11-25 11:23:09,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt 55: [2022-11-25 11:23:09,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt 25: [2022-11-25 11:23:09,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 25: [2022-11-25 11:23:09,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 55: [2022-11-25 11:23:09,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 55: [2022-11-25 11:23:09,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 11:23:09,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 25: [2022-11-25 11:23:09,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 11:23:09,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 25: [2022-11-25 11:23:09,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 55: [2022-11-25 11:23:09,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 11:23:09,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 25: [2022-11-25 11:23:09,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 55: [2022-11-25 11:23:09,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 55: [2022-11-25 11:23:09,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 11:23:09,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 25: [2022-11-25 11:23:09,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 11:23:09,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 11:23:09,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 11:23:09,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 11:23:09,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 45: [2022-11-25 11:23:09,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt. 45: [2022-11-25 11:23:09,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt. 13: [2022-11-25 11:23:09,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 13: [2022-11-25 11:23:09,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 13: [2022-11-25 11:23:09,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 13: [2022-11-25 11:23:09,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 13: [2022-11-25 11:23:09,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 13: [2022-11-25 11:23:09,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 13: [2022-11-25 11:23:09,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 13: [2022-11-25 11:23:09,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 13: [2022-11-25 11:23:09,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 13: [2022-11-25 11:23:09,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 13: [2022-11-25 11:23:09,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 13: [2022-11-25 11:23:09,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 13: [2022-11-25 11:23:09,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 13: [2022-11-25 11:23:09,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 13: [2022-11-25 11:23:09,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 13: [2022-11-25 11:23:09,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 13: [2022-11-25 11:23:09,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 13: [2022-11-25 11:23:09,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 19: [2022-11-25 11:23:09,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 45: [2022-11-25 11:23:09,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt. 45: [2022-11-25 11:23:09,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt. 45: [2022-11-25 11:23:09,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt. 45: [2022-11-25 11:23:09,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt. 45: [2022-11-25 11:23:09,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt 45: [2022-11-25 11:23:09,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt. 45: [2022-11-25 11:23:09,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt 45: [2022-11-25 11:23:09,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 45: [2022-11-25 11:23:09,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt 45: [2022-11-25 11:23:09,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt 45: [2022-11-25 11:23:09,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt 45: [2022-11-25 11:23:09,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 45: [2022-11-25 11:23:09,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt 45: [2022-11-25 11:23:09,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt 45: [2022-11-25 11:23:09,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 45: [2022-11-25 11:23:09,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 45: [2022-11-25 11:23:09,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 45: [2022-11-25 11:23:09,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 45: [2022-11-25 11:23:09,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 19: [2022-11-25 11:23:09,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 19: [2022-11-25 11:23:09,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 36: [2022-11-25 11:23:09,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt. 36: [2022-11-25 11:23:09,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt. 36: [2022-11-25 11:23:09,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt. 36: [2022-11-25 11:23:09,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt. 36: [2022-11-25 11:23:09,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt 36: [2022-11-25 11:23:09,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt 36: [2022-11-25 11:23:09,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt 36: [2022-11-25 11:23:09,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt 36: [2022-11-25 11:23:09,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 36: [2022-11-25 11:23:09,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 36: [2022-11-25 11:23:09,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 36: [2022-11-25 11:23:09,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 63: [2022-11-25 11:23:09,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt. 63: [2022-11-25 11:23:09,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt 63: [2022-11-25 11:23:09,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 63: [2022-11-25 11:23:09,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt. 63: [2022-11-25 11:23:09,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt 33: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt. 33: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt. 33: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt. 33: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt. 33: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt. 29: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 41: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt. 41: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt. 41: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt. 41: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt. 63: [2022-11-25 11:23:09,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 33: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt. 33: [2022-11-25 11:23:09,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt 33: [2022-11-25 11:23:09,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt 33: [2022-11-25 11:23:09,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt 33: [2022-11-25 11:23:09,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt 33: [2022-11-25 11:23:09,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt 33: [2022-11-25 11:23:09,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt 33: [2022-11-25 11:23:09,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 33: [2022-11-25 11:23:09,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 33: [2022-11-25 11:23:09,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 33: [2022-11-25 11:23:09,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 33: [2022-11-25 11:23:09,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 33: [2022-11-25 11:23:09,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 33: [2022-11-25 11:23:09,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt. 41: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt. 33: [2022-11-25 11:23:09,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt 41: [2022-11-25 11:23:09,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt 41: [2022-11-25 11:23:09,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt 41: [2022-11-25 11:23:09,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt 33: [2022-11-25 11:23:09,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 41: [2022-11-25 11:23:09,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt 29: [2022-11-25 11:23:09,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 41: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 41: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 41: [2022-11-25 11:23:09,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt 29: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 41: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 41: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 29: [2022-11-25 11:23:09,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 41: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 29: [2022-11-25 11:23:09,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 41: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt. 29: [2022-11-25 11:23:09,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 41: [2022-11-25 11:23:09,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt 41: [2022-11-25 11:23:09,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 41: [2022-11-25 11:23:09,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt. 41: [2022-11-25 11:23:09,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt 41: [2022-11-25 11:23:09,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 11:23:09,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt. 51: [2022-11-25 11:23:09,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt. 51: [2022-11-25 11:23:09,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt. 51: [2022-11-25 11:23:09,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt. 51: [2022-11-25 11:23:09,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt. 51: [2022-11-25 11:23:09,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt 51: [2022-11-25 11:23:09,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt 51: [2022-11-25 11:23:09,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt 51: [2022-11-25 11:23:09,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 11:23:09,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt 51: [2022-11-25 11:23:09,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt 51: [2022-11-25 11:23:09,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 11:23:09,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 11:23:09,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 11:23:09,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 11:23:09,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt. 51: [2022-11-25 11:23:09,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt 51: [2022-11-25 11:23:09,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 11: [2022-11-25 11:23:09,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 11: [2022-11-25 11:23:09,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 11: [2022-11-25 11:23:09,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 11: [2022-11-25 11:23:09,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 11: [2022-11-25 11:23:09,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 11: [2022-11-25 11:23:09,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 11: [2022-11-25 11:23:09,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 11: [2022-11-25 11:23:09,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 11: [2022-11-25 11:23:09,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 11: [2022-11-25 11:23:09,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 11: [2022-11-25 11:23:09,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 11: [2022-11-25 11:23:09,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 11: [2022-11-25 11:23:09,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 11: [2022-11-25 11:23:09,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 11: [2022-11-25 11:23:09,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 11: [2022-11-25 11:23:09,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 11: [2022-11-25 11:23:09,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 11: [2022-11-25 11:23:09,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 11: [2022-11-25 11:23:09,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 11: [2022-11-25 11:23:09,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 11: [2022-11-25 11:23:09,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 11: [2022-11-25 11:23:09,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 11: [2022-11-25 11:23:09,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 11: [2022-11-25 11:23:09,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 11:23:09,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 9: [2022-11-25 11:23:09,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 9: [2022-11-25 11:23:09,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 9: [2022-11-25 11:23:09,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 9: [2022-11-25 11:23:09,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 9: [2022-11-25 11:23:09,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 9: [2022-11-25 11:23:09,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 9: [2022-11-25 11:23:09,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 9: [2022-11-25 11:23:09,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 9: [2022-11-25 11:23:09,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 11:23:09,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 9: [2022-11-25 11:23:09,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 9: [2022-11-25 11:23:09,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 9: [2022-11-25 11:23:09,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 9: [2022-11-25 11:23:09,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 11:23:09,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 9: [2022-11-25 11:23:09,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 11:23:09,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 11:23:09,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 11:23:09,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 11:23:09,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 9: [2022-11-25 11:23:09,459] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 9: [2022-11-25 11:23:09,459] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 9: [2022-11-25 11:23:09,459] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 36: [2022-11-25 11:23:09,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt. 8: [2022-11-25 11:23:09,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 8: [2022-11-25 11:23:09,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 8: [2022-11-25 11:23:09,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 8: [2022-11-25 11:23:09,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 8: [2022-11-25 11:23:09,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 8: [2022-11-25 11:23:09,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 36: [2022-11-25 11:23:09,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt 36: [2022-11-25 11:23:09,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 11:23:09,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt. 17: [2022-11-25 11:23:09,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 17: [2022-11-25 11:23:09,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 17: [2022-11-25 11:23:09,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 28: [2022-11-25 11:23:09,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 28: [2022-11-25 11:23:09,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 28: [2022-11-25 11:23:09,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 8: [2022-11-25 11:23:09,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 8: [2022-11-25 11:23:09,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 8: [2022-11-25 11:23:09,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 8: [2022-11-25 11:23:09,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 8: [2022-11-25 11:23:09,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 8: [2022-11-25 11:23:09,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 8: [2022-11-25 11:23:09,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 8: [2022-11-25 11:23:09,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 8: [2022-11-25 11:23:09,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 8: [2022-11-25 11:23:09,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 8: [2022-11-25 11:23:09,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 8: [2022-11-25 11:23:09,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 8: [2022-11-25 11:23:09,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 8: [2022-11-25 11:23:09,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 8: [2022-11-25 11:23:09,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 11:23:09,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt 28: [2022-11-25 11:23:09,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 28: [2022-11-25 11:23:09,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 28: [2022-11-25 11:23:09,470] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 51: [2022-11-25 11:23:09,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 28: [2022-11-25 11:23:09,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 28: [2022-11-25 11:23:09,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 28: [2022-11-25 11:23:09,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 28: [2022-11-25 11:23:09,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 28: [2022-11-25 11:23:09,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 28: [2022-11-25 11:23:09,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 28: [2022-11-25 11:23:09,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 28: [2022-11-25 11:23:09,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 28: [2022-11-25 11:23:09,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 28: [2022-11-25 11:23:09,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 28: [2022-11-25 11:23:09,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 28: [2022-11-25 11:23:09,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 22: [2022-11-25 11:23:09,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 22: [2022-11-25 11:23:09,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 22: [2022-11-25 11:23:09,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 22: [2022-11-25 11:23:09,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 22: [2022-11-25 11:23:09,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 22: [2022-11-25 11:23:09,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 22: [2022-11-25 11:23:09,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 22: [2022-11-25 11:23:09,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 22: [2022-11-25 11:23:09,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 22: [2022-11-25 11:23:09,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 22: [2022-11-25 11:23:09,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 22: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 22: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 22: [2022-11-25 11:23:09,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 22: [2022-11-25 11:23:09,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 22: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 22: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 22: [2022-11-25 11:23:09,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 22: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 22: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 22: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 17: [2022-11-25 11:23:09,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 17: [2022-11-25 11:23:09,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 17: [2022-11-25 11:23:09,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 17: [2022-11-25 11:23:09,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 17: [2022-11-25 11:23:09,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 17: [2022-11-25 11:23:09,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 17: [2022-11-25 11:23:09,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 17: [2022-11-25 11:23:09,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 17: [2022-11-25 11:23:09,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 17: [2022-11-25 11:23:09,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 17: [2022-11-25 11:23:09,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 17: [2022-11-25 11:23:09,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 17: [2022-11-25 11:23:09,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 17: [2022-11-25 11:23:09,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 17: [2022-11-25 11:23:09,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 17: [2022-11-25 11:23:09,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 17: [2022-11-25 11:23:09,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 17: [2022-11-25 11:23:09,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 11:23:09,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 5: [2022-11-25 11:23:09,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 5: [2022-11-25 11:23:09,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 5: [2022-11-25 11:23:09,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 18: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 18: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 18: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 60: [2022-11-25 11:23:09,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt. 60: [2022-11-25 11:23:09,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt. 60: [2022-11-25 11:23:09,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt. 60: [2022-11-25 11:23:09,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt. 5: [2022-11-25 11:23:09,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 60: [2022-11-25 11:23:09,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt 60: [2022-11-25 11:23:09,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt 5: [2022-11-25 11:23:09,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 60: [2022-11-25 11:23:09,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt 60: [2022-11-25 11:23:09,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt 5: [2022-11-25 11:23:09,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 60: [2022-11-25 11:23:09,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 11:23:09,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 5: [2022-11-25 11:23:09,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 60: [2022-11-25 11:23:09,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 60: [2022-11-25 11:23:09,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 11:23:09,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 11:23:09,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 60: [2022-11-25 11:23:09,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 11:23:09,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 11:23:09,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 11:23:09,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 11:23:09,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 11:23:09,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 5: [2022-11-25 11:23:09,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 5: [2022-11-25 11:23:09,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 18: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 18: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 18: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 18: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 18: [2022-11-25 11:23:09,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 18: [2022-11-25 11:23:09,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 18: [2022-11-25 11:23:09,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 18: [2022-11-25 11:23:09,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 18: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 18: [2022-11-25 11:23:09,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 18: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 18: [2022-11-25 11:23:09,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 18: [2022-11-25 11:23:09,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 18: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 18: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 18: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 18: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 18: [2022-11-25 11:23:09,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 7: [2022-11-25 11:23:09,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 7: [2022-11-25 11:23:09,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 7: [2022-11-25 11:23:09,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 7: [2022-11-25 11:23:09,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 7: [2022-11-25 11:23:09,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 7: [2022-11-25 11:23:09,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 7: [2022-11-25 11:23:09,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 7: [2022-11-25 11:23:09,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 7: [2022-11-25 11:23:09,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 7: [2022-11-25 11:23:09,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 7: [2022-11-25 11:23:09,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 7: [2022-11-25 11:23:09,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 7: [2022-11-25 11:23:09,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 7: [2022-11-25 11:23:09,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 7: [2022-11-25 11:23:09,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 7: [2022-11-25 11:23:09,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 7: [2022-11-25 11:23:09,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 7: [2022-11-25 11:23:09,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 44: [2022-11-25 11:23:09,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt. 44: [2022-11-25 11:23:09,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt. 44: [2022-11-25 11:23:09,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt. 44: [2022-11-25 11:23:09,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt. 44: [2022-11-25 11:23:09,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt 44: [2022-11-25 11:23:09,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt 44: [2022-11-25 11:23:09,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt. 44: [2022-11-25 11:23:09,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt. 44: [2022-11-25 11:23:09,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt 44: [2022-11-25 11:23:09,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt 44: [2022-11-25 11:23:09,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 44: [2022-11-25 11:23:09,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 44: [2022-11-25 11:23:09,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt 44: [2022-11-25 11:23:09,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 44: [2022-11-25 11:23:09,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 44: [2022-11-25 11:23:09,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt 44: [2022-11-25 11:23:09,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 44: [2022-11-25 11:23:09,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 44: [2022-11-25 11:23:09,494] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt. 44: [2022-11-25 11:23:09,494] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt 44: [2022-11-25 11:23:09,494] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 38: [2022-11-25 11:23:09,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt. 38: [2022-11-25 11:23:09,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt. 38: [2022-11-25 11:23:09,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt. 38: [2022-11-25 11:23:09,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt. 21: [2022-11-25 11:23:09,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 21: [2022-11-25 11:23:09,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 21: [2022-11-25 11:23:09,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 21: [2022-11-25 11:23:09,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 44: [2022-11-25 11:23:09,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt. 44: [2022-11-25 11:23:09,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt 25: [2022-11-25 11:23:09,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 38: [2022-11-25 11:23:09,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt. 38: [2022-11-25 11:23:09,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt 38: [2022-11-25 11:23:09,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt 38: [2022-11-25 11:23:09,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt 38: [2022-11-25 11:23:09,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt 38: [2022-11-25 11:23:09,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 38: [2022-11-25 11:23:09,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt 38: [2022-11-25 11:23:09,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 38: [2022-11-25 11:23:09,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 38: [2022-11-25 11:23:09,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 38: [2022-11-25 11:23:09,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 38: [2022-11-25 11:23:09,496] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt. 38: [2022-11-25 11:23:09,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt 38: [2022-11-25 11:23:09,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 21: [2022-11-25 11:23:09,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 21: [2022-11-25 11:23:09,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 21: [2022-11-25 11:23:09,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 21: [2022-11-25 11:23:09,497] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 21: [2022-11-25 11:23:09,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 21: [2022-11-25 11:23:09,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 21: [2022-11-25 11:23:09,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 21: [2022-11-25 11:23:09,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 21: [2022-11-25 11:23:09,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 21: [2022-11-25 11:23:09,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 21: [2022-11-25 11:23:09,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 21: [2022-11-25 11:23:09,497] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 21: [2022-11-25 11:23:09,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 21: [2022-11-25 11:23:09,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 21: [2022-11-25 11:23:09,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 21: [2022-11-25 11:23:09,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 21: [2022-11-25 11:23:09,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 11:23:09,498] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 44: [2022-11-25 11:23:09,497] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 25: [2022-11-25 11:23:09,498] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 63: [2022-11-25 11:23:09,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt. 57: [2022-11-25 11:23:09,502] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt. 48: [2022-11-25 11:23:09,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt. 48: [2022-11-25 11:23:09,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt. 48: [2022-11-25 11:23:09,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt. 48: [2022-11-25 11:23:09,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt. 48: [2022-11-25 11:23:09,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt. 63: [2022-11-25 11:23:09,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt 63: [2022-11-25 11:23:09,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 48: [2022-11-25 11:23:09,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt. 48: [2022-11-25 11:23:09,500] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt. 48: [2022-11-25 11:23:09,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt 48: [2022-11-25 11:23:09,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt 48: [2022-11-25 11:23:09,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt 48: [2022-11-25 11:23:09,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt 48: [2022-11-25 11:23:09,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt 48: [2022-11-25 11:23:09,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt 48: [2022-11-25 11:23:09,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 48: [2022-11-25 11:23:09,500] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt 48: [2022-11-25 11:23:09,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 48: [2022-11-25 11:23:09,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 48: [2022-11-25 11:23:09,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 48: [2022-11-25 11:23:09,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 48: [2022-11-25 11:23:09,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 48: [2022-11-25 11:23:09,500] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 57: [2022-11-25 11:23:09,502] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt 57: [2022-11-25 11:23:09,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 11:23:09,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt. 47: [2022-11-25 11:23:09,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt. 47: [2022-11-25 11:23:09,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt. 47: [2022-11-25 11:23:09,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt. 47: [2022-11-25 11:23:09,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt. 47: [2022-11-25 11:23:09,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt. 47: [2022-11-25 11:23:09,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt 47: [2022-11-25 11:23:09,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt 47: [2022-11-25 11:23:09,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt. 47: [2022-11-25 11:23:09,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt 47: [2022-11-25 11:23:09,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt 47: [2022-11-25 11:23:09,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 11:23:09,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 11:23:09,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt 47: [2022-11-25 11:23:09,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt 47: [2022-11-25 11:23:09,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 11:23:09,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt 47: [2022-11-25 11:23:09,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 11:23:09,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 11:23:09,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 11:23:09,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 24: [2022-11-25 11:23:09,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 24: [2022-11-25 11:23:09,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 24: [2022-11-25 11:23:09,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 24: [2022-11-25 11:23:09,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 24: [2022-11-25 11:23:09,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 24: [2022-11-25 11:23:09,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 24: [2022-11-25 11:23:09,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 24: [2022-11-25 11:23:09,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 24: [2022-11-25 11:23:09,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 24: [2022-11-25 11:23:09,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 24: [2022-11-25 11:23:09,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 24: [2022-11-25 11:23:09,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 24: [2022-11-25 11:23:09,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 24: [2022-11-25 11:23:09,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 24: [2022-11-25 11:23:09,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 24: [2022-11-25 11:23:09,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 24: [2022-11-25 11:23:09,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 24: [2022-11-25 11:23:09,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 7: [2022-11-25 11:23:09,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 8: [2022-11-25 11:23:09,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 7: [2022-11-25 11:23:09,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 7: [2022-11-25 11:23:09,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 8: [2022-11-25 11:23:09,519] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 8: [2022-11-25 11:23:09,519] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 10: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 10: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 10: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 10: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 10: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 10: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 10: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 10: [2022-11-25 11:23:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 10: [2022-11-25 11:23:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 60: [2022-11-25 11:23:09,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt. 10: [2022-11-25 11:23:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 10: [2022-11-25 11:23:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 10: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 42: [2022-11-25 11:23:09,531] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt. 42: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt. 10: [2022-11-25 11:23:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 10: [2022-11-25 11:23:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 10: [2022-11-25 11:23:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 10: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 10: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 10: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 10: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 10: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 10: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 60: [2022-11-25 11:23:09,531] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt 60: [2022-11-25 11:23:09,531] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 42: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt. 42: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt. 42: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt. 42: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt. 42: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt. 42: [2022-11-25 11:23:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt 42: [2022-11-25 11:23:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt 42: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt. 42: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 42: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 42: [2022-11-25 11:23:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt 42: [2022-11-25 11:23:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt 42: [2022-11-25 11:23:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt 42: [2022-11-25 11:23:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt 42: [2022-11-25 11:23:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt 42: [2022-11-25 11:23:09,532] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt 42: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 42: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 42: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 42: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 42: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 42: [2022-11-25 11:23:09,532] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 55: [2022-11-25 11:23:09,537] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt. 55: [2022-11-25 11:23:09,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt 55: [2022-11-25 11:23:09,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 56: [2022-11-25 11:23:09,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt. 56: [2022-11-25 11:23:09,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt. 56: [2022-11-25 11:23:09,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt. 56: [2022-11-25 11:23:09,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt. 16: [2022-11-25 11:23:09,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 16: [2022-11-25 11:23:09,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 56: [2022-11-25 11:23:09,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt. 56: [2022-11-25 11:23:09,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt. 56: [2022-11-25 11:23:09,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt 56: [2022-11-25 11:23:09,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt 56: [2022-11-25 11:23:09,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt 56: [2022-11-25 11:23:09,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt 56: [2022-11-25 11:23:09,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt 56: [2022-11-25 11:23:09,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 56: [2022-11-25 11:23:09,537] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt 56: [2022-11-25 11:23:09,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 56: [2022-11-25 11:23:09,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 56: [2022-11-25 11:23:09,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 56: [2022-11-25 11:23:09,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 56: [2022-11-25 11:23:09,537] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 21: [2022-11-25 11:23:09,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 21: [2022-11-25 11:23:09,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 21: [2022-11-25 11:23:09,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 16: [2022-11-25 11:23:09,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 16: [2022-11-25 11:23:09,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 16: [2022-11-25 11:23:09,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 16: [2022-11-25 11:23:09,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 16: [2022-11-25 11:23:09,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 16: [2022-11-25 11:23:09,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 43: [2022-11-25 11:23:09,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt. 43: [2022-11-25 11:23:09,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt. 43: [2022-11-25 11:23:09,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt. 43: [2022-11-25 11:23:09,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt. 16: [2022-11-25 11:23:09,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 16: [2022-11-25 11:23:09,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 16: [2022-11-25 11:23:09,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 16: [2022-11-25 11:23:09,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 16: [2022-11-25 11:23:09,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 16: [2022-11-25 11:23:09,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 16: [2022-11-25 11:23:09,535] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 16: [2022-11-25 11:23:09,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 16: [2022-11-25 11:23:09,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 16: [2022-11-25 11:23:09,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 16: [2022-11-25 11:23:09,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 16: [2022-11-25 11:23:09,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 16: [2022-11-25 11:23:09,535] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 43: [2022-11-25 11:23:09,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt. 43: [2022-11-25 11:23:09,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt. 43: [2022-11-25 11:23:09,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt. 43: [2022-11-25 11:23:09,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt 43: [2022-11-25 11:23:09,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt. 43: [2022-11-25 11:23:09,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt 43: [2022-11-25 11:23:09,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt 43: [2022-11-25 11:23:09,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 43: [2022-11-25 11:23:09,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt 43: [2022-11-25 11:23:09,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt 43: [2022-11-25 11:23:09,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt 43: [2022-11-25 11:23:09,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt 43: [2022-11-25 11:23:09,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 43: [2022-11-25 11:23:09,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 43: [2022-11-25 11:23:09,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt 43: [2022-11-25 11:23:09,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 43: [2022-11-25 11:23:09,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 43: [2022-11-25 11:23:09,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 43: [2022-11-25 11:23:09,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 43: [2022-11-25 11:23:09,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 27: [2022-11-25 11:23:09,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 27: [2022-11-25 11:23:09,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 27: [2022-11-25 11:23:09,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 27: [2022-11-25 11:23:09,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 27: [2022-11-25 11:23:09,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 27: [2022-11-25 11:23:09,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 27: [2022-11-25 11:23:09,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 27: [2022-11-25 11:23:09,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 27: [2022-11-25 11:23:09,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 27: [2022-11-25 11:23:09,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 27: [2022-11-25 11:23:09,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 27: [2022-11-25 11:23:09,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 27: [2022-11-25 11:23:09,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 27: [2022-11-25 11:23:09,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 27: [2022-11-25 11:23:09,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 27: [2022-11-25 11:23:09,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 27: [2022-11-25 11:23:09,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 27: [2022-11-25 11:23:09,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 27: [2022-11-25 11:23:09,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 27: [2022-11-25 11:23:09,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 27: [2022-11-25 11:23:09,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 27: [2022-11-25 11:23:09,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 27: [2022-11-25 11:23:09,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 27: [2022-11-25 11:23:09,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 62: [2022-11-25 11:23:09,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt. 62: [2022-11-25 11:23:09,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt. 62: [2022-11-25 11:23:09,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt. 62: [2022-11-25 11:23:09,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt. 62: [2022-11-25 11:23:09,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt. 62: [2022-11-25 11:23:09,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt. 62: [2022-11-25 11:23:09,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt 62: [2022-11-25 11:23:09,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt. 62: [2022-11-25 11:23:09,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt 62: [2022-11-25 11:23:09,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt 62: [2022-11-25 11:23:09,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt 62: [2022-11-25 11:23:09,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt 62: [2022-11-25 11:23:09,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt 62: [2022-11-25 11:23:09,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 62: [2022-11-25 11:23:09,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 62: [2022-11-25 11:23:09,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 62: [2022-11-25 11:23:09,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 62: [2022-11-25 11:23:09,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt 62: [2022-11-25 11:23:09,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 62: [2022-11-25 11:23:09,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 62: [2022-11-25 11:23:09,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 62: [2022-11-25 11:23:09,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt. 62: [2022-11-25 11:23:09,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt 62: [2022-11-25 11:23:09,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 60: [2022-11-25 11:23:09,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt. 60: [2022-11-25 11:23:09,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt 60: [2022-11-25 11:23:09,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 49: [2022-11-25 11:23:09,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt. 49: [2022-11-25 11:23:09,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt 49: [2022-11-25 11:23:09,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 63: [2022-11-25 11:23:09,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt. 63: [2022-11-25 11:23:09,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt 63: [2022-11-25 11:23:09,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 13: [2022-11-25 11:23:09,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 13: [2022-11-25 11:23:09,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 13: [2022-11-25 11:23:09,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 17: [2022-11-25 11:23:09,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 17: [2022-11-25 11:23:09,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 17: [2022-11-25 11:23:09,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 61: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt. 61: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt. 61: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt. 61: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt. 61: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt. 61: [2022-11-25 11:23:09,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt 61: [2022-11-25 11:23:09,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt 61: [2022-11-25 11:23:09,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt 61: [2022-11-25 11:23:09,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 61: [2022-11-25 11:23:09,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt 61: [2022-11-25 11:23:09,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt 61: [2022-11-25 11:23:09,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 61: [2022-11-25 11:23:09,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 61: [2022-11-25 11:23:09,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 61: [2022-11-25 11:23:09,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 61: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt. 61: [2022-11-25 11:23:09,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt 61: [2022-11-25 11:23:09,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 61: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt. 61: [2022-11-25 11:23:09,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt 61: [2022-11-25 11:23:09,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 53: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt. 53: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt. 53: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt. 53: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt. 53: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt. 53: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt. 53: [2022-11-25 11:23:09,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt 53: [2022-11-25 11:23:09,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt 53: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 53: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 53: [2022-11-25 11:23:09,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt 53: [2022-11-25 11:23:09,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt 53: [2022-11-25 11:23:09,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt 53: [2022-11-25 11:23:09,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt 53: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 53: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 53: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 53: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 53: [2022-11-25 11:23:09,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt. 53: [2022-11-25 11:23:09,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt 53: [2022-11-25 11:23:09,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 11:23:09,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 20: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 23: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 23: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 20: [2022-11-25 11:23:09,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 30: [2022-11-25 11:23:09,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 20: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 30: [2022-11-25 11:23:09,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 23: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 23: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 23: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 23: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 23: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 23: [2022-11-25 11:23:09,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 23: [2022-11-25 11:23:09,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 23: [2022-11-25 11:23:09,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 23: [2022-11-25 11:23:09,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 23: [2022-11-25 11:23:09,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 23: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 23: [2022-11-25 11:23:09,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 23: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 23: [2022-11-25 11:23:09,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 23: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 23: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 23: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 23: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 23: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 61: [2022-11-25 11:23:09,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt. 61: [2022-11-25 11:23:09,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt 61: [2022-11-25 11:23:09,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 52: [2022-11-25 11:23:09,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt. 52: [2022-11-25 11:23:09,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt. 52: [2022-11-25 11:23:09,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt. 52: [2022-11-25 11:23:09,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt. 54: [2022-11-25 11:23:09,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt. 54: [2022-11-25 11:23:09,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt. 54: [2022-11-25 11:23:09,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt. 54: [2022-11-25 11:23:09,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt. 54: [2022-11-25 11:23:09,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt. 54: [2022-11-25 11:23:09,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt 56: [2022-11-25 11:23:09,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt. 54: [2022-11-25 11:23:09,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt 54: [2022-11-25 11:23:09,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt 54: [2022-11-25 11:23:09,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 54: [2022-11-25 11:23:09,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt 54: [2022-11-25 11:23:09,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt 54: [2022-11-25 11:23:09,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 54: [2022-11-25 11:23:09,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 56: [2022-11-25 11:23:09,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt 54: [2022-11-25 11:23:09,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 56: [2022-11-25 11:23:09,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 54: [2022-11-25 11:23:09,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 54: [2022-11-25 11:23:09,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt. 54: [2022-11-25 11:23:09,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt. 54: [2022-11-25 11:23:09,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt 54: [2022-11-25 11:23:09,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 54: [2022-11-25 11:23:09,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt 54: [2022-11-25 11:23:09,581] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 52: [2022-11-25 11:23:09,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt. 52: [2022-11-25 11:23:09,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt. 52: [2022-11-25 11:23:09,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt. 52: [2022-11-25 11:23:09,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt 52: [2022-11-25 11:23:09,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt 52: [2022-11-25 11:23:09,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt 52: [2022-11-25 11:23:09,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt 52: [2022-11-25 11:23:09,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt 52: [2022-11-25 11:23:09,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt 52: [2022-11-25 11:23:09,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt 52: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 52: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 52: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 52: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 52: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 52: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 52: [2022-11-25 11:23:09,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 47: [2022-11-25 11:23:09,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt. 47: [2022-11-25 11:23:09,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt 47: [2022-11-25 11:23:09,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 40: [2022-11-25 11:23:09,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt. 40: [2022-11-25 11:23:09,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt. 40: [2022-11-25 11:23:09,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt. 40: [2022-11-25 11:23:09,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt. 40: [2022-11-25 11:23:09,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt. 40: [2022-11-25 11:23:09,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt. 40: [2022-11-25 11:23:09,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt. 40: [2022-11-25 11:23:09,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt 40: [2022-11-25 11:23:09,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt 40: [2022-11-25 11:23:09,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt 40: [2022-11-25 11:23:09,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt 40: [2022-11-25 11:23:09,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt 40: [2022-11-25 11:23:09,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt 40: [2022-11-25 11:23:09,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 40: [2022-11-25 11:23:09,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt 40: [2022-11-25 11:23:09,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 40: [2022-11-25 11:23:09,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 40: [2022-11-25 11:23:09,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 40: [2022-11-25 11:23:09,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 40: [2022-11-25 11:23:09,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 40: [2022-11-25 11:23:09,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 40: [2022-11-25 11:23:09,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt. 40: [2022-11-25 11:23:09,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt 40: [2022-11-25 11:23:09,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 14: [2022-11-25 11:23:09,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 14: [2022-11-25 11:23:09,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 14: [2022-11-25 11:23:09,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 14: [2022-11-25 11:23:09,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 14: [2022-11-25 11:23:09,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 14: [2022-11-25 11:23:09,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 14: [2022-11-25 11:23:09,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 14: [2022-11-25 11:23:09,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 14: [2022-11-25 11:23:09,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 14: [2022-11-25 11:23:09,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 14: [2022-11-25 11:23:09,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 14: [2022-11-25 11:23:09,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 14: [2022-11-25 11:23:09,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 14: [2022-11-25 11:23:09,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 14: [2022-11-25 11:23:09,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 14: [2022-11-25 11:23:09,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 14: [2022-11-25 11:23:09,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 14: [2022-11-25 11:23:09,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 14: [2022-11-25 11:23:09,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 14: [2022-11-25 11:23:09,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 14: [2022-11-25 11:23:09,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 14: [2022-11-25 11:23:09,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 14: [2022-11-25 11:23:09,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 14: [2022-11-25 11:23:09,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 59: [2022-11-25 11:23:09,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt. 59: [2022-11-25 11:23:09,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt. 59: [2022-11-25 11:23:09,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt. 50: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt. 50: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt. 50: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt. 50: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt. 50: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt. 50: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt. 59: [2022-11-25 11:23:09,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt. 59: [2022-11-25 11:23:09,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt. 59: [2022-11-25 11:23:09,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt. 59: [2022-11-25 11:23:09,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt. 59: [2022-11-25 11:23:09,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt. 59: [2022-11-25 11:23:09,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt 50: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt. 50: [2022-11-25 11:23:09,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt 50: [2022-11-25 11:23:09,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt 59: [2022-11-25 11:23:09,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt 50: [2022-11-25 11:23:09,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt 59: [2022-11-25 11:23:09,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt 50: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt. 59: [2022-11-25 11:23:09,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt 59: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 59: [2022-11-25 11:23:09,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt 59: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 59: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 59: [2022-11-25 11:23:09,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt 50: [2022-11-25 11:23:09,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt 50: [2022-11-25 11:23:09,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt 59: [2022-11-25 11:23:09,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt 59: [2022-11-25 11:23:09,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt 50: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 59: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 59: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 50: [2022-11-25 11:23:09,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt 50: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 59: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 50: [2022-11-25 11:23:09,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt 59: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 59: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 50: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 50: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 50: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 50: [2022-11-25 11:23:09,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt 50: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 50: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 50: [2022-11-25 11:23:09,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 37: [2022-11-25 11:23:09,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt. 37: [2022-11-25 11:23:09,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt. 37: [2022-11-25 11:23:09,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt. 37: [2022-11-25 11:23:09,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt. 37: [2022-11-25 11:23:09,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt. 37: [2022-11-25 11:23:09,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt 37: [2022-11-25 11:23:09,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt 37: [2022-11-25 11:23:09,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt 37: [2022-11-25 11:23:09,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt 37: [2022-11-25 11:23:09,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 37: [2022-11-25 11:23:09,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 37: [2022-11-25 11:23:09,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 37: [2022-11-25 11:23:09,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt 37: [2022-11-25 11:23:09,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 37: [2022-11-25 11:23:09,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 35: [2022-11-25 11:23:09,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt. 35: [2022-11-25 11:23:09,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt. 35: [2022-11-25 11:23:09,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt. 35: [2022-11-25 11:23:09,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt. 35: [2022-11-25 11:23:09,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt. 35: [2022-11-25 11:23:09,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt. 35: [2022-11-25 11:23:09,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt. 35: [2022-11-25 11:23:09,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt 35: [2022-11-25 11:23:09,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt 35: [2022-11-25 11:23:09,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt 35: [2022-11-25 11:23:09,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt 35: [2022-11-25 11:23:09,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt 35: [2022-11-25 11:23:09,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt 35: [2022-11-25 11:23:09,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 35: [2022-11-25 11:23:09,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 35: [2022-11-25 11:23:09,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 35: [2022-11-25 11:23:09,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt 35: [2022-11-25 11:23:09,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 35: [2022-11-25 11:23:09,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 35: [2022-11-25 11:23:09,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 35: [2022-11-25 11:23:09,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 37: [2022-11-25 11:23:09,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt. 37: [2022-11-25 11:23:09,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt 37: [2022-11-25 11:23:09,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 60: [2022-11-25 11:23:09,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt. 60: [2022-11-25 11:23:09,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt 60: [2022-11-25 11:23:09,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 26: [2022-11-25 11:23:09,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 26: [2022-11-25 11:23:09,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 26: [2022-11-25 11:23:09,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 26: [2022-11-25 11:23:09,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 26: [2022-11-25 11:23:09,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 26: [2022-11-25 11:23:09,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 26: [2022-11-25 11:23:09,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 26: [2022-11-25 11:23:09,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 26: [2022-11-25 11:23:09,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 26: [2022-11-25 11:23:09,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 26: [2022-11-25 11:23:09,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 26: [2022-11-25 11:23:09,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 26: [2022-11-25 11:23:09,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 26: [2022-11-25 11:23:09,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 26: [2022-11-25 11:23:09,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 26: [2022-11-25 11:23:09,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 26: [2022-11-25 11:23:09,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 26: [2022-11-25 11:23:09,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 26: [2022-11-25 11:23:09,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 26: [2022-11-25 11:23:09,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 26: [2022-11-25 11:23:09,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 26: [2022-11-25 11:23:09,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 26: [2022-11-25 11:23:09,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 26: [2022-11-25 11:23:09,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 3: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 3: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 3: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 3: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 3: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 3: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 3: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 3: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 15: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 15: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 3: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 15: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 15: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 3: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 15: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 3: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 3: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 3: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 3: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 15: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 3: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 15: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 15: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 3: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 15: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 15: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 3: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 15: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 3: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 3: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 15: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 3: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 15: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 39: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt. 39: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt. 39: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt. 39: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt. 3: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 15: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 39: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt. 39: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt. 39: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt. 39: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt. 12: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 15: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 39: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt 39: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt 39: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt 12: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 12: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 12: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 15: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 39: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt 39: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt 39: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt 39: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt 12: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 12: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 12: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 15: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 39: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 39: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 39: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 12: [2022-11-25 11:23:09,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 15: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 39: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt 12: [2022-11-25 11:23:09,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 12: [2022-11-25 11:23:09,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 12: [2022-11-25 11:23:09,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 39: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 39: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 39: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 12: [2022-11-25 11:23:09,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 12: [2022-11-25 11:23:09,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 15: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 15: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 39: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 12: [2022-11-25 11:23:09,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 12: [2022-11-25 11:23:09,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 15: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 39: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 12: [2022-11-25 11:23:09,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 12: [2022-11-25 11:23:09,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 12: [2022-11-25 11:23:09,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 12: [2022-11-25 11:23:09,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt. 58: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt. 58: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt. 58: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt. 12: [2022-11-25 11:23:09,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 12: [2022-11-25 11:23:09,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 33: [2022-11-25 11:23:09,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt. 33: [2022-11-25 11:23:09,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt 33: [2022-11-25 11:23:09,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt. 58: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt 58: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt 58: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt 58: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt 58: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt 58: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt. 58: [2022-11-25 11:23:09,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt 58: [2022-11-25 11:23:09,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 31: [2022-11-25 11:23:09,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 31: [2022-11-25 11:23:09,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 31: [2022-11-25 11:23:09,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 31: [2022-11-25 11:23:09,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 31: [2022-11-25 11:23:09,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 31: [2022-11-25 11:23:09,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 31: [2022-11-25 11:23:09,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 31: [2022-11-25 11:23:09,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 31: [2022-11-25 11:23:09,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 31: [2022-11-25 11:23:09,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 31: [2022-11-25 11:23:09,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 31: [2022-11-25 11:23:09,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 31: [2022-11-25 11:23:09,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 31: [2022-11-25 11:23:09,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 31: [2022-11-25 11:23:09,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 31: [2022-11-25 11:23:09,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 31: [2022-11-25 11:23:09,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 31: [2022-11-25 11:23:09,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 31: [2022-11-25 11:23:09,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 31: [2022-11-25 11:23:09,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 31: [2022-11-25 11:23:09,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 31: [2022-11-25 11:23:09,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 31: [2022-11-25 11:23:09,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 31: [2022-11-25 11:23:09,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 38: [2022-11-25 11:23:09,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt. 38: [2022-11-25 11:23:09,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt 38: [2022-11-25 11:23:09,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 11:23:09,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt. 58: [2022-11-25 11:23:09,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt 58: [2022-11-25 11:23:09,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 12: [2022-11-25 11:23:09,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 12: [2022-11-25 11:23:09,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 0: [2022-11-25 11:23:09,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 12: [2022-11-25 11:23:09,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: [2022-11-25 11:23:09,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 37: [2022-11-25 11:23:09,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt. 37: [2022-11-25 11:23:09,636] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt 37: [2022-11-25 11:23:09,636] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 54: [2022-11-25 11:23:09,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt. 54: [2022-11-25 11:23:09,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt 54: [2022-11-25 11:23:09,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 60: [2022-11-25 11:23:09,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt. 60: [2022-11-25 11:23:09,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt 60: [2022-11-25 11:23:09,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 11:23:09,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 5: [2022-11-25 11:23:09,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 5: [2022-11-25 11:23:09,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 1: [2022-11-25 11:23:09,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 1: [2022-11-25 11:23:09,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 1: [2022-11-25 11:23:09,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 1: [2022-11-25 11:23:09,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 1: [2022-11-25 11:23:09,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 1: [2022-11-25 11:23:09,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 1: [2022-11-25 11:23:09,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 1: [2022-11-25 11:23:09,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 1: [2022-11-25 11:23:09,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 1: [2022-11-25 11:23:09,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 1: [2022-11-25 11:23:09,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 1: [2022-11-25 11:23:09,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 1: [2022-11-25 11:23:09,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 1: [2022-11-25 11:23:09,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 1: [2022-11-25 11:23:09,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 1: [2022-11-25 11:23:09,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 1: [2022-11-25 11:23:09,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 1: [2022-11-25 11:23:09,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 1: [2022-11-25 11:23:09,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 1: [2022-11-25 11:23:09,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 1: [2022-11-25 11:23:09,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 1: [2022-11-25 11:23:09,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 1: [2022-11-25 11:23:09,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 1: [2022-11-25 11:23:09,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 46: [2022-11-25 11:23:09,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt. 46: [2022-11-25 11:23:09,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt. 46: [2022-11-25 11:23:09,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt. 46: [2022-11-25 11:23:09,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt. 46: [2022-11-25 11:23:09,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt 46: [2022-11-25 11:23:09,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt 46: [2022-11-25 11:23:09,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt 46: [2022-11-25 11:23:09,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt. 46: [2022-11-25 11:23:09,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt. 46: [2022-11-25 11:23:09,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt. 46: [2022-11-25 11:23:09,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt 46: [2022-11-25 11:23:09,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 46: [2022-11-25 11:23:09,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 46: [2022-11-25 11:23:09,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 46: [2022-11-25 11:23:09,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt 46: [2022-11-25 11:23:09,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 46: [2022-11-25 11:23:09,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt 46: [2022-11-25 11:23:09,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt 46: [2022-11-25 11:23:09,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 46: [2022-11-25 11:23:09,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 46: [2022-11-25 11:23:09,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 46: [2022-11-25 11:23:09,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt. 46: [2022-11-25 11:23:09,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt 46: [2022-11-25 11:23:09,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 7: [2022-11-25 11:23:09,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 7: [2022-11-25 11:23:09,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 7: [2022-11-25 11:23:09,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 2: [2022-11-25 11:23:09,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 2: [2022-11-25 11:23:09,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 2: [2022-11-25 11:23:09,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 2: [2022-11-25 11:23:09,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 2: [2022-11-25 11:23:09,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 2: [2022-11-25 11:23:09,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 2: [2022-11-25 11:23:09,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 2: [2022-11-25 11:23:09,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 2: [2022-11-25 11:23:09,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 2: [2022-11-25 11:23:09,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 2: [2022-11-25 11:23:09,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 2: [2022-11-25 11:23:09,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 2: [2022-11-25 11:23:09,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 2: [2022-11-25 11:23:09,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 2: [2022-11-25 11:23:09,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 2: [2022-11-25 11:23:09,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 2: [2022-11-25 11:23:09,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 2: [2022-11-25 11:23:09,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 2: [2022-11-25 11:23:09,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 2: [2022-11-25 11:23:09,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 2: [2022-11-25 11:23:09,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 2: [2022-11-25 11:23:09,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 2: [2022-11-25 11:23:09,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 2: [2022-11-25 11:23:09,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 36: [2022-11-25 11:23:09,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt. 36: [2022-11-25 11:23:09,658] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt 36: [2022-11-25 11:23:09,658] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 15: [2022-11-25 11:23:09,666] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 15: [2022-11-25 11:23:09,666] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 15: [2022-11-25 11:23:09,666] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 18: [2022-11-25 11:23:09,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 18: [2022-11-25 11:23:09,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 18: [2022-11-25 11:23:09,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 53: [2022-11-25 11:23:09,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt. 53: [2022-11-25 11:23:09,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt 53: [2022-11-25 11:23:09,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 3: [2022-11-25 11:23:09,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 3: [2022-11-25 11:23:09,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 3: [2022-11-25 11:23:09,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 45: [2022-11-25 11:23:09,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt. 45: [2022-11-25 11:23:09,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt 45: [2022-11-25 11:23:09,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 35: [2022-11-25 11:23:09,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt. 35: [2022-11-25 11:23:09,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt 35: [2022-11-25 11:23:09,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 5: [2022-11-25 11:23:09,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 5: [2022-11-25 11:23:09,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 5: [2022-11-25 11:23:09,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 48: [2022-11-25 11:23:09,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt. 48: [2022-11-25 11:23:09,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt 48: [2022-11-25 11:23:09,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 24: [2022-11-25 11:23:09,691] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 24: [2022-11-25 11:23:09,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 24: [2022-11-25 11:23:09,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 13: [2022-11-25 11:23:09,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 13: [2022-11-25 11:23:09,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 13: [2022-11-25 11:23:09,692] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 19: [2022-11-25 11:23:09,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 19: [2022-11-25 11:23:09,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 19: [2022-11-25 11:23:09,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 34: [2022-11-25 11:23:09,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt. 34: [2022-11-25 11:23:09,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt. 34: [2022-11-25 11:23:09,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt 52: [2022-11-25 11:23:09,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt. 34: [2022-11-25 11:23:09,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt 34: [2022-11-25 11:23:09,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 34: [2022-11-25 11:23:09,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 52: [2022-11-25 11:23:09,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt 52: [2022-11-25 11:23:09,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 34: [2022-11-25 11:23:09,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt. 34: [2022-11-25 11:23:09,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt. 34: [2022-11-25 11:23:09,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt. 34: [2022-11-25 11:23:09,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt 34: [2022-11-25 11:23:09,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 11:23:09,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt. 34: [2022-11-25 11:23:09,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt 34: [2022-11-25 11:23:09,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt 34: [2022-11-25 11:23:09,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 11:23:09,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt 34: [2022-11-25 11:23:09,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 58: [2022-11-25 11:23:09,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 34: [2022-11-25 11:23:09,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt. 29: [2022-11-25 11:23:09,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 29: [2022-11-25 11:23:09,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 34: [2022-11-25 11:23:09,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt. 34: [2022-11-25 11:23:09,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt 34: [2022-11-25 11:23:09,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt. 29: [2022-11-25 11:23:09,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 34: [2022-11-25 11:23:09,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 34: [2022-11-25 11:23:09,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt 34: [2022-11-25 11:23:09,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt 34: [2022-11-25 11:23:09,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 34: [2022-11-25 11:23:09,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 56: [2022-11-25 11:23:09,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt. 56: [2022-11-25 11:23:09,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt 56: [2022-11-25 11:23:09,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 4: [2022-11-25 11:23:09,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 4: [2022-11-25 11:23:09,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 4: [2022-11-25 11:23:09,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 4: [2022-11-25 11:23:09,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 4: [2022-11-25 11:23:09,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 4: [2022-11-25 11:23:09,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 4: [2022-11-25 11:23:09,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 4: [2022-11-25 11:23:09,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 51: [2022-11-25 11:23:09,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt. 4: [2022-11-25 11:23:09,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 51: [2022-11-25 11:23:09,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt 4: [2022-11-25 11:23:09,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 4: [2022-11-25 11:23:09,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 4: [2022-11-25 11:23:09,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 4: [2022-11-25 11:23:09,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 4: [2022-11-25 11:23:09,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 4: [2022-11-25 11:23:09,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 51: [2022-11-25 11:23:09,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 4: [2022-11-25 11:23:09,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 4: [2022-11-25 11:23:09,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 4: [2022-11-25 11:23:09,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 4: [2022-11-25 11:23:09,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 4: [2022-11-25 11:23:09,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 4: [2022-11-25 11:23:09,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 28: [2022-11-25 11:23:09,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 28: [2022-11-25 11:23:09,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 28: [2022-11-25 11:23:09,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 23: [2022-11-25 11:23:09,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 23: [2022-11-25 11:23:09,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 23: [2022-11-25 11:23:09,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 6: [2022-11-25 11:23:09,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 6: [2022-11-25 11:23:09,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 6: [2022-11-25 11:23:09,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 36: [2022-11-25 11:23:09,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt. 36: [2022-11-25 11:23:09,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt 36: [2022-11-25 11:23:09,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 10: [2022-11-25 11:23:09,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 10: [2022-11-25 11:23:09,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 10: [2022-11-25 11:23:09,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 49: [2022-11-25 11:23:09,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt. 49: [2022-11-25 11:23:09,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt 49: [2022-11-25 11:23:09,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 36: [2022-11-25 11:23:09,722] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt. 36: [2022-11-25 11:23:09,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt 36: [2022-11-25 11:23:09,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 22: [2022-11-25 11:23:09,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 24: [2022-11-25 11:23:09,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 22: [2022-11-25 11:23:09,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 24: [2022-11-25 11:23:09,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 22: [2022-11-25 11:23:09,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 24: [2022-11-25 11:23:09,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 38: [2022-11-25 11:23:09,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt. 38: [2022-11-25 11:23:09,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt 38: [2022-11-25 11:23:09,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 55: [2022-11-25 11:23:09,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt. 55: [2022-11-25 11:23:09,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt 55: [2022-11-25 11:23:09,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 6: [2022-11-25 11:23:09,742] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 6: [2022-11-25 11:23:09,742] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 6: [2022-11-25 11:23:09,742] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 55: [2022-11-25 11:23:09,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt. 55: [2022-11-25 11:23:09,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt 55: [2022-11-25 11:23:09,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 4: [2022-11-25 11:23:09,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 4: [2022-11-25 11:23:09,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 4: [2022-11-25 11:23:09,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 6: [2022-11-25 11:23:09,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 6: [2022-11-25 11:23:09,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 6: [2022-11-25 11:23:09,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 28: [2022-11-25 11:23:09,754] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 28: [2022-11-25 11:23:09,754] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 28: [2022-11-25 11:23:09,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 41: [2022-11-25 11:23:09,769] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt. 41: [2022-11-25 11:23:09,769] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt 41: [2022-11-25 11:23:09,769] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 37: [2022-11-25 11:23:09,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt. 37: [2022-11-25 11:23:09,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt 37: [2022-11-25 11:23:09,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 16: [2022-11-25 11:23:09,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 16: [2022-11-25 11:23:09,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step1000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 16: [2022-11-25 11:23:09,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! 0: successfully saved checkpoint at iteration 1000 to checkpoints_8b7 63: time (ms) | save-checkpoint: 6913.95 63: iteration 1010/ 5494 | consumed samples: 1034240 | consumed tokens: 2118123520 | elapsed time per iteration (s): 6.66 | learning rate: 1.867E-04 | global batch size: 1024 | lm loss: 3.036755E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 153.848 | TFLOPs: 34.40 | 63: iteration 1020/ 5494 | consumed samples: 1044480 | consumed tokens: 2139095040 | elapsed time per iteration (s): 5.83 | learning rate: 1.864E-04 | global batch size: 1024 | lm loss: 3.029409E+00 | grad norm: 0.370 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.712 | TFLOPs: 39.28 | 63: iteration 1030/ 5494 | consumed samples: 1054720 | consumed tokens: 2160066560 | elapsed time per iteration (s): 5.92 | learning rate: 1.861E-04 | global batch size: 1024 | lm loss: 2.990776E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.918 | TFLOPs: 38.66 | 63: iteration 1040/ 5494 | consumed samples: 1064960 | consumed tokens: 2181038080 | elapsed time per iteration (s): 5.66 | learning rate: 1.858E-04 | global batch size: 1024 | lm loss: 2.981724E+00 | grad norm: 0.484 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.788 | TFLOPs: 40.42 | 63: iteration 1050/ 5494 | consumed samples: 1075200 | consumed tokens: 2202009600 | elapsed time per iteration (s): 5.92 | learning rate: 1.855E-04 | global batch size: 1024 | lm loss: 2.964180E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.051 | TFLOPs: 38.69 | 63: iteration 1060/ 5494 | consumed samples: 1085440 | consumed tokens: 2222981120 | elapsed time per iteration (s): 5.85 | learning rate: 1.853E-04 | global batch size: 1024 | lm loss: 2.975879E+00 | grad norm: 0.396 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.943 | TFLOPs: 39.11 | 63: iteration 1070/ 5494 | consumed samples: 1095680 | consumed tokens: 2243952640 | elapsed time per iteration (s): 5.66 | learning rate: 1.850E-04 | global batch size: 1024 | lm loss: 2.945570E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.972 | TFLOPs: 40.46 | 63: iteration 1080/ 5494 | consumed samples: 1105920 | consumed tokens: 2264924160 | elapsed time per iteration (s): 5.69 | learning rate: 1.847E-04 | global batch size: 1024 | lm loss: 2.960703E+00 | grad norm: 0.314 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.953 | TFLOPs: 40.23 | 63: iteration 1090/ 5494 | consumed samples: 1116160 | consumed tokens: 2285895680 | elapsed time per iteration (s): 6.31 | learning rate: 1.844E-04 | global batch size: 1024 | lm loss: 2.943051E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 162.261 | TFLOPs: 36.28 | 63: iteration 1100/ 5494 | consumed samples: 1126400 | consumed tokens: 2306867200 | elapsed time per iteration (s): 5.84 | learning rate: 1.841E-04 | global batch size: 1024 | lm loss: 2.942953E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.437 | TFLOPs: 39.22 | 63: iteration 1110/ 5494 | consumed samples: 1136640 | consumed tokens: 2327838720 | elapsed time per iteration (s): 5.58 | learning rate: 1.838E-04 | global batch size: 1024 | lm loss: 2.900695E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.664 | TFLOPs: 41.06 | 63: iteration 1120/ 5494 | consumed samples: 1146880 | consumed tokens: 2348810240 | elapsed time per iteration (s): 5.95 | learning rate: 1.835E-04 | global batch size: 1024 | lm loss: 2.893202E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.027 | TFLOPs: 38.46 | 63: iteration 1130/ 5494 | consumed samples: 1157120 | consumed tokens: 2369781760 | elapsed time per iteration (s): 5.54 | learning rate: 1.832E-04 | global batch size: 1024 | lm loss: 2.898812E+00 | grad norm: 0.249 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.803 | TFLOPs: 41.32 | 63: iteration 1140/ 5494 | consumed samples: 1167360 | consumed tokens: 2390753280 | elapsed time per iteration (s): 5.65 | learning rate: 1.829E-04 | global batch size: 1024 | lm loss: 2.903909E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.271 | TFLOPs: 40.53 | 63: iteration 1150/ 5494 | consumed samples: 1177600 | consumed tokens: 2411724800 | elapsed time per iteration (s): 6.01 | learning rate: 1.826E-04 | global batch size: 1024 | lm loss: 2.869392E+00 | grad norm: 0.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 170.521 | TFLOPs: 38.12 | 63: iteration 1160/ 5494 | consumed samples: 1187840 | consumed tokens: 2432696320 | elapsed time per iteration (s): 5.42 | learning rate: 1.823E-04 | global batch size: 1024 | lm loss: 2.875452E+00 | grad norm: 0.251 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.028 | TFLOPs: 42.26 | 63: iteration 1170/ 5494 | consumed samples: 1198080 | consumed tokens: 2453667840 | elapsed time per iteration (s): 5.78 | learning rate: 1.820E-04 | global batch size: 1024 | lm loss: 2.842657E+00 | grad norm: 0.245 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.040 | TFLOPs: 39.58 | 63: iteration 1180/ 5494 | consumed samples: 1208320 | consumed tokens: 2474639360 | elapsed time per iteration (s): 5.92 | learning rate: 1.817E-04 | global batch size: 1024 | lm loss: 2.843097E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.862 | TFLOPs: 38.65 | 63: iteration 1190/ 5494 | consumed samples: 1218560 | consumed tokens: 2495610880 | elapsed time per iteration (s): 5.67 | learning rate: 1.813E-04 | global batch size: 1024 | lm loss: 2.829501E+00 | grad norm: 0.304 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.494 | TFLOPs: 40.35 | 63: iteration 1200/ 5494 | consumed samples: 1228800 | consumed tokens: 2516582400 | elapsed time per iteration (s): 5.68 | learning rate: 1.810E-04 | global batch size: 1024 | lm loss: 2.844222E+00 | grad norm: 0.240 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.367 | TFLOPs: 40.32 | 63: iteration 1210/ 5494 | consumed samples: 1239040 | consumed tokens: 2537553920 | elapsed time per iteration (s): 5.55 | learning rate: 1.807E-04 | global batch size: 1024 | lm loss: 2.807220E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.560 | TFLOPs: 41.26 | 63: iteration 1220/ 5494 | consumed samples: 1249280 | consumed tokens: 2558525440 | elapsed time per iteration (s): 5.68 | learning rate: 1.804E-04 | global batch size: 1024 | lm loss: 2.828106E+00 | grad norm: 0.338 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.295 | TFLOPs: 40.31 | 63: iteration 1230/ 5494 | consumed samples: 1259520 | consumed tokens: 2579496960 | elapsed time per iteration (s): 5.55 | learning rate: 1.801E-04 | global batch size: 1024 | lm loss: 2.818738E+00 | grad norm: 0.274 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.410 | TFLOPs: 41.23 | 63: iteration 1240/ 5494 | consumed samples: 1269760 | consumed tokens: 2600468480 | elapsed time per iteration (s): 5.55 | learning rate: 1.797E-04 | global batch size: 1024 | lm loss: 2.817350E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.463 | TFLOPs: 41.24 | 63: iteration 1250/ 5494 | consumed samples: 1280000 | consumed tokens: 2621440000 | elapsed time per iteration (s): 5.82 | learning rate: 1.794E-04 | global batch size: 1024 | lm loss: 2.787200E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.863 | TFLOPs: 39.32 | 63: iteration 1260/ 5494 | consumed samples: 1290240 | consumed tokens: 2642411520 | elapsed time per iteration (s): 6.01 | learning rate: 1.791E-04 | global batch size: 1024 | lm loss: 2.768824E+00 | grad norm: 0.277 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 170.383 | TFLOPs: 38.09 | 63: iteration 1270/ 5494 | consumed samples: 1300480 | consumed tokens: 2663383040 | elapsed time per iteration (s): 5.58 | learning rate: 1.787E-04 | global batch size: 1024 | lm loss: 2.766957E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.354 | TFLOPs: 40.99 | 63: iteration 1280/ 5494 | consumed samples: 1310720 | consumed tokens: 2684354560 | elapsed time per iteration (s): 5.55 | learning rate: 1.784E-04 | global batch size: 1024 | lm loss: 2.779185E+00 | grad norm: 0.267 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.472 | TFLOPs: 41.24 | 63: iteration 1290/ 5494 | consumed samples: 1320960 | consumed tokens: 2705326080 | elapsed time per iteration (s): 5.83 | learning rate: 1.781E-04 | global batch size: 1024 | lm loss: 2.781754E+00 | grad norm: 0.391 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.779 | TFLOPs: 39.30 | 63: iteration 1300/ 5494 | consumed samples: 1331200 | consumed tokens: 2726297600 | elapsed time per iteration (s): 5.59 | learning rate: 1.777E-04 | global batch size: 1024 | lm loss: 2.774968E+00 | grad norm: 0.258 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.254 | TFLOPs: 40.97 | 63: iteration 1310/ 5494 | consumed samples: 1341440 | consumed tokens: 2747269120 | elapsed time per iteration (s): 6.06 | learning rate: 1.774E-04 | global batch size: 1024 | lm loss: 2.748240E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.881 | TFLOPs: 37.76 | 63: iteration 1320/ 5494 | consumed samples: 1351680 | consumed tokens: 2768240640 | elapsed time per iteration (s): 5.67 | learning rate: 1.770E-04 | global batch size: 1024 | lm loss: 2.752272E+00 | grad norm: 0.250 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.507 | TFLOPs: 40.36 | 63: iteration 1330/ 5494 | consumed samples: 1361920 | consumed tokens: 2789212160 | elapsed time per iteration (s): 5.67 | learning rate: 1.767E-04 | global batch size: 1024 | lm loss: 2.734439E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.449 | TFLOPs: 40.34 | 63: iteration 1340/ 5494 | consumed samples: 1372160 | consumed tokens: 2810183680 | elapsed time per iteration (s): 5.93 | learning rate: 1.763E-04 | global batch size: 1024 | lm loss: 2.741269E+00 | grad norm: 0.310 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.595 | TFLOPs: 38.59 | 63: iteration 1350/ 5494 | consumed samples: 1382400 | consumed tokens: 2831155200 | elapsed time per iteration (s): 5.81 | learning rate: 1.760E-04 | global batch size: 1024 | lm loss: 2.721545E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.355 | TFLOPs: 39.43 | 63: iteration 1360/ 5494 | consumed samples: 1392640 | consumed tokens: 2852126720 | elapsed time per iteration (s): 5.51 | learning rate: 1.756E-04 | global batch size: 1024 | lm loss: 2.727749E+00 | grad norm: 0.233 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.699 | TFLOPs: 41.52 | 63: iteration 1370/ 5494 | consumed samples: 1402880 | consumed tokens: 2873098240 | elapsed time per iteration (s): 5.72 | learning rate: 1.753E-04 | global batch size: 1024 | lm loss: 2.725698E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.994 | TFLOPs: 40.02 | 63: iteration 1380/ 5494 | consumed samples: 1413120 | consumed tokens: 2894069760 | elapsed time per iteration (s): 5.66 | learning rate: 1.749E-04 | global batch size: 1024 | lm loss: 2.717303E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.992 | TFLOPs: 40.46 | 63: iteration 1390/ 5494 | consumed samples: 1423360 | consumed tokens: 2915041280 | elapsed time per iteration (s): 5.54 | learning rate: 1.745E-04 | global batch size: 1024 | lm loss: 2.719729E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.864 | TFLOPs: 41.33 | 63: iteration 1400/ 5494 | consumed samples: 1433600 | consumed tokens: 2936012800 | elapsed time per iteration (s): 5.71 | learning rate: 1.742E-04 | global batch size: 1024 | lm loss: 2.698977E+00 | grad norm: 0.215 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.184 | TFLOPs: 40.06 | 63: iteration 1410/ 5494 | consumed samples: 1443840 | consumed tokens: 2956984320 | elapsed time per iteration (s): 5.81 | learning rate: 1.738E-04 | global batch size: 1024 | lm loss: 2.699133E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.257 | TFLOPs: 39.41 | 63: iteration 1420/ 5494 | consumed samples: 1454080 | consumed tokens: 2977955840 | elapsed time per iteration (s): 6.02 | learning rate: 1.734E-04 | global batch size: 1024 | lm loss: 2.699722E+00 | grad norm: 0.270 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 170.006 | TFLOPs: 38.01 | 63: iteration 1430/ 5494 | consumed samples: 1464320 | consumed tokens: 2998927360 | elapsed time per iteration (s): 5.54 | learning rate: 1.731E-04 | global batch size: 1024 | lm loss: 2.715108E+00 | grad norm: 0.307 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.871 | TFLOPs: 41.33 | 63: iteration 1440/ 5494 | consumed samples: 1474560 | consumed tokens: 3019898880 | elapsed time per iteration (s): 5.54 | learning rate: 1.727E-04 | global batch size: 1024 | lm loss: 3.847990E+00 | grad norm: 4.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.959 | TFLOPs: 41.35 | 63: iteration 1450/ 5494 | consumed samples: 1484800 | consumed tokens: 3040870400 | elapsed time per iteration (s): 5.77 | learning rate: 1.723E-04 | global batch size: 1024 | lm loss: 4.427395E+00 | grad norm: 2.806 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.479 | TFLOPs: 39.68 | 63: iteration 1460/ 5494 | consumed samples: 1495040 | consumed tokens: 3061841920 | elapsed time per iteration (s): 5.41 | learning rate: 1.720E-04 | global batch size: 1024 | lm loss: 3.847665E+00 | grad norm: 1.406 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.174 | TFLOPs: 42.29 | 63: iteration 1470/ 5494 | consumed samples: 1505280 | consumed tokens: 3082813440 | elapsed time per iteration (s): 5.80 | learning rate: 1.716E-04 | global batch size: 1024 | lm loss: 3.419533E+00 | grad norm: 1.482 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.613 | TFLOPs: 39.48 | 63: iteration 1480/ 5494 | consumed samples: 1515520 | consumed tokens: 3103784960 | elapsed time per iteration (s): 5.55 | learning rate: 1.712E-04 | global batch size: 1024 | lm loss: 3.177305E+00 | grad norm: 0.460 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.424 | TFLOPs: 41.23 | 63: iteration 1490/ 5494 | consumed samples: 1525760 | consumed tokens: 3124756480 | elapsed time per iteration (s): 5.41 | learning rate: 1.708E-04 | global batch size: 1024 | lm loss: 3.020963E+00 | grad norm: 0.393 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.359 | TFLOPs: 42.33 | 63: iteration 1500/ 5494 | consumed samples: 1536000 | consumed tokens: 3145728000 | elapsed time per iteration (s): 5.53 | learning rate: 1.704E-04 | global batch size: 1024 | lm loss: 2.910913E+00 | grad norm: 0.507 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.085 | TFLOPs: 41.38 | 63: iteration 1510/ 5494 | consumed samples: 1546240 | consumed tokens: 3166699520 | elapsed time per iteration (s): 5.69 | learning rate: 1.700E-04 | global batch size: 1024 | lm loss: 2.836571E+00 | grad norm: 0.265 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.901 | TFLOPs: 40.22 | 63: iteration 1520/ 5494 | consumed samples: 1556480 | consumed tokens: 3187671040 | elapsed time per iteration (s): 5.53 | learning rate: 1.697E-04 | global batch size: 1024 | lm loss: 2.795061E+00 | grad norm: 0.356 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.236 | TFLOPs: 41.41 | 63: iteration 1530/ 5494 | consumed samples: 1566720 | consumed tokens: 3208642560 | elapsed time per iteration (s): 5.94 | learning rate: 1.693E-04 | global batch size: 1024 | lm loss: 2.754069E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.354 | TFLOPs: 38.53 | 63: iteration 1540/ 5494 | consumed samples: 1576960 | consumed tokens: 3229614080 | elapsed time per iteration (s): 5.65 | learning rate: 1.689E-04 | global batch size: 1024 | lm loss: 2.737738E+00 | grad norm: 0.323 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.250 | TFLOPs: 40.52 | 63: iteration 1550/ 5494 | consumed samples: 1587200 | consumed tokens: 3250585600 | elapsed time per iteration (s): 5.53 | learning rate: 1.685E-04 | global batch size: 1024 | lm loss: 2.734557E+00 | grad norm: 0.212 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.252 | TFLOPs: 41.42 | 63: iteration 1560/ 5494 | consumed samples: 1597440 | consumed tokens: 3271557120 | elapsed time per iteration (s): 5.40 | learning rate: 1.681E-04 | global batch size: 1024 | lm loss: 2.744709E+00 | grad norm: 0.266 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.568 | TFLOPs: 42.38 | 63: iteration 1570/ 5494 | consumed samples: 1607680 | consumed tokens: 3292528640 | elapsed time per iteration (s): 5.71 | learning rate: 1.677E-04 | global batch size: 1024 | lm loss: 2.703711E+00 | grad norm: 0.201 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.269 | TFLOPs: 40.08 | 63: iteration 1580/ 5494 | consumed samples: 1617920 | consumed tokens: 3313500160 | elapsed time per iteration (s): 5.84 | learning rate: 1.673E-04 | global batch size: 1024 | lm loss: 2.690000E+00 | grad norm: 0.247 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.227 | TFLOPs: 39.18 | 63: iteration 1590/ 5494 | consumed samples: 1628160 | consumed tokens: 3334471680 | elapsed time per iteration (s): 5.55 | learning rate: 1.669E-04 | global batch size: 1024 | lm loss: 2.685124E+00 | grad norm: 0.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.385 | TFLOPs: 41.22 | 63: iteration 1600/ 5494 | consumed samples: 1638400 | consumed tokens: 3355443200 | elapsed time per iteration (s): 5.80 | learning rate: 1.665E-04 | global batch size: 1024 | lm loss: 2.678199E+00 | grad norm: 0.204 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.581 | TFLOPs: 39.48 | 63: iteration 1610/ 5494 | consumed samples: 1648640 | consumed tokens: 3376414720 | elapsed time per iteration (s): 5.53 | learning rate: 1.661E-04 | global batch size: 1024 | lm loss: 2.667587E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.218 | TFLOPs: 41.41 | 63: iteration 1620/ 5494 | consumed samples: 1658880 | consumed tokens: 3397386240 | elapsed time per iteration (s): 5.68 | learning rate: 1.657E-04 | global batch size: 1024 | lm loss: 2.646988E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.134 | TFLOPs: 40.27 | 63: iteration 1630/ 5494 | consumed samples: 1669120 | consumed tokens: 3418357760 | elapsed time per iteration (s): 5.53 | learning rate: 1.653E-04 | global batch size: 1024 | lm loss: 2.647671E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.096 | TFLOPs: 41.38 | 63: iteration 1640/ 5494 | consumed samples: 1679360 | consumed tokens: 3439329280 | elapsed time per iteration (s): 5.92 | learning rate: 1.648E-04 | global batch size: 1024 | lm loss: 2.643419E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.070 | TFLOPs: 38.69 | 63: iteration 1650/ 5494 | consumed samples: 1689600 | consumed tokens: 3460300800 | elapsed time per iteration (s): 5.93 | learning rate: 1.644E-04 | global batch size: 1024 | lm loss: 2.645143E+00 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.676 | TFLOPs: 38.60 | 63: iteration 1660/ 5494 | consumed samples: 1699840 | consumed tokens: 3481272320 | elapsed time per iteration (s): 5.96 | learning rate: 1.640E-04 | global batch size: 1024 | lm loss: 2.629522E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.748 | TFLOPs: 38.40 | 63: iteration 1670/ 5494 | consumed samples: 1710080 | consumed tokens: 3502243840 | elapsed time per iteration (s): 5.67 | learning rate: 1.636E-04 | global batch size: 1024 | lm loss: 2.620667E+00 | grad norm: 0.194 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.749 | TFLOPs: 40.41 | 63: iteration 1680/ 5494 | consumed samples: 1720320 | consumed tokens: 3523215360 | elapsed time per iteration (s): 5.67 | learning rate: 1.632E-04 | global batch size: 1024 | lm loss: 2.629012E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.577 | TFLOPs: 40.37 | 63: iteration 1690/ 5494 | consumed samples: 1730560 | consumed tokens: 3544186880 | elapsed time per iteration (s): 5.70 | learning rate: 1.628E-04 | global batch size: 1024 | lm loss: 2.623705E+00 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.655 | TFLOPs: 40.16 | 63: iteration 1700/ 5494 | consumed samples: 1740800 | consumed tokens: 3565158400 | elapsed time per iteration (s): 5.66 | learning rate: 1.623E-04 | global batch size: 1024 | lm loss: 2.613983E+00 | grad norm: 0.235 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.003 | TFLOPs: 40.47 | 63: iteration 1710/ 5494 | consumed samples: 1751040 | consumed tokens: 3586129920 | elapsed time per iteration (s): 5.52 | learning rate: 1.619E-04 | global batch size: 1024 | lm loss: 2.618659E+00 | grad norm: 0.294 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.579 | TFLOPs: 41.49 | 63: iteration 1720/ 5494 | consumed samples: 1761280 | consumed tokens: 3607101440 | elapsed time per iteration (s): 5.82 | learning rate: 1.615E-04 | global batch size: 1024 | lm loss: 2.640071E+00 | grad norm: 0.255 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.953 | TFLOPs: 39.34 | 63: iteration 1730/ 5494 | consumed samples: 1771520 | consumed tokens: 3628072960 | elapsed time per iteration (s): 5.67 | learning rate: 1.611E-04 | global batch size: 1024 | lm loss: 2.589859E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.461 | TFLOPs: 40.35 | 63: iteration 1740/ 5494 | consumed samples: 1781760 | consumed tokens: 3649044480 | elapsed time per iteration (s): 5.69 | learning rate: 1.606E-04 | global batch size: 1024 | lm loss: 2.607697E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.850 | TFLOPs: 40.21 | 63: iteration 1750/ 5494 | consumed samples: 1792000 | consumed tokens: 3670016000 | elapsed time per iteration (s): 5.94 | learning rate: 1.602E-04 | global batch size: 1024 | lm loss: 2.594390E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.432 | TFLOPs: 38.55 | 63: iteration 1760/ 5494 | consumed samples: 1802240 | consumed tokens: 3690987520 | elapsed time per iteration (s): 5.53 | learning rate: 1.598E-04 | global batch size: 1024 | lm loss: 2.597325E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.013 | TFLOPs: 41.36 | 63: iteration 1770/ 5494 | consumed samples: 1812480 | consumed tokens: 3711959040 | elapsed time per iteration (s): 5.63 | learning rate: 1.593E-04 | global batch size: 1024 | lm loss: 2.587263E+00 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.731 | TFLOPs: 40.63 | 63: iteration 1780/ 5494 | consumed samples: 1822720 | consumed tokens: 3732930560 | elapsed time per iteration (s): 5.52 | learning rate: 1.589E-04 | global batch size: 1024 | lm loss: 2.590788E+00 | grad norm: 0.214 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.513 | TFLOPs: 41.47 | 63: iteration 1790/ 5494 | consumed samples: 1832960 | consumed tokens: 3753902080 | elapsed time per iteration (s): 5.41 | learning rate: 1.585E-04 | global batch size: 1024 | lm loss: 2.601267E+00 | grad norm: 0.280 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.237 | TFLOPs: 42.31 | 63: iteration 1800/ 5494 | consumed samples: 1843200 | consumed tokens: 3774873600 | elapsed time per iteration (s): 6.04 | learning rate: 1.580E-04 | global batch size: 1024 | lm loss: 2.589921E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 169.505 | TFLOPs: 37.90 | 63: iteration 1810/ 5494 | consumed samples: 1853440 | consumed tokens: 3795845120 | elapsed time per iteration (s): 6.24 | learning rate: 1.576E-04 | global batch size: 1024 | lm loss: 2.586520E+00 | grad norm: 0.196 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 164.167 | TFLOPs: 36.70 | 63: iteration 1820/ 5494 | consumed samples: 1863680 | consumed tokens: 3816816640 | elapsed time per iteration (s): 5.71 | learning rate: 1.571E-04 | global batch size: 1024 | lm loss: 2.588548E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.353 | TFLOPs: 40.10 | 63: iteration 1830/ 5494 | consumed samples: 1873920 | consumed tokens: 3837788160 | elapsed time per iteration (s): 5.42 | learning rate: 1.567E-04 | global batch size: 1024 | lm loss: 2.578699E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.007 | TFLOPs: 42.26 | 63: iteration 1840/ 5494 | consumed samples: 1884160 | consumed tokens: 3858759680 | elapsed time per iteration (s): 5.92 | learning rate: 1.563E-04 | global batch size: 1024 | lm loss: 2.567197E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.856 | TFLOPs: 38.64 | 63: iteration 1850/ 5494 | consumed samples: 1894400 | consumed tokens: 3879731200 | elapsed time per iteration (s): 6.21 | learning rate: 1.558E-04 | global batch size: 1024 | lm loss: 2.560737E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 164.945 | TFLOPs: 36.88 | 63: iteration 1860/ 5494 | consumed samples: 1904640 | consumed tokens: 3900702720 | elapsed time per iteration (s): 5.88 | learning rate: 1.554E-04 | global batch size: 1024 | lm loss: 2.548870E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.278 | TFLOPs: 38.96 | 63: iteration 1870/ 5494 | consumed samples: 1914880 | consumed tokens: 3921674240 | elapsed time per iteration (s): 5.69 | learning rate: 1.549E-04 | global batch size: 1024 | lm loss: 2.548226E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.979 | TFLOPs: 40.24 | 63: iteration 1880/ 5494 | consumed samples: 1925120 | consumed tokens: 3942645760 | elapsed time per iteration (s): 5.64 | learning rate: 1.545E-04 | global batch size: 1024 | lm loss: 2.549072E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.586 | TFLOPs: 40.60 | 63: iteration 1890/ 5494 | consumed samples: 1935360 | consumed tokens: 3963617280 | elapsed time per iteration (s): 5.52 | learning rate: 1.540E-04 | global batch size: 1024 | lm loss: 2.553756E+00 | grad norm: 0.183 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.523 | TFLOPs: 41.48 | 63: iteration 1900/ 5494 | consumed samples: 1945600 | consumed tokens: 3984588800 | elapsed time per iteration (s): 5.69 | learning rate: 1.536E-04 | global batch size: 1024 | lm loss: 2.555726E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.986 | TFLOPs: 40.24 | 63: iteration 1910/ 5494 | consumed samples: 1955840 | consumed tokens: 4005560320 | elapsed time per iteration (s): 5.53 | learning rate: 1.531E-04 | global batch size: 1024 | lm loss: 2.544932E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.232 | TFLOPs: 41.41 | 63: iteration 1920/ 5494 | consumed samples: 1966080 | consumed tokens: 4026531840 | elapsed time per iteration (s): 5.93 | learning rate: 1.526E-04 | global batch size: 1024 | lm loss: 2.532160E+00 | grad norm: 0.208 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.543 | TFLOPs: 38.58 | 63: iteration 1930/ 5494 | consumed samples: 1976320 | consumed tokens: 4047503360 | elapsed time per iteration (s): 5.65 | learning rate: 1.522E-04 | global batch size: 1024 | lm loss: 2.541253E+00 | grad norm: 0.213 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.202 | TFLOPs: 40.51 | 63: iteration 1940/ 5494 | consumed samples: 1986560 | consumed tokens: 4068474880 | elapsed time per iteration (s): 5.80 | learning rate: 1.517E-04 | global batch size: 1024 | lm loss: 2.549542E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.622 | TFLOPs: 39.49 | 63: iteration 1950/ 5494 | consumed samples: 1996800 | consumed tokens: 4089446400 | elapsed time per iteration (s): 5.88 | learning rate: 1.513E-04 | global batch size: 1024 | lm loss: 2.539379E+00 | grad norm: 0.175 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.268 | TFLOPs: 38.96 | 63: iteration 1960/ 5494 | consumed samples: 2007040 | consumed tokens: 4110417920 | elapsed time per iteration (s): 5.66 | learning rate: 1.508E-04 | global batch size: 1024 | lm loss: 2.527486E+00 | grad norm: 0.169 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.055 | TFLOPs: 40.48 | 63: iteration 1970/ 5494 | consumed samples: 2017280 | consumed tokens: 4131389440 | elapsed time per iteration (s): 5.61 | learning rate: 1.503E-04 | global batch size: 1024 | lm loss: 2.530867E+00 | grad norm: 0.198 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.383 | TFLOPs: 40.77 | 63: iteration 1980/ 5494 | consumed samples: 2027520 | consumed tokens: 4152360960 | elapsed time per iteration (s): 5.83 | learning rate: 1.499E-04 | global batch size: 1024 | lm loss: 2.519202E+00 | grad norm: 0.173 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.689 | TFLOPs: 39.28 | 63: iteration 1990/ 5494 | consumed samples: 2037760 | consumed tokens: 4173332480 | elapsed time per iteration (s): 5.72 | learning rate: 1.494E-04 | global batch size: 1024 | lm loss: 2.526312E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.931 | TFLOPs: 40.00 | 0: [2022-11-25 12:58:26,228] [INFO] [logging.py:68:log_dist] [Rank 0] step=2000, skipped=0, lr=[0.00014893268673031913, 0.00014893268673031913, 0.00014893268673031913], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] 63: iteration 2000/ 5494 | consumed samples: 2048000 | consumed tokens: 4194304000 | elapsed time per iteration (s): 5.52 | learning rate: 1.489E-04 | global batch size: 1024 | lm loss: 2.530874E+00 | grad norm: 0.246 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.410 | TFLOPs: 41.45 | 0: steps: 2000 loss: 2.5557 iter time (s): 5.914 samples/sec: 173.144 63: ------------------------------------------------------------------------------------------ 63: valid loss at iteration 2000 | lm loss value: 2.529820E+00 | lm loss PPL: 1.255125E+01 | 63: ------------------------------------------------------------------------------------------ 0: saving checkpoint at iteration 2000 to checkpoints_8b7 0: [2022-11-25 12:58:28,088] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2000 is begin to save! 0: [2022-11-25 12:58:28,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_01-model_00-model_states.pt... 0: [2022-11-25 12:58:28,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_01-model_01-model_states.pt... 32: [2022-11-25 12:58:28,145] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_24-model_01-model_states.pt... 32: [2022-11-25 12:58:28,145] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_24-model_00-model_states.pt... 32: [2022-11-25 12:58:28,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_24-model_00-model_states.pt. 32: [2022-11-25 12:58:28,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_24-model_01-model_states.pt. 32: [2022-11-25 12:58:28,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_25-model_01-model_states.pt... 32: [2022-11-25 12:58:28,576] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_25-model_00-model_states.pt... 0: [2022-11-25 12:58:28,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_01-model_00-model_states.pt. 0: [2022-11-25 12:58:28,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_01-model_01-model_states.pt. 0: [2022-11-25 12:58:28,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_03-model_00-model_states.pt... 0: [2022-11-25 12:58:28,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_03-model_01-model_states.pt... 32: [2022-11-25 12:58:28,815] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_25-model_00-model_states.pt. 32: [2022-11-25 12:58:28,816] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_26-model_00-model_states.pt... 32: [2022-11-25 12:58:28,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_25-model_01-model_states.pt. 32: [2022-11-25 12:58:28,827] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_26-model_01-model_states.pt... 0: [2022-11-25 12:58:28,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_03-model_01-model_states.pt. 0: [2022-11-25 12:58:28,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_04-model_01-model_states.pt... 0: [2022-11-25 12:58:28,844] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_03-model_00-model_states.pt. 0: [2022-11-25 12:58:28,844] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_04-model_00-model_states.pt... 32: [2022-11-25 12:58:29,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_26-model_00-model_states.pt. 32: [2022-11-25 12:58:29,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_26-model_01-model_states.pt. 32: [2022-11-25 12:58:29,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_27-model_00-model_states.pt... 32: [2022-11-25 12:58:29,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_27-model_01-model_states.pt... 0: [2022-11-25 12:58:29,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_04-model_00-model_states.pt. 0: [2022-11-25 12:58:29,106] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_05-model_00-model_states.pt... 0: [2022-11-25 12:58:29,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_04-model_01-model_states.pt. 0: [2022-11-25 12:58:29,117] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_05-model_01-model_states.pt... 32: [2022-11-25 12:58:29,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_27-model_00-model_states.pt. 32: [2022-11-25 12:58:29,319] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_28-model_00-model_states.pt... 32: [2022-11-25 12:58:29,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_27-model_01-model_states.pt. 32: [2022-11-25 12:58:29,333] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_28-model_01-model_states.pt... 0: [2022-11-25 12:58:29,357] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_05-model_01-model_states.pt. 0: [2022-11-25 12:58:29,357] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_06-model_01-model_states.pt... 0: [2022-11-25 12:58:29,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_05-model_00-model_states.pt. 0: [2022-11-25 12:58:29,378] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_06-model_00-model_states.pt... 32: [2022-11-25 12:58:29,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_28-model_00-model_states.pt. 32: [2022-11-25 12:58:29,574] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_29-model_00-model_states.pt... 32: [2022-11-25 12:58:29,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_28-model_01-model_states.pt. 32: [2022-11-25 12:58:29,580] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_29-model_01-model_states.pt... 0: [2022-11-25 12:58:29,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_06-model_01-model_states.pt. 0: [2022-11-25 12:58:29,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_07-model_01-model_states.pt... 0: [2022-11-25 12:58:29,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_06-model_00-model_states.pt. 0: [2022-11-25 12:58:29,607] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_07-model_00-model_states.pt... 32: [2022-11-25 12:58:29,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_29-model_01-model_states.pt. 32: [2022-11-25 12:58:29,810] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_30-model_01-model_states.pt... 32: [2022-11-25 12:58:29,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_29-model_00-model_states.pt. 32: [2022-11-25 12:58:29,832] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_30-model_00-model_states.pt... 0: [2022-11-25 12:58:29,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_07-model_00-model_states.pt. 0: [2022-11-25 12:58:29,877] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_07-model_01-model_states.pt. 0: [2022-11-25 12:58:29,877] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_08-model_00-model_states.pt... 0: [2022-11-25 12:58:29,877] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_08-model_01-model_states.pt... 32: [2022-11-25 12:58:30,055] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_30-model_00-model_states.pt. 32: [2022-11-25 12:58:30,056] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_31-model_00-model_states.pt... 32: [2022-11-25 12:58:30,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_30-model_01-model_states.pt. 32: [2022-11-25 12:58:30,073] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_31-model_01-model_states.pt... 0: [2022-11-25 12:58:30,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_08-model_00-model_states.pt. 0: [2022-11-25 12:58:30,113] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_09-model_00-model_states.pt... 0: [2022-11-25 12:58:30,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_08-model_01-model_states.pt. 0: [2022-11-25 12:58:30,118] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_09-model_01-model_states.pt... 32: [2022-11-25 12:58:30,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_31-model_01-model_states.pt. 32: [2022-11-25 12:58:30,306] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_32-model_01-model_states.pt... 32: [2022-11-25 12:58:30,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_31-model_00-model_states.pt. 32: [2022-11-25 12:58:30,327] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_32-model_00-model_states.pt... 0: [2022-11-25 12:58:30,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_09-model_00-model_states.pt. 0: [2022-11-25 12:58:30,381] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_10-model_00-model_states.pt... 0: [2022-11-25 12:58:30,390] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_09-model_01-model_states.pt. 0: [2022-11-25 12:58:30,390] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_10-model_01-model_states.pt... 32: [2022-11-25 12:58:30,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_32-model_01-model_states.pt. 32: [2022-11-25 12:58:30,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_32-model_00-model_states.pt. 32: [2022-11-25 12:58:30,565] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_33-model_01-model_states.pt... 32: [2022-11-25 12:58:30,565] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_33-model_00-model_states.pt... 0: [2022-11-25 12:58:30,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_10-model_00-model_states.pt. 0: [2022-11-25 12:58:30,635] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_11-model_00-model_states.pt... 0: [2022-11-25 12:58:30,646] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_10-model_01-model_states.pt. 0: [2022-11-25 12:58:30,646] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_11-model_01-model_states.pt... 32: [2022-11-25 12:58:30,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_33-model_00-model_states.pt. 32: [2022-11-25 12:58:30,801] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_34-model_00-model_states.pt... 32: [2022-11-25 12:58:30,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_33-model_01-model_states.pt. 32: [2022-11-25 12:58:30,808] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_34-model_01-model_states.pt... 0: [2022-11-25 12:58:30,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_11-model_01-model_states.pt. 0: [2022-11-25 12:58:30,909] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_12-model_01-model_states.pt... 0: [2022-11-25 12:58:30,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_11-model_00-model_states.pt. 0: [2022-11-25 12:58:30,918] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_12-model_00-model_states.pt... 32: [2022-11-25 12:58:31,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_34-model_00-model_states.pt. 32: [2022-11-25 12:58:31,032] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_35-model_00-model_states.pt... 32: [2022-11-25 12:58:31,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_34-model_01-model_states.pt. 32: [2022-11-25 12:58:31,040] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_35-model_01-model_states.pt... 0: [2022-11-25 12:58:31,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_12-model_01-model_states.pt. 0: [2022-11-25 12:58:31,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_13-model_01-model_states.pt... 0: [2022-11-25 12:58:31,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_12-model_00-model_states.pt. 0: [2022-11-25 12:58:31,182] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_13-model_00-model_states.pt... 32: [2022-11-25 12:58:31,265] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_35-model_00-model_states.pt. 32: [2022-11-25 12:58:31,265] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_36-model_00-model_states.pt... 32: [2022-11-25 12:58:31,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_35-model_01-model_states.pt. 32: [2022-11-25 12:58:31,274] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_36-model_01-model_states.pt... 0: [2022-11-25 12:58:31,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_13-model_01-model_states.pt. 0: [2022-11-25 12:58:31,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_14-model_01-model_states.pt... 0: [2022-11-25 12:58:31,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_13-model_00-model_states.pt. 0: [2022-11-25 12:58:31,419] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_14-model_00-model_states.pt... 32: [2022-11-25 12:58:31,529] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_36-model_00-model_states.pt. 32: [2022-11-25 12:58:31,529] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_37-model_00-model_states.pt... 32: [2022-11-25 12:58:31,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_36-model_01-model_states.pt. 32: [2022-11-25 12:58:31,542] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_37-model_01-model_states.pt... 0: [2022-11-25 12:58:31,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_14-model_01-model_states.pt. 0: [2022-11-25 12:58:31,643] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_15-model_01-model_states.pt... 0: [2022-11-25 12:58:31,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_14-model_00-model_states.pt. 0: [2022-11-25 12:58:31,648] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_15-model_00-model_states.pt... 32: [2022-11-25 12:58:31,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_37-model_01-model_states.pt. 32: [2022-11-25 12:58:31,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_37-model_00-model_states.pt. 32: [2022-11-25 12:58:31,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_38-model_01-model_states.pt... 32: [2022-11-25 12:58:31,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_38-model_00-model_states.pt... 0: [2022-11-25 12:58:31,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_15-model_00-model_states.pt. 0: [2022-11-25 12:58:31,904] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_16-model_00-model_states.pt... 0: [2022-11-25 12:58:31,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_15-model_01-model_states.pt. 0: [2022-11-25 12:58:31,907] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_16-model_01-model_states.pt... 32: [2022-11-25 12:58:32,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_38-model_01-model_states.pt. 32: [2022-11-25 12:58:32,015] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_39-model_01-model_states.pt... 32: [2022-11-25 12:58:32,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_38-model_00-model_states.pt. 32: [2022-11-25 12:58:32,036] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_39-model_00-model_states.pt... 0: [2022-11-25 12:58:32,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_16-model_01-model_states.pt. 0: [2022-11-25 12:58:32,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_16-model_00-model_states.pt. 0: [2022-11-25 12:58:32,145] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_17-model_00-model_states.pt... 0: [2022-11-25 12:58:32,145] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_17-model_01-model_states.pt... 32: [2022-11-25 12:58:32,259] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_39-model_01-model_states.pt. 32: [2022-11-25 12:58:32,259] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_40-model_01-model_states.pt... 32: [2022-11-25 12:58:32,284] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_39-model_00-model_states.pt. 32: [2022-11-25 12:58:32,284] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_40-model_00-model_states.pt... 0: [2022-11-25 12:58:32,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_17-model_01-model_states.pt. 0: [2022-11-25 12:58:32,387] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_18-model_01-model_states.pt... 0: [2022-11-25 12:58:32,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_17-model_00-model_states.pt. 0: [2022-11-25 12:58:32,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_18-model_00-model_states.pt... 32: [2022-11-25 12:58:32,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_40-model_00-model_states.pt. 32: [2022-11-25 12:58:32,508] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_41-model_00-model_states.pt... 32: [2022-11-25 12:58:32,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_40-model_01-model_states.pt. 32: [2022-11-25 12:58:32,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_41-model_01-model_states.pt... 0: [2022-11-25 12:58:32,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_18-model_01-model_states.pt. 0: [2022-11-25 12:58:32,630] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_19-model_01-model_states.pt... 0: [2022-11-25 12:58:32,636] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_18-model_00-model_states.pt. 0: [2022-11-25 12:58:32,636] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_19-model_00-model_states.pt... 32: [2022-11-25 12:58:32,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_41-model_00-model_states.pt. 32: [2022-11-25 12:58:32,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_41-model_01-model_states.pt. 32: [2022-11-25 12:58:32,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_42-model_00-model_states.pt... 32: [2022-11-25 12:58:32,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_42-model_01-model_states.pt... 0: [2022-11-25 12:58:32,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_19-model_00-model_states.pt. 0: [2022-11-25 12:58:32,890] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_20-model_00-model_states.pt... 0: [2022-11-25 12:58:32,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_19-model_01-model_states.pt. 0: [2022-11-25 12:58:32,895] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_20-model_01-model_states.pt... 32: [2022-11-25 12:58:33,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_42-model_01-model_states.pt. 32: [2022-11-25 12:58:33,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_42-model_00-model_states.pt. 32: [2022-11-25 12:58:33,019] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_43-model_01-model_states.pt... 32: [2022-11-25 12:58:33,019] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_43-model_00-model_states.pt... 0: [2022-11-25 12:58:33,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_20-model_00-model_states.pt. 0: [2022-11-25 12:58:33,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_21-model_00-model_states.pt... 0: [2022-11-25 12:58:33,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_20-model_01-model_states.pt. 0: [2022-11-25 12:58:33,134] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_21-model_01-model_states.pt... 32: [2022-11-25 12:58:33,290] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_43-model_01-model_states.pt. 32: [2022-11-25 12:58:33,290] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_44-model_01-model_states.pt... 32: [2022-11-25 12:58:33,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_43-model_00-model_states.pt. 32: [2022-11-25 12:58:33,303] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_44-model_00-model_states.pt... 0: [2022-11-25 12:58:33,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_21-model_00-model_states.pt. 0: [2022-11-25 12:58:33,412] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_22-model_00-model_states.pt... 0: [2022-11-25 12:58:33,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_21-model_01-model_states.pt. 0: [2022-11-25 12:58:33,415] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_22-model_01-model_states.pt... 32: [2022-11-25 12:58:33,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_44-model_00-model_states.pt. 32: [2022-11-25 12:58:33,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_44-model_01-model_states.pt. 32: [2022-11-25 12:58:33,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_46-model_01-model_states.pt... 32: [2022-11-25 12:58:33,549] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_46-model_00-model_states.pt... 32: [2022-11-25 12:58:33,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_46-model_00-model_states.pt. 32: [2022-11-25 12:58:33,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_46-model_01-model_states.pt. 32: [2022-11-25 12:58:33,577] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/mp_rank_02_model_states.pt... 32: [2022-11-25 12:58:33,577] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/mp_rank_03_model_states.pt... 32: [2022-11-25 12:58:33,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/mp_rank_02_model_states.pt. 32: [2022-11-25 12:58:33,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/mp_rank_03_model_states.pt. 0: [2022-11-25 12:58:33,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_22-model_01-model_states.pt. 0: [2022-11-25 12:58:33,649] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_23-model_01-model_states.pt... 0: [2022-11-25 12:58:33,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_22-model_00-model_states.pt. 0: [2022-11-25 12:58:33,656] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/layer_23-model_00-model_states.pt... 0: [2022-11-25 12:58:33,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_23-model_01-model_states.pt. 0: [2022-11-25 12:58:33,904] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: checkpoints_8b7/global_step2000/mp_rank_01_model_states.pt 0: [2022-11-25 12:58:33,904] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/mp_rank_01_model_states.pt... 0: [2022-11-25 12:58:33,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/layer_23-model_00-model_states.pt. 0: [2022-11-25 12:58:33,906] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_8b7/global_step2000/mp_rank_00_model_states.pt 0: [2022-11-25 12:58:33,906] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/mp_rank_00_model_states.pt... 0: [2022-11-25 12:58:33,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/mp_rank_00_model_states.pt. 0: [2022-11-25 12:58:33,935] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/mp_rank_01_model_states.pt. 57: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt... 57: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt... 57: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt... 57: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt... 57: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt... 57: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt... 63: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt... 63: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt... 63: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt... 63: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt... 39: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt... 39: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt... 39: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt... 53: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt... 53: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt... 53: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt... 53: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt... 53: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt... 53: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt... 55: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt... 55: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt... 55: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt... 55: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt... 55: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt... 55: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt... 51: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt... 51: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt... 51: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt... 51: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt... 33: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt... 33: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt... 33: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt... 33: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt... 33: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt... 1: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 1: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 1: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 1: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 61: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt... 61: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt... 61: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt... 61: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt... 59: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt... 59: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt... 59: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt... 59: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt... 59: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt... 59: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt... 13: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 13: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 13: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 35: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt... 35: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt... 35: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt... 35: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt... 35: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt... 35: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt... 5: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 5: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 5: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 29: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 37: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt... 37: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt... 37: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt... 37: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt... 37: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt... 58: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt... 58: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt... 58: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt... 58: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt... 58: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt... 58: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt... 40: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt... 40: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt... 40: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt... 40: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt... 40: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt... 40: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt... 60: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt... 60: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt... 60: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt... 60: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt... 60: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt... 60: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt... 52: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt... 52: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt... 52: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt... 52: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt... 52: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt... 52: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt... 30: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 30: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 4: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 4: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 56: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt... 56: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt... 56: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt... 56: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt... 56: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt... 56: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt... 54: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt... 54: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt... 54: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt... 54: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt... 54: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt... 54: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt... 62: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt... 62: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt... 62: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt... 62: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt... 62: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt... 62: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt... 36: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt... 36: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt... 36: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt... 36: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt... 36: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt... 36: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt... 28: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 28: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 28: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 38: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt... 38: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt... 38: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt... 38: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt... 38: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt... 38: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt... 0: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 2: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 2: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 24: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 24: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 24: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 24: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 49: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt... 49: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt... 49: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt... 49: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt... 49: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt... 49: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt... 47: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt... 47: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt... 47: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt... 47: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt... 47: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt... 47: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt... 41: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt... 41: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt... 41: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt... 41: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt... 41: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt... 41: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt... 45: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt... 45: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt... 45: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt... 45: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt... 45: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt... 9: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 9: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 43: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt... 43: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt... 43: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt... 43: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt... 43: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt... 43: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt... 27: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 27: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 27: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 25: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 25: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 25: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 3: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 3: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 3: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 3: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 7: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 7: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 7: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 7: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 7: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 17: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 23: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 23: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 23: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 23: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 11: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 11: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 11: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 11: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 31: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 31: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 19: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 19: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 34: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt... 34: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt... 34: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt... 34: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt... 34: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt... 34: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt... 46: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt... 46: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt... 46: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt... 46: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt... 46: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt... 46: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt... 44: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt... 44: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt... 44: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt... 44: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt... 44: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt... 44: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt... 6: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 16: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 16: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 16: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 16: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 18: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 18: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 18: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 18: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 18: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 10: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 20: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 8: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 8: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 8: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 8: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 48: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt... 48: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt... 48: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt... 48: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt... 48: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt... 48: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt... 50: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt... 50: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt... 50: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt... 50: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt... 50: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt... 50: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt... 42: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt... 42: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt... 42: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt... 42: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt... 42: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt... 42: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt... 32: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt... 32: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt... 12: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 12: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 12: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 26: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 26: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 14: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 14: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 14: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 14: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 14: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 15: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 15: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 15: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 22: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 22: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 21: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 21: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 21: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 57: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt... 63: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt... 63: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt... 63: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt... 39: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt... 39: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt... 53: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt... 55: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt... 55: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt... 51: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt... 33: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt... 33: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt... 1: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 61: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt... 59: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt... 59: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt... 13: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 13: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 13: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 13: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 35: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt... 35: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt... 5: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 5: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 5: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 29: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 29: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 37: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt... 58: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt... 40: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt... 60: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt... 52: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt... 52: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt... 30: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 30: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 30: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 4: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 4: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 4: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 4: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 4: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 4: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 56: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt... 54: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt... 54: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt... 62: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt... 62: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt... 36: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt... 28: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 28: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 38: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt... 0: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 2: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 2: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 2: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 2: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 24: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 49: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt... 49: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt... 47: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt... 41: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt... 41: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt... 45: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt... 9: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 9: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 43: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt... 27: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 27: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 27: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 25: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 3: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 3: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 7: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 7: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 17: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 17: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 23: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 11: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 11: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 31: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 31: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 31: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 19: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 19: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 19: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 19: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 34: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt... 46: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt... 44: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt... 6: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 16: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 16: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 16: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 18: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 18: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 10: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 10: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 10: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 20: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 20: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 48: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt... 50: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt... 42: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt... 32: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt... 32: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt... 32: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt... 32: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt... 12: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 26: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 26: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 26: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 26: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 26: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 14: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 14: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 15: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 22: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 21: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 57: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt... 63: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt... 39: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt... 53: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt... 51: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt... 33: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt... 1: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 1: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 61: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt... 61: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt... 13: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 5: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 29: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 29: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 29: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 37: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt... 58: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt... 40: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt... 60: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt... 30: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 30: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 30: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 56: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt... 36: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt... 28: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 38: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt... 0: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 2: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 2: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 24: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 47: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt... 45: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt... 45: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt... 9: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 9: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 43: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt... 27: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 27: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 25: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 25: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 25: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 3: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 3: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 7: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 17: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 17: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 17: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 23: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 23: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 11: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 31: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 19: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 34: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt... 46: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt... 44: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt... 6: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 6: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 16: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 18: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 10: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 10: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 10: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 20: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 20: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 20: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 20: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 8: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 48: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt... 50: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt... 42: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt... 32: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt... 32: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt... 26: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 14: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 15: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 15: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 22: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 22: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 21: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 39: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt... 51: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt... 51: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt... 1: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 61: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt... 5: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 29: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 37: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt... 28: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 0: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 24: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 9: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 25: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 17: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 23: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 11: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 31: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 19: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 6: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 10: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 8: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 12: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 15: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 21: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 39: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt... 29: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 28: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 0: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 24: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 9: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 17: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 31: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 6: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 20: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 8: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 12: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 22: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 21: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 0: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 6: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 12: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 15: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 22: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 21: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 0: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 6: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 12: [2022-11-25 12:58:34,158] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 22: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 0: [2022-11-25 12:58:34,157] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 32: [2022-11-25 12:58:34,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt. 32: [2022-11-25 12:58:34,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt 32: [2022-11-25 12:58:34,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 32: [2022-11-25 12:58:34,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt. 32: [2022-11-25 12:58:34,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt 32: [2022-11-25 12:58:34,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 32: [2022-11-25 12:58:34,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt. 32: [2022-11-25 12:58:34,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt 32: [2022-11-25 12:58:34,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 12:58:34,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 0: [2022-11-25 12:58:34,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 0: [2022-11-25 12:58:34,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-25 12:58:34,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 0: [2022-11-25 12:58:34,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-25 12:58:34,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 0: [2022-11-25 12:58:34,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 12:58:34,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 12:58:34,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 0: [2022-11-25 12:58:34,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 12:58:34,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 0: [2022-11-25 12:58:34,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 0: [2022-11-25 12:58:34,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 12:58:34,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-25 12:58:34,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-25 12:58:34,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 12:58:34,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 0: [2022-11-25 12:58:34,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 0: [2022-11-25 12:58:34,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-25 12:58:34,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 12:58:34,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-25 12:58:34,473] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 32: [2022-11-25 12:58:34,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt. 32: [2022-11-25 12:58:34,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt. 32: [2022-11-25 12:58:34,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt 32: [2022-11-25 12:58:34,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt 32: [2022-11-25 12:58:34,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 32: [2022-11-25 12:58:34,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 32: [2022-11-25 12:58:34,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt. 32: [2022-11-25 12:58:34,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt. 32: [2022-11-25 12:58:34,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt 32: [2022-11-25 12:58:34,525] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt 32: [2022-11-25 12:58:34,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt. 32: [2022-11-25 12:58:34,525] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 32: [2022-11-25 12:58:34,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 32: [2022-11-25 12:58:34,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt 32: [2022-11-25 12:58:34,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 44: [2022-11-25 12:58:34,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt. 44: [2022-11-25 12:58:34,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt. 44: [2022-11-25 12:58:34,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt. 44: [2022-11-25 12:58:34,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt. 44: [2022-11-25 12:58:34,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt. 44: [2022-11-25 12:58:34,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt 44: [2022-11-25 12:58:34,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt 44: [2022-11-25 12:58:34,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt 44: [2022-11-25 12:58:34,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 44: [2022-11-25 12:58:34,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt 44: [2022-11-25 12:58:34,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt 44: [2022-11-25 12:58:34,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 44: [2022-11-25 12:58:34,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 44: [2022-11-25 12:58:34,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 44: [2022-11-25 12:58:34,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 50: [2022-11-25 12:58:34,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt. 50: [2022-11-25 12:58:34,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt. 50: [2022-11-25 12:58:34,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt. 50: [2022-11-25 12:58:34,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt. 50: [2022-11-25 12:58:34,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt. 50: [2022-11-25 12:58:34,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt. 50: [2022-11-25 12:58:34,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt. 50: [2022-11-25 12:58:34,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt 50: [2022-11-25 12:58:34,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt 50: [2022-11-25 12:58:34,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 50: [2022-11-25 12:58:34,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt 50: [2022-11-25 12:58:34,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt 50: [2022-11-25 12:58:34,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt 50: [2022-11-25 12:58:34,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt 50: [2022-11-25 12:58:34,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt 50: [2022-11-25 12:58:34,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 50: [2022-11-25 12:58:34,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 50: [2022-11-25 12:58:34,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 50: [2022-11-25 12:58:34,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 50: [2022-11-25 12:58:34,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 50: [2022-11-25 12:58:34,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 19: [2022-11-25 12:58:34,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 19: [2022-11-25 12:58:34,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 62: [2022-11-25 12:58:34,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt. 62: [2022-11-25 12:58:34,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt. 62: [2022-11-25 12:58:34,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt. 62: [2022-11-25 12:58:34,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt. 62: [2022-11-25 12:58:34,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt. 62: [2022-11-25 12:58:34,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt. 62: [2022-11-25 12:58:34,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt 62: [2022-11-25 12:58:34,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt 62: [2022-11-25 12:58:34,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt. 62: [2022-11-25 12:58:34,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt 62: [2022-11-25 12:58:34,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt 62: [2022-11-25 12:58:34,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 62: [2022-11-25 12:58:34,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 62: [2022-11-25 12:58:34,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt 62: [2022-11-25 12:58:34,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt 62: [2022-11-25 12:58:34,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 62: [2022-11-25 12:58:34,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 62: [2022-11-25 12:58:34,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt 62: [2022-11-25 12:58:34,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 62: [2022-11-25 12:58:34,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 62: [2022-11-25 12:58:34,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 60: [2022-11-25 12:58:34,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt. 60: [2022-11-25 12:58:34,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt. 60: [2022-11-25 12:58:34,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt. 60: [2022-11-25 12:58:34,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt. 60: [2022-11-25 12:58:34,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt. 60: [2022-11-25 12:58:34,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt. 60: [2022-11-25 12:58:34,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt 60: [2022-11-25 12:58:34,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt 60: [2022-11-25 12:58:34,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt 60: [2022-11-25 12:58:34,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt 60: [2022-11-25 12:58:34,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt 60: [2022-11-25 12:58:34,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt 60: [2022-11-25 12:58:34,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 60: [2022-11-25 12:58:34,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 60: [2022-11-25 12:58:34,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 60: [2022-11-25 12:58:34,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 60: [2022-11-25 12:58:34,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 60: [2022-11-25 12:58:34,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 6: [2022-11-25 12:58:34,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 6: [2022-11-25 12:58:34,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 6: [2022-11-25 12:58:34,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 6: [2022-11-25 12:58:34,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 6: [2022-11-25 12:58:34,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 6: [2022-11-25 12:58:34,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 6: [2022-11-25 12:58:34,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 6: [2022-11-25 12:58:34,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 6: [2022-11-25 12:58:34,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 6: [2022-11-25 12:58:34,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 6: [2022-11-25 12:58:34,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 6: [2022-11-25 12:58:34,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 6: [2022-11-25 12:58:34,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 6: [2022-11-25 12:58:34,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 6: [2022-11-25 12:58:34,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 6: [2022-11-25 12:58:34,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 6: [2022-11-25 12:58:34,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 6: [2022-11-25 12:58:34,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 6: [2022-11-25 12:58:34,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 6: [2022-11-25 12:58:34,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 6: [2022-11-25 12:58:34,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 19: [2022-11-25 12:58:34,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 19: [2022-11-25 12:58:34,581] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 19: [2022-11-25 12:58:34,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 19: [2022-11-25 12:58:34,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 19: [2022-11-25 12:58:34,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 19: [2022-11-25 12:58:34,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 19: [2022-11-25 12:58:34,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 19: [2022-11-25 12:58:34,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 19: [2022-11-25 12:58:34,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 19: [2022-11-25 12:58:34,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 25: [2022-11-25 12:58:34,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 25: [2022-11-25 12:58:34,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 25: [2022-11-25 12:58:34,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 33: [2022-11-25 12:58:34,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt. 33: [2022-11-25 12:58:34,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt. 33: [2022-11-25 12:58:34,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt. 33: [2022-11-25 12:58:34,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt. 33: [2022-11-25 12:58:34,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt. 33: [2022-11-25 12:58:34,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt. 33: [2022-11-25 12:58:34,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt. 41: [2022-11-25 12:58:34,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt. 41: [2022-11-25 12:58:34,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt. 41: [2022-11-25 12:58:34,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt. 41: [2022-11-25 12:58:34,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt. 33: [2022-11-25 12:58:34,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt 33: [2022-11-25 12:58:34,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt 33: [2022-11-25 12:58:34,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt 33: [2022-11-25 12:58:34,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt 33: [2022-11-25 12:58:34,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt 41: [2022-11-25 12:58:34,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt 42: [2022-11-25 12:58:34,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt. 42: [2022-11-25 12:58:34,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt. 42: [2022-11-25 12:58:34,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt. 33: [2022-11-25 12:58:34,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt 41: [2022-11-25 12:58:34,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt 33: [2022-11-25 12:58:34,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt 41: [2022-11-25 12:58:34,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt 41: [2022-11-25 12:58:34,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt 33: [2022-11-25 12:58:34,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 33: [2022-11-25 12:58:34,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 33: [2022-11-25 12:58:34,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 33: [2022-11-25 12:58:34,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 33: [2022-11-25 12:58:34,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 33: [2022-11-25 12:58:34,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 41: [2022-11-25 12:58:34,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 33: [2022-11-25 12:58:34,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 41: [2022-11-25 12:58:34,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 41: [2022-11-25 12:58:34,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 41: [2022-11-25 12:58:34,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 42: [2022-11-25 12:58:34,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt. 42: [2022-11-25 12:58:34,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt. 42: [2022-11-25 12:58:34,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt. 42: [2022-11-25 12:58:34,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt. 42: [2022-11-25 12:58:34,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt 42: [2022-11-25 12:58:34,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt 42: [2022-11-25 12:58:34,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt. 42: [2022-11-25 12:58:34,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 42: [2022-11-25 12:58:34,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt 42: [2022-11-25 12:58:34,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 42: [2022-11-25 12:58:34,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt 42: [2022-11-25 12:58:34,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt 42: [2022-11-25 12:58:34,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt 42: [2022-11-25 12:58:34,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt 42: [2022-11-25 12:58:34,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt 42: [2022-11-25 12:58:34,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 42: [2022-11-25 12:58:34,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 42: [2022-11-25 12:58:34,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 42: [2022-11-25 12:58:34,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 42: [2022-11-25 12:58:34,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 42: [2022-11-25 12:58:34,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 41: [2022-11-25 12:58:34,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt. 41: [2022-11-25 12:58:34,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt 41: [2022-11-25 12:58:34,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 41: [2022-11-25 12:58:34,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt. 41: [2022-11-25 12:58:34,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt 41: [2022-11-25 12:58:34,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 62: [2022-11-25 12:58:34,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt. 41: [2022-11-25 12:58:34,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt. 41: [2022-11-25 12:58:34,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt 41: [2022-11-25 12:58:34,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 62: [2022-11-25 12:58:34,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt 62: [2022-11-25 12:58:34,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 30: [2022-11-25 12:58:34,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 30: [2022-11-25 12:58:34,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 30: [2022-11-25 12:58:34,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 30: [2022-11-25 12:58:34,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 30: [2022-11-25 12:58:34,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 30: [2022-11-25 12:58:34,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 30: [2022-11-25 12:58:34,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 30: [2022-11-25 12:58:34,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 30: [2022-11-25 12:58:34,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 30: [2022-11-25 12:58:34,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 30: [2022-11-25 12:58:34,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 30: [2022-11-25 12:58:34,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 30: [2022-11-25 12:58:34,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 30: [2022-11-25 12:58:34,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 30: [2022-11-25 12:58:34,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 30: [2022-11-25 12:58:34,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 30: [2022-11-25 12:58:34,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 30: [2022-11-25 12:58:34,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 30: [2022-11-25 12:58:34,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 30: [2022-11-25 12:58:34,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 30: [2022-11-25 12:58:34,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 30: [2022-11-25 12:58:34,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 30: [2022-11-25 12:58:34,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 30: [2022-11-25 12:58:34,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 21: [2022-11-25 12:58:34,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 21: [2022-11-25 12:58:34,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 21: [2022-11-25 12:58:34,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 25: [2022-11-25 12:58:34,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 25: [2022-11-25 12:58:34,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 25: [2022-11-25 12:58:34,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 25: [2022-11-25 12:58:34,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 25: [2022-11-25 12:58:34,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 25: [2022-11-25 12:58:34,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 25: [2022-11-25 12:58:34,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 25: [2022-11-25 12:58:34,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 25: [2022-11-25 12:58:34,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 25: [2022-11-25 12:58:34,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 25: [2022-11-25 12:58:34,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 25: [2022-11-25 12:58:34,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 25: [2022-11-25 12:58:34,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 25: [2022-11-25 12:58:34,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 25: [2022-11-25 12:58:34,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 25: [2022-11-25 12:58:34,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 25: [2022-11-25 12:58:34,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 25: [2022-11-25 12:58:34,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 25: [2022-11-25 12:58:34,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 25: [2022-11-25 12:58:34,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 25: [2022-11-25 12:58:34,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 21: [2022-11-25 12:58:34,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 21: [2022-11-25 12:58:34,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 21: [2022-11-25 12:58:34,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 21: [2022-11-25 12:58:34,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 21: [2022-11-25 12:58:34,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 21: [2022-11-25 12:58:34,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 21: [2022-11-25 12:58:34,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 21: [2022-11-25 12:58:34,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 21: [2022-11-25 12:58:34,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 21: [2022-11-25 12:58:34,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 21: [2022-11-25 12:58:34,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 21: [2022-11-25 12:58:34,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 21: [2022-11-25 12:58:34,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 21: [2022-11-25 12:58:34,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 21: [2022-11-25 12:58:34,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 21: [2022-11-25 12:58:34,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 21: [2022-11-25 12:58:34,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 21: [2022-11-25 12:58:34,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 21: [2022-11-25 12:58:34,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 21: [2022-11-25 12:58:34,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 21: [2022-11-25 12:58:34,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 29: [2022-11-25 12:58:34,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 29: [2022-11-25 12:58:34,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 29: [2022-11-25 12:58:34,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 29: [2022-11-25 12:58:34,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 29: [2022-11-25 12:58:34,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 12: [2022-11-25 12:58:34,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 19: [2022-11-25 12:58:34,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 19: [2022-11-25 12:58:34,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 19: [2022-11-25 12:58:34,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 19: [2022-11-25 12:58:34,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 19: [2022-11-25 12:58:34,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 19: [2022-11-25 12:58:34,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 12: [2022-11-25 12:58:34,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 12: [2022-11-25 12:58:34,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 12: [2022-11-25 12:58:34,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 12: [2022-11-25 12:58:34,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 12: [2022-11-25 12:58:34,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 12: [2022-11-25 12:58:34,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 12: [2022-11-25 12:58:34,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 12: [2022-11-25 12:58:34,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 12: [2022-11-25 12:58:34,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 12: [2022-11-25 12:58:34,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 12: [2022-11-25 12:58:34,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 12: [2022-11-25 12:58:34,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 12: [2022-11-25 12:58:34,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 12: [2022-11-25 12:58:34,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 12: [2022-11-25 12:58:34,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 12: [2022-11-25 12:58:34,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 58: [2022-11-25 12:58:34,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt. 58: [2022-11-25 12:58:34,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt. 58: [2022-11-25 12:58:34,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt. 58: [2022-11-25 12:58:34,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt. 58: [2022-11-25 12:58:34,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt. 58: [2022-11-25 12:58:34,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt. 12: [2022-11-25 12:58:34,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 12: [2022-11-25 12:58:34,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 12: [2022-11-25 12:58:34,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 12: [2022-11-25 12:58:34,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 29: [2022-11-25 12:58:34,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 29: [2022-11-25 12:58:34,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 29: [2022-11-25 12:58:34,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 29: [2022-11-25 12:58:34,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 29: [2022-11-25 12:58:34,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 29: [2022-11-25 12:58:34,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 29: [2022-11-25 12:58:34,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 29: [2022-11-25 12:58:34,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 29: [2022-11-25 12:58:34,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 29: [2022-11-25 12:58:34,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 29: [2022-11-25 12:58:34,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 29: [2022-11-25 12:58:34,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 29: [2022-11-25 12:58:34,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 29: [2022-11-25 12:58:34,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 29: [2022-11-25 12:58:34,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 29: [2022-11-25 12:58:34,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 24: [2022-11-25 12:58:34,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 24: [2022-11-25 12:58:34,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 24: [2022-11-25 12:58:34,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 24: [2022-11-25 12:58:34,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 24: [2022-11-25 12:58:34,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 58: [2022-11-25 12:58:34,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt. 58: [2022-11-25 12:58:34,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt 58: [2022-11-25 12:58:34,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt 58: [2022-11-25 12:58:34,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt 58: [2022-11-25 12:58:34,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt 58: [2022-11-25 12:58:34,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt 58: [2022-11-25 12:58:34,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt 58: [2022-11-25 12:58:34,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt 58: [2022-11-25 12:58:34,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 58: [2022-11-25 12:58:34,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 58: [2022-11-25 12:58:34,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 58: [2022-11-25 12:58:34,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 58: [2022-11-25 12:58:34,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 58: [2022-11-25 12:58:34,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 58: [2022-11-25 12:58:34,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 12:58:34,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt. 56: [2022-11-25 12:58:34,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt. 56: [2022-11-25 12:58:34,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt. 56: [2022-11-25 12:58:34,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt. 56: [2022-11-25 12:58:34,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt. 56: [2022-11-25 12:58:34,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt. 24: [2022-11-25 12:58:34,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 24: [2022-11-25 12:58:34,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 24: [2022-11-25 12:58:34,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 24: [2022-11-25 12:58:34,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 24: [2022-11-25 12:58:34,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 24: [2022-11-25 12:58:34,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 24: [2022-11-25 12:58:34,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 24: [2022-11-25 12:58:34,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 24: [2022-11-25 12:58:34,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 24: [2022-11-25 12:58:34,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 24: [2022-11-25 12:58:34,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 24: [2022-11-25 12:58:34,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 24: [2022-11-25 12:58:34,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 24: [2022-11-25 12:58:34,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 24: [2022-11-25 12:58:34,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 24: [2022-11-25 12:58:34,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 26: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 26: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 26: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 26: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 24: [2022-11-25 12:58:34,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 24: [2022-11-25 12:58:34,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 24: [2022-11-25 12:58:34,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 26: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 26: [2022-11-25 12:58:34,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 26: [2022-11-25 12:58:34,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 56: [2022-11-25 12:58:34,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt. 26: [2022-11-25 12:58:34,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 56: [2022-11-25 12:58:34,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt 56: [2022-11-25 12:58:34,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt 56: [2022-11-25 12:58:34,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt 56: [2022-11-25 12:58:34,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt 56: [2022-11-25 12:58:34,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt 26: [2022-11-25 12:58:34,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 56: [2022-11-25 12:58:34,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt 26: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 26: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 26: [2022-11-25 12:58:34,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 26: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 12:58:34,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt 26: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 12:58:34,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 12:58:34,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 12:58:34,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 12:58:34,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 12:58:34,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 26: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 12:58:34,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 12:58:34,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 45: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt. 45: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt. 45: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt. 45: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt. 45: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt. 45: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt. 52: [2022-11-25 12:58:34,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt. 52: [2022-11-25 12:58:34,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt. 52: [2022-11-25 12:58:34,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt. 52: [2022-11-25 12:58:34,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt. 52: [2022-11-25 12:58:34,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt. 52: [2022-11-25 12:58:34,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt. 45: [2022-11-25 12:58:34,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt 45: [2022-11-25 12:58:34,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt 45: [2022-11-25 12:58:34,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt 45: [2022-11-25 12:58:34,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt 45: [2022-11-25 12:58:34,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt 45: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt. 45: [2022-11-25 12:58:34,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt 45: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 45: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 45: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 45: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 45: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 45: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 45: [2022-11-25 12:58:34,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt 45: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 45: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt. 45: [2022-11-25 12:58:34,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt 45: [2022-11-25 12:58:34,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 6: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 38: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt. 38: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt. 38: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt. 38: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt. 38: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt. 38: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt. 52: [2022-11-25 12:58:34,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt. 52: [2022-11-25 12:58:34,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt. 52: [2022-11-25 12:58:34,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt 52: [2022-11-25 12:58:34,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt 52: [2022-11-25 12:58:34,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt 52: [2022-11-25 12:58:34,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt 52: [2022-11-25 12:58:34,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt 52: [2022-11-25 12:58:34,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt 52: [2022-11-25 12:58:34,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt 52: [2022-11-25 12:58:34,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 52: [2022-11-25 12:58:34,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt 52: [2022-11-25 12:58:34,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 52: [2022-11-25 12:58:34,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 52: [2022-11-25 12:58:34,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 52: [2022-11-25 12:58:34,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 52: [2022-11-25 12:58:34,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 52: [2022-11-25 12:58:34,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 52: [2022-11-25 12:58:34,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 35: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt. 35: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt. 35: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt. 35: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt. 35: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt. 6: [2022-11-25 12:58:34,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 6: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 10: [2022-11-25 12:58:34,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 10: [2022-11-25 12:58:34,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 10: [2022-11-25 12:58:34,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 38: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt. 38: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt. 38: [2022-11-25 12:58:34,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt 38: [2022-11-25 12:58:34,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt 38: [2022-11-25 12:58:34,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt 38: [2022-11-25 12:58:34,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt 38: [2022-11-25 12:58:34,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt 38: [2022-11-25 12:58:34,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt 38: [2022-11-25 12:58:34,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt 38: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 38: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 38: [2022-11-25 12:58:34,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt 38: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 38: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 38: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 38: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 38: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 38: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 10: [2022-11-25 12:58:34,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 10: [2022-11-25 12:58:34,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 10: [2022-11-25 12:58:34,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 10: [2022-11-25 12:58:34,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 10: [2022-11-25 12:58:34,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 10: [2022-11-25 12:58:34,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 10: [2022-11-25 12:58:34,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 1: [2022-11-25 12:58:34,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 1: [2022-11-25 12:58:34,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 1: [2022-11-25 12:58:34,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 1: [2022-11-25 12:58:34,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 1: [2022-11-25 12:58:34,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 1: [2022-11-25 12:58:34,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 10: [2022-11-25 12:58:34,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 10: [2022-11-25 12:58:34,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 10: [2022-11-25 12:58:34,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 10: [2022-11-25 12:58:34,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 10: [2022-11-25 12:58:34,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 10: [2022-11-25 12:58:34,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 10: [2022-11-25 12:58:34,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 10: [2022-11-25 12:58:34,640] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 1: [2022-11-25 12:58:34,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 1: [2022-11-25 12:58:34,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 1: [2022-11-25 12:58:34,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 1: [2022-11-25 12:58:34,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 1: [2022-11-25 12:58:34,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 1: [2022-11-25 12:58:34,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 1: [2022-11-25 12:58:34,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 1: [2022-11-25 12:58:34,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 1: [2022-11-25 12:58:34,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 1: [2022-11-25 12:58:34,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 1: [2022-11-25 12:58:34,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 1: [2022-11-25 12:58:34,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 1: [2022-11-25 12:58:34,643] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 1: [2022-11-25 12:58:34,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 1: [2022-11-25 12:58:34,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 1: [2022-11-25 12:58:34,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 1: [2022-11-25 12:58:34,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 1: [2022-11-25 12:58:34,643] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 35: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt. 35: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt. 35: [2022-11-25 12:58:34,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt 35: [2022-11-25 12:58:34,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt 35: [2022-11-25 12:58:34,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt 35: [2022-11-25 12:58:34,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt 35: [2022-11-25 12:58:34,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt 35: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 35: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 35: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 35: [2022-11-25 12:58:34,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt 35: [2022-11-25 12:58:34,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt 35: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 35: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 35: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 35: [2022-11-25 12:58:34,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 37: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt. 37: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt. 37: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt. 37: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt. 20: [2022-11-25 12:58:34,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 20: [2022-11-25 12:58:34,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 20: [2022-11-25 12:58:34,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 20: [2022-11-25 12:58:34,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 37: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt. 37: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt. 37: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt. 37: [2022-11-25 12:58:34,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt 37: [2022-11-25 12:58:34,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt 37: [2022-11-25 12:58:34,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt 37: [2022-11-25 12:58:34,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt 37: [2022-11-25 12:58:34,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt 37: [2022-11-25 12:58:34,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt 37: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 37: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 37: [2022-11-25 12:58:34,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt 37: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 37: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 37: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 37: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 37: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 31: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 31: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 31: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 31: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 31: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 31: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 31: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 31: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 31: [2022-11-25 12:58:34,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 31: [2022-11-25 12:58:34,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 31: [2022-11-25 12:58:34,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 31: [2022-11-25 12:58:34,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 31: [2022-11-25 12:58:34,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 31: [2022-11-25 12:58:34,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 31: [2022-11-25 12:58:34,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 31: [2022-11-25 12:58:34,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 31: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 31: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 31: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 31: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 31: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 31: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 31: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 31: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 5: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 5: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 5: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 49: [2022-11-25 12:58:34,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt. 49: [2022-11-25 12:58:34,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt. 20: [2022-11-25 12:58:34,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 20: [2022-11-25 12:58:34,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 20: [2022-11-25 12:58:34,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 20: [2022-11-25 12:58:34,644] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 20: [2022-11-25 12:58:34,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 20: [2022-11-25 12:58:34,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 20: [2022-11-25 12:58:34,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 20: [2022-11-25 12:58:34,644] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 20: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 20: [2022-11-25 12:58:34,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 20: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 20: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 20: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 20: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 20: [2022-11-25 12:58:34,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 20: [2022-11-25 12:58:34,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 20: [2022-11-25 12:58:34,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 27: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 27: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 27: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 27: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 27: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 27: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 20: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 20: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 20: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 49: [2022-11-25 12:58:34,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt. 49: [2022-11-25 12:58:34,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt. 49: [2022-11-25 12:58:34,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt. 49: [2022-11-25 12:58:34,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt. 49: [2022-11-25 12:58:34,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt. 49: [2022-11-25 12:58:34,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt 49: [2022-11-25 12:58:34,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt 49: [2022-11-25 12:58:34,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt. 49: [2022-11-25 12:58:34,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt 49: [2022-11-25 12:58:34,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 49: [2022-11-25 12:58:34,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 49: [2022-11-25 12:58:34,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt 49: [2022-11-25 12:58:34,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt 49: [2022-11-25 12:58:34,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt 49: [2022-11-25 12:58:34,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt 49: [2022-11-25 12:58:34,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt 49: [2022-11-25 12:58:34,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 49: [2022-11-25 12:58:34,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 49: [2022-11-25 12:58:34,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 49: [2022-11-25 12:58:34,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 49: [2022-11-25 12:58:34,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 49: [2022-11-25 12:58:34,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 12: [2022-11-25 12:58:34,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 5: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 5: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 5: [2022-11-25 12:58:34,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 5: [2022-11-25 12:58:34,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 5: [2022-11-25 12:58:34,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 5: [2022-11-25 12:58:34,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 5: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 5: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 5: [2022-11-25 12:58:34,647] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 5: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 5: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 5: [2022-11-25 12:58:34,647] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 5: [2022-11-25 12:58:34,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 5: [2022-11-25 12:58:34,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 5: [2022-11-25 12:58:34,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 27: [2022-11-25 12:58:34,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 27: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 27: [2022-11-25 12:58:34,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 27: [2022-11-25 12:58:34,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 27: [2022-11-25 12:58:34,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 27: [2022-11-25 12:58:34,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 27: [2022-11-25 12:58:34,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 27: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 27: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 27: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 27: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 27: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 27: [2022-11-25 12:58:34,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 27: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 27: [2022-11-25 12:58:34,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 27: [2022-11-25 12:58:34,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 27: [2022-11-25 12:58:34,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 27: [2022-11-25 12:58:34,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 44: [2022-11-25 12:58:34,655] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt. 12: [2022-11-25 12:58:34,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 12: [2022-11-25 12:58:34,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 44: [2022-11-25 12:58:34,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt 44: [2022-11-25 12:58:34,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 55: [2022-11-25 12:58:34,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt. 55: [2022-11-25 12:58:34,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt. 55: [2022-11-25 12:58:34,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt. 55: [2022-11-25 12:58:34,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt. 44: [2022-11-25 12:58:34,661] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt. 44: [2022-11-25 12:58:34,661] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt 44: [2022-11-25 12:58:34,661] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 55: [2022-11-25 12:58:34,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt. 55: [2022-11-25 12:58:34,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt. 55: [2022-11-25 12:58:34,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt 55: [2022-11-25 12:58:34,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt 55: [2022-11-25 12:58:34,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt 55: [2022-11-25 12:58:34,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt 55: [2022-11-25 12:58:34,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt. 55: [2022-11-25 12:58:34,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 55: [2022-11-25 12:58:34,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt 55: [2022-11-25 12:58:34,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 55: [2022-11-25 12:58:34,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt 55: [2022-11-25 12:58:34,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 55: [2022-11-25 12:58:34,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 55: [2022-11-25 12:58:34,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 55: [2022-11-25 12:58:34,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt 55: [2022-11-25 12:58:34,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 55: [2022-11-25 12:58:34,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 5: [2022-11-25 12:58:34,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 18: [2022-11-25 12:58:34,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 18: [2022-11-25 12:58:34,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 18: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 18: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 18: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 18: [2022-11-25 12:58:34,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 18: [2022-11-25 12:58:34,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 18: [2022-11-25 12:58:34,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 18: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 18: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 18: [2022-11-25 12:58:34,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 18: [2022-11-25 12:58:34,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 61: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt. 61: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt. 61: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt. 18: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 18: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 18: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 18: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 18: [2022-11-25 12:58:34,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 18: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 37: [2022-11-25 12:58:34,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt. 15: [2022-11-25 12:58:34,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 15: [2022-11-25 12:58:34,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 15: [2022-11-25 12:58:34,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 15: [2022-11-25 12:58:34,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 15: [2022-11-25 12:58:34,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 15: [2022-11-25 12:58:34,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 5: [2022-11-25 12:58:34,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 5: [2022-11-25 12:58:34,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 15: [2022-11-25 12:58:34,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 15: [2022-11-25 12:58:34,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 5: [2022-11-25 12:58:34,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 15: [2022-11-25 12:58:34,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 15: [2022-11-25 12:58:34,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 15: [2022-11-25 12:58:34,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 5: [2022-11-25 12:58:34,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 15: [2022-11-25 12:58:34,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 15: [2022-11-25 12:58:34,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 15: [2022-11-25 12:58:34,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 5: [2022-11-25 12:58:34,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 15: [2022-11-25 12:58:34,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 15: [2022-11-25 12:58:34,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 15: [2022-11-25 12:58:34,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 15: [2022-11-25 12:58:34,670] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 15: [2022-11-25 12:58:34,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 15: [2022-11-25 12:58:34,670] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 15: [2022-11-25 12:58:34,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 15: [2022-11-25 12:58:34,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 15: [2022-11-25 12:58:34,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 15: [2022-11-25 12:58:34,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 37: [2022-11-25 12:58:34,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt 37: [2022-11-25 12:58:34,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 61: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt. 61: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt. 61: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt. 61: [2022-11-25 12:58:34,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt 61: [2022-11-25 12:58:34,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt 61: [2022-11-25 12:58:34,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt 61: [2022-11-25 12:58:34,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt 61: [2022-11-25 12:58:34,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt 61: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 61: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 61: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 61: [2022-11-25 12:58:34,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt 61: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 61: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 61: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 61: [2022-11-25 12:58:34,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt. 61: [2022-11-25 12:58:34,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt 61: [2022-11-25 12:58:34,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 2: [2022-11-25 12:58:34,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 2: [2022-11-25 12:58:34,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 2: [2022-11-25 12:58:34,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 2: [2022-11-25 12:58:34,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 51: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt. 51: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt. 51: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt. 2: [2022-11-25 12:58:34,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 2: [2022-11-25 12:58:34,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 2: [2022-11-25 12:58:34,675] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 2: [2022-11-25 12:58:34,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 2: [2022-11-25 12:58:34,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 2: [2022-11-25 12:58:34,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 2: [2022-11-25 12:58:34,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 2: [2022-11-25 12:58:34,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 2: [2022-11-25 12:58:34,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 2: [2022-11-25 12:58:34,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 2: [2022-11-25 12:58:34,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 2: [2022-11-25 12:58:34,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 2: [2022-11-25 12:58:34,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 2: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 2: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 2: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 2: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 51: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt. 51: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt. 51: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt. 51: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt. 51: [2022-11-25 12:58:34,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt 51: [2022-11-25 12:58:34,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt 51: [2022-11-25 12:58:34,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt 51: [2022-11-25 12:58:34,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt 51: [2022-11-25 12:58:34,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt 51: [2022-11-25 12:58:34,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt 51: [2022-11-25 12:58:34,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt 51: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 51: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 51: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 51: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 51: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 51: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 51: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 51: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt. 51: [2022-11-25 12:58:34,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt 51: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 33: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt. 28: [2022-11-25 12:58:34,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 28: [2022-11-25 12:58:34,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 28: [2022-11-25 12:58:34,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 28: [2022-11-25 12:58:34,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 28: [2022-11-25 12:58:34,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 28: [2022-11-25 12:58:34,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 28: [2022-11-25 12:58:34,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 28: [2022-11-25 12:58:34,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 28: [2022-11-25 12:58:34,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 28: [2022-11-25 12:58:34,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 28: [2022-11-25 12:58:34,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 28: [2022-11-25 12:58:34,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 28: [2022-11-25 12:58:34,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 28: [2022-11-25 12:58:34,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 28: [2022-11-25 12:58:34,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 28: [2022-11-25 12:58:34,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 28: [2022-11-25 12:58:34,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 28: [2022-11-25 12:58:34,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 33: [2022-11-25 12:58:34,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt 33: [2022-11-25 12:58:34,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 53: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt. 53: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt. 53: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt. 53: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt. 53: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt. 53: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt. 53: [2022-11-25 12:58:34,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt 53: [2022-11-25 12:58:34,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt 53: [2022-11-25 12:58:34,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt 53: [2022-11-25 12:58:34,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt 53: [2022-11-25 12:58:34,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt 53: [2022-11-25 12:58:34,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt 53: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 53: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 53: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 53: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 53: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 53: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 16: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 16: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 16: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 16: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 16: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 35: [2022-11-25 12:58:34,696] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt. 53: [2022-11-25 12:58:34,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt. 35: [2022-11-25 12:58:34,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt 35: [2022-11-25 12:58:34,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 53: [2022-11-25 12:58:34,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt 53: [2022-11-25 12:58:34,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 16: [2022-11-25 12:58:34,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 16: [2022-11-25 12:58:34,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 16: [2022-11-25 12:58:34,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 16: [2022-11-25 12:58:34,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 16: [2022-11-25 12:58:34,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 16: [2022-11-25 12:58:34,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 16: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 12:58:34,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 16: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 12:58:34,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 19: [2022-11-25 12:58:34,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 19: [2022-11-25 12:58:34,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 19: [2022-11-25 12:58:34,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 18: [2022-11-25 12:58:34,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 23: [2022-11-25 12:58:34,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 23: [2022-11-25 12:58:34,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 23: [2022-11-25 12:58:34,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 23: [2022-11-25 12:58:34,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 23: [2022-11-25 12:58:34,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 23: [2022-11-25 12:58:34,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 23: [2022-11-25 12:58:34,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 23: [2022-11-25 12:58:34,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 23: [2022-11-25 12:58:34,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 23: [2022-11-25 12:58:34,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 23: [2022-11-25 12:58:34,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 23: [2022-11-25 12:58:34,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 23: [2022-11-25 12:58:34,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 23: [2022-11-25 12:58:34,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 23: [2022-11-25 12:58:34,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 23: [2022-11-25 12:58:34,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 23: [2022-11-25 12:58:34,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 23: [2022-11-25 12:58:34,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 18: [2022-11-25 12:58:34,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 18: [2022-11-25 12:58:34,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 10: [2022-11-25 12:58:34,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 10: [2022-11-25 12:58:34,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 10: [2022-11-25 12:58:34,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 10: [2022-11-25 12:58:34,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 10: [2022-11-25 12:58:34,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 10: [2022-11-25 12:58:34,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 58: [2022-11-25 12:58:34,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt. 39: [2022-11-25 12:58:34,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt. 39: [2022-11-25 12:58:34,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt. 39: [2022-11-25 12:58:34,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt. 39: [2022-11-25 12:58:34,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt. 39: [2022-11-25 12:58:34,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt. 39: [2022-11-25 12:58:34,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt. 58: [2022-11-25 12:58:34,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt 58: [2022-11-25 12:58:34,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 39: [2022-11-25 12:58:34,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt 39: [2022-11-25 12:58:34,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt. 39: [2022-11-25 12:58:34,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt 39: [2022-11-25 12:58:34,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt 39: [2022-11-25 12:58:34,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt 39: [2022-11-25 12:58:34,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt 39: [2022-11-25 12:58:34,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt 39: [2022-11-25 12:58:34,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 39: [2022-11-25 12:58:34,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 39: [2022-11-25 12:58:34,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 39: [2022-11-25 12:58:34,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 39: [2022-11-25 12:58:34,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 39: [2022-11-25 12:58:34,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt 39: [2022-11-25 12:58:34,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 39: [2022-11-25 12:58:34,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 12:58:34,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt. 48: [2022-11-25 12:58:34,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt. 48: [2022-11-25 12:58:34,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt. 59: [2022-11-25 12:58:34,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt. 59: [2022-11-25 12:58:34,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt. 59: [2022-11-25 12:58:34,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt. 59: [2022-11-25 12:58:34,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt. 59: [2022-11-25 12:58:34,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt. 59: [2022-11-25 12:58:34,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt. 59: [2022-11-25 12:58:34,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt. 59: [2022-11-25 12:58:34,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt 59: [2022-11-25 12:58:34,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt 59: [2022-11-25 12:58:34,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt 59: [2022-11-25 12:58:34,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt 59: [2022-11-25 12:58:34,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt 59: [2022-11-25 12:58:34,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 59: [2022-11-25 12:58:34,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 59: [2022-11-25 12:58:34,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 59: [2022-11-25 12:58:34,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 12:58:34,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt. 48: [2022-11-25 12:58:34,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt. 59: [2022-11-25 12:58:34,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt 40: [2022-11-25 12:58:34,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt. 40: [2022-11-25 12:58:34,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt. 40: [2022-11-25 12:58:34,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt. 48: [2022-11-25 12:58:34,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt. 48: [2022-11-25 12:58:34,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt. 48: [2022-11-25 12:58:34,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt 48: [2022-11-25 12:58:34,717] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt. 48: [2022-11-25 12:58:34,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt 59: [2022-11-25 12:58:34,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt 48: [2022-11-25 12:58:34,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt 59: [2022-11-25 12:58:34,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 12:58:34,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt 59: [2022-11-25 12:58:34,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 59: [2022-11-25 12:58:34,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 12:58:34,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt 48: [2022-11-25 12:58:34,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 12:58:34,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt 40: [2022-11-25 12:58:34,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt. 48: [2022-11-25 12:58:34,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 40: [2022-11-25 12:58:34,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt 40: [2022-11-25 12:58:34,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt 40: [2022-11-25 12:58:34,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt 48: [2022-11-25 12:58:34,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 12:58:34,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 12:58:34,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt 40: [2022-11-25 12:58:34,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt 40: [2022-11-25 12:58:34,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 40: [2022-11-25 12:58:34,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 40: [2022-11-25 12:58:34,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 12:58:34,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt 40: [2022-11-25 12:58:34,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 12:58:34,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 12:58:34,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 12:58:34,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 48: [2022-11-25 12:58:34,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 40: [2022-11-25 12:58:34,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt. 40: [2022-11-25 12:58:34,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt. 40: [2022-11-25 12:58:34,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt 40: [2022-11-25 12:58:34,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt 40: [2022-11-25 12:58:34,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 40: [2022-11-25 12:58:34,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 40: [2022-11-25 12:58:34,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt. 40: [2022-11-25 12:58:34,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt 40: [2022-11-25 12:58:34,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt. 40: [2022-11-25 12:58:34,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 40: [2022-11-25 12:58:34,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt 40: [2022-11-25 12:58:34,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 61: [2022-11-25 12:58:34,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt. 61: [2022-11-25 12:58:34,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt 61: [2022-11-25 12:58:34,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 4: [2022-11-25 12:58:34,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 4: [2022-11-25 12:58:34,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 4: [2022-11-25 12:58:34,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 4: [2022-11-25 12:58:34,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 4: [2022-11-25 12:58:34,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 4: [2022-11-25 12:58:34,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 4: [2022-11-25 12:58:34,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 4: [2022-11-25 12:58:34,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 4: [2022-11-25 12:58:34,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 4: [2022-11-25 12:58:34,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 4: [2022-11-25 12:58:34,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 4: [2022-11-25 12:58:34,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 4: [2022-11-25 12:58:34,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 4: [2022-11-25 12:58:34,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 4: [2022-11-25 12:58:34,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 4: [2022-11-25 12:58:34,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 4: [2022-11-25 12:58:34,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 4: [2022-11-25 12:58:34,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 12:58:34,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 4: [2022-11-25 12:58:34,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 4: [2022-11-25 12:58:34,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 4: [2022-11-25 12:58:34,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 4: [2022-11-25 12:58:34,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 4: [2022-11-25 12:58:34,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 4: [2022-11-25 12:58:34,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 60: [2022-11-25 12:58:34,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt. 60: [2022-11-25 12:58:34,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt 60: [2022-11-25 12:58:34,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 13: [2022-11-25 12:58:34,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 13: [2022-11-25 12:58:34,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 13: [2022-11-25 12:58:34,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 13: [2022-11-25 12:58:34,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 13: [2022-11-25 12:58:34,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 13: [2022-11-25 12:58:34,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 13: [2022-11-25 12:58:34,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 13: [2022-11-25 12:58:34,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 13: [2022-11-25 12:58:34,736] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 13: [2022-11-25 12:58:34,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 13: [2022-11-25 12:58:34,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 13: [2022-11-25 12:58:34,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 13: [2022-11-25 12:58:34,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 13: [2022-11-25 12:58:34,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 13: [2022-11-25 12:58:34,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 13: [2022-11-25 12:58:34,736] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 13: [2022-11-25 12:58:34,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 13: [2022-11-25 12:58:34,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 13: [2022-11-25 12:58:34,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 13: [2022-11-25 12:58:34,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 13: [2022-11-25 12:58:34,736] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 16: [2022-11-25 12:58:34,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 16: [2022-11-25 12:58:34,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 53: [2022-11-25 12:58:34,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt. 53: [2022-11-25 12:58:34,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt 53: [2022-11-25 12:58:34,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 11: [2022-11-25 12:58:34,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 11: [2022-11-25 12:58:34,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 11: [2022-11-25 12:58:34,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 11: [2022-11-25 12:58:34,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 63: [2022-11-25 12:58:34,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt. 63: [2022-11-25 12:58:34,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt. 63: [2022-11-25 12:58:34,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt. 63: [2022-11-25 12:58:34,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt. 63: [2022-11-25 12:58:34,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt. 63: [2022-11-25 12:58:34,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt. 11: [2022-11-25 12:58:34,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 11: [2022-11-25 12:58:34,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 8: [2022-11-25 12:58:34,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 63: [2022-11-25 12:58:34,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt. 11: [2022-11-25 12:58:34,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 11: [2022-11-25 12:58:34,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 63: [2022-11-25 12:58:34,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt 63: [2022-11-25 12:58:34,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt 63: [2022-11-25 12:58:34,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt 63: [2022-11-25 12:58:34,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt 11: [2022-11-25 12:58:34,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 11: [2022-11-25 12:58:34,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 63: [2022-11-25 12:58:34,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt 11: [2022-11-25 12:58:34,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 11: [2022-11-25 12:58:34,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 11: [2022-11-25 12:58:34,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 12:58:34,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt 11: [2022-11-25 12:58:34,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 63: [2022-11-25 12:58:34,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt 11: [2022-11-25 12:58:34,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 12:58:34,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 12:58:34,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 12:58:34,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 12:58:34,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 12:58:34,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 12:58:34,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 12:58:34,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 11: [2022-11-25 12:58:34,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 11: [2022-11-25 12:58:34,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 11: [2022-11-25 12:58:34,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 11: [2022-11-25 12:58:34,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 11: [2022-11-25 12:58:34,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 11: [2022-11-25 12:58:34,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 11: [2022-11-25 12:58:34,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 11: [2022-11-25 12:58:34,745] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 11: [2022-11-25 12:58:34,745] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 8: [2022-11-25 12:58:34,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 8: [2022-11-25 12:58:34,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 8: [2022-11-25 12:58:34,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 8: [2022-11-25 12:58:34,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 8: [2022-11-25 12:58:34,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 8: [2022-11-25 12:58:34,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 8: [2022-11-25 12:58:34,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 8: [2022-11-25 12:58:34,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 8: [2022-11-25 12:58:34,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 8: [2022-11-25 12:58:34,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 8: [2022-11-25 12:58:34,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 8: [2022-11-25 12:58:34,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 8: [2022-11-25 12:58:34,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 8: [2022-11-25 12:58:34,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 8: [2022-11-25 12:58:34,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 8: [2022-11-25 12:58:34,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 8: [2022-11-25 12:58:34,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 8: [2022-11-25 12:58:34,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 8: [2022-11-25 12:58:34,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 8: [2022-11-25 12:58:34,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 56: [2022-11-25 12:58:34,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt. 56: [2022-11-25 12:58:34,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt 56: [2022-11-25 12:58:34,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 59: [2022-11-25 12:58:34,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt. 59: [2022-11-25 12:58:34,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt 57: [2022-11-25 12:58:34,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt. 57: [2022-11-25 12:58:34,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt. 57: [2022-11-25 12:58:34,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt. 57: [2022-11-25 12:58:34,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt. 57: [2022-11-25 12:58:34,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt. 57: [2022-11-25 12:58:34,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt. 59: [2022-11-25 12:58:34,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 57: [2022-11-25 12:58:34,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt. 57: [2022-11-25 12:58:34,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt 57: [2022-11-25 12:58:34,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt 57: [2022-11-25 12:58:34,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt 57: [2022-11-25 12:58:34,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt. 57: [2022-11-25 12:58:34,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt 57: [2022-11-25 12:58:34,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt 57: [2022-11-25 12:58:34,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt 57: [2022-11-25 12:58:34,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt 57: [2022-11-25 12:58:34,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 57: [2022-11-25 12:58:34,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 57: [2022-11-25 12:58:34,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 57: [2022-11-25 12:58:34,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 57: [2022-11-25 12:58:34,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 57: [2022-11-25 12:58:34,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 57: [2022-11-25 12:58:34,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt 57: [2022-11-25 12:58:34,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 57: [2022-11-25 12:58:34,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 22: [2022-11-25 12:58:34,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 22: [2022-11-25 12:58:34,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 22: [2022-11-25 12:58:34,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 22: [2022-11-25 12:58:34,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 22: [2022-11-25 12:58:34,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 22: [2022-11-25 12:58:34,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 22: [2022-11-25 12:58:34,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 22: [2022-11-25 12:58:34,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 22: [2022-11-25 12:58:34,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 22: [2022-11-25 12:58:34,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 22: [2022-11-25 12:58:34,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 22: [2022-11-25 12:58:34,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 22: [2022-11-25 12:58:34,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 22: [2022-11-25 12:58:34,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 22: [2022-11-25 12:58:34,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 22: [2022-11-25 12:58:34,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 22: [2022-11-25 12:58:34,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 22: [2022-11-25 12:58:34,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 22: [2022-11-25 12:58:34,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 22: [2022-11-25 12:58:34,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 22: [2022-11-25 12:58:34,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 22: [2022-11-25 12:58:34,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 22: [2022-11-25 12:58:34,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 22: [2022-11-25 12:58:34,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 2: [2022-11-25 12:58:34,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 2: [2022-11-25 12:58:34,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 2: [2022-11-25 12:58:34,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 9: [2022-11-25 12:58:34,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 9: [2022-11-25 12:58:34,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 9: [2022-11-25 12:58:34,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 9: [2022-11-25 12:58:34,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 9: [2022-11-25 12:58:34,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 9: [2022-11-25 12:58:34,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 9: [2022-11-25 12:58:34,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 9: [2022-11-25 12:58:34,777] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 9: [2022-11-25 12:58:34,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 9: [2022-11-25 12:58:34,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 9: [2022-11-25 12:58:34,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 9: [2022-11-25 12:58:34,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 9: [2022-11-25 12:58:34,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 9: [2022-11-25 12:58:34,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 9: [2022-11-25 12:58:34,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 9: [2022-11-25 12:58:34,777] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 9: [2022-11-25 12:58:34,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 9: [2022-11-25 12:58:34,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 9: [2022-11-25 12:58:34,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 9: [2022-11-25 12:58:34,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 9: [2022-11-25 12:58:34,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 9: [2022-11-25 12:58:34,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 9: [2022-11-25 12:58:34,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 9: [2022-11-25 12:58:34,777] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 63: [2022-11-25 12:58:34,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt. 63: [2022-11-25 12:58:34,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt 63: [2022-11-25 12:58:34,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 36: [2022-11-25 12:58:34,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt. 36: [2022-11-25 12:58:34,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt. 36: [2022-11-25 12:58:34,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt 36: [2022-11-25 12:58:34,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt 36: [2022-11-25 12:58:34,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 36: [2022-11-25 12:58:34,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 36: [2022-11-25 12:58:34,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt. 36: [2022-11-25 12:58:34,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt. 36: [2022-11-25 12:58:34,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt. 36: [2022-11-25 12:58:34,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt. 36: [2022-11-25 12:58:34,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt. 36: [2022-11-25 12:58:34,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt 36: [2022-11-25 12:58:34,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt 36: [2022-11-25 12:58:34,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt 36: [2022-11-25 12:58:34,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt 36: [2022-11-25 12:58:34,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 36: [2022-11-25 12:58:34,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt 36: [2022-11-25 12:58:34,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 36: [2022-11-25 12:58:34,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 36: [2022-11-25 12:58:34,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 36: [2022-11-25 12:58:34,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 36: [2022-11-25 12:58:34,780] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt. 36: [2022-11-25 12:58:34,780] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt 36: [2022-11-25 12:58:34,780] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: [2022-11-25 12:58:34,790] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-25 12:58:34,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 17: [2022-11-25 12:58:34,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 17: [2022-11-25 12:58:34,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 17: [2022-11-25 12:58:34,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 17: [2022-11-25 12:58:34,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 17: [2022-11-25 12:58:34,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 17: [2022-11-25 12:58:34,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 17: [2022-11-25 12:58:34,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 17: [2022-11-25 12:58:34,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 17: [2022-11-25 12:58:34,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 17: [2022-11-25 12:58:34,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 17: [2022-11-25 12:58:34,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 17: [2022-11-25 12:58:34,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 17: [2022-11-25 12:58:34,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 17: [2022-11-25 12:58:34,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 17: [2022-11-25 12:58:34,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 17: [2022-11-25 12:58:34,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 17: [2022-11-25 12:58:34,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 17: [2022-11-25 12:58:34,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 17: [2022-11-25 12:58:34,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 17: [2022-11-25 12:58:34,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 17: [2022-11-25 12:58:34,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 17: [2022-11-25 12:58:34,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 17: [2022-11-25 12:58:34,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 17: [2022-11-25 12:58:34,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 34: [2022-11-25 12:58:34,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt. 34: [2022-11-25 12:58:34,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt. 34: [2022-11-25 12:58:34,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt. 34: [2022-11-25 12:58:34,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt. 34: [2022-11-25 12:58:34,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt. 34: [2022-11-25 12:58:34,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt. 34: [2022-11-25 12:58:34,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt 34: [2022-11-25 12:58:34,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt. 34: [2022-11-25 12:58:34,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt. 34: [2022-11-25 12:58:34,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt 34: [2022-11-25 12:58:34,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt 34: [2022-11-25 12:58:34,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt 34: [2022-11-25 12:58:34,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt 34: [2022-11-25 12:58:34,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt 34: [2022-11-25 12:58:34,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 34: [2022-11-25 12:58:34,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 34: [2022-11-25 12:58:34,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt 34: [2022-11-25 12:58:34,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt 34: [2022-11-25 12:58:34,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 34: [2022-11-25 12:58:34,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 34: [2022-11-25 12:58:34,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 34: [2022-11-25 12:58:34,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 34: [2022-11-25 12:58:34,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 34: [2022-11-25 12:58:34,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 7: [2022-11-25 12:58:34,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 7: [2022-11-25 12:58:34,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 7: [2022-11-25 12:58:34,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 7: [2022-11-25 12:58:34,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 7: [2022-11-25 12:58:34,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 7: [2022-11-25 12:58:34,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 7: [2022-11-25 12:58:34,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 7: [2022-11-25 12:58:34,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 7: [2022-11-25 12:58:34,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 7: [2022-11-25 12:58:34,806] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 7: [2022-11-25 12:58:34,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 7: [2022-11-25 12:58:34,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 7: [2022-11-25 12:58:34,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 7: [2022-11-25 12:58:34,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 7: [2022-11-25 12:58:34,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 7: [2022-11-25 12:58:34,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 7: [2022-11-25 12:58:34,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 7: [2022-11-25 12:58:34,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 7: [2022-11-25 12:58:34,806] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 7: [2022-11-25 12:58:34,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 7: [2022-11-25 12:58:34,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 7: [2022-11-25 12:58:34,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 7: [2022-11-25 12:58:34,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 7: [2022-11-25 12:58:34,806] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 14: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 14: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 14: [2022-11-25 12:58:34,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 46: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt. 46: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt. 14: [2022-11-25 12:58:34,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 14: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 14: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 14: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 14: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 14: [2022-11-25 12:58:34,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 14: [2022-11-25 12:58:34,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 46: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt. 46: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt. 46: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt. 14: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 14: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 46: [2022-11-25 12:58:34,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt 46: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt. 46: [2022-11-25 12:58:34,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt 46: [2022-11-25 12:58:34,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt 46: [2022-11-25 12:58:34,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt 14: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 14: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 46: [2022-11-25 12:58:34,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt 14: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 46: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 46: [2022-11-25 12:58:34,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt 14: [2022-11-25 12:58:34,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 46: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 14: [2022-11-25 12:58:34,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 14: [2022-11-25 12:58:34,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 46: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 46: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 14: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 46: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 14: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 46: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 14: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 46: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt. 46: [2022-11-25 12:58:34,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt 46: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 14: [2022-11-25 12:58:34,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 14: [2022-11-25 12:58:34,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 14: [2022-11-25 12:58:34,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 47: [2022-11-25 12:58:34,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt. 47: [2022-11-25 12:58:34,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt 47: [2022-11-25 12:58:34,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 47: [2022-11-25 12:58:34,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt. 47: [2022-11-25 12:58:34,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt. 47: [2022-11-25 12:58:34,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt. 47: [2022-11-25 12:58:34,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt. 47: [2022-11-25 12:58:34,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt. 47: [2022-11-25 12:58:34,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt. 47: [2022-11-25 12:58:34,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt 47: [2022-11-25 12:58:34,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt 47: [2022-11-25 12:58:34,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt 47: [2022-11-25 12:58:34,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt 47: [2022-11-25 12:58:34,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt 47: [2022-11-25 12:58:34,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt 47: [2022-11-25 12:58:34,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 47: [2022-11-25 12:58:34,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 47: [2022-11-25 12:58:34,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 47: [2022-11-25 12:58:34,820] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt. 47: [2022-11-25 12:58:34,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 47: [2022-11-25 12:58:34,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 47: [2022-11-25 12:58:34,820] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 47: [2022-11-25 12:58:34,820] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt 47: [2022-11-25 12:58:34,821] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 50: [2022-11-25 12:58:34,823] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt. 50: [2022-11-25 12:58:34,823] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt 50: [2022-11-25 12:58:34,823] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 13: [2022-11-25 12:58:34,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 13: [2022-11-25 12:58:34,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 13: [2022-11-25 12:58:34,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 28: [2022-11-25 12:58:34,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 28: [2022-11-25 12:58:34,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 28: [2022-11-25 12:58:34,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 12:58:34,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt. 54: [2022-11-25 12:58:34,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt. 54: [2022-11-25 12:58:34,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt. 54: [2022-11-25 12:58:34,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt. 54: [2022-11-25 12:58:34,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt. 54: [2022-11-25 12:58:34,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt 54: [2022-11-25 12:58:34,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt. 54: [2022-11-25 12:58:34,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt. 54: [2022-11-25 12:58:34,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt 54: [2022-11-25 12:58:34,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt. 54: [2022-11-25 12:58:34,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 12:58:34,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt 54: [2022-11-25 12:58:34,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt 54: [2022-11-25 12:58:34,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt 54: [2022-11-25 12:58:34,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 12:58:34,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt 54: [2022-11-25 12:58:34,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt 54: [2022-11-25 12:58:34,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt 54: [2022-11-25 12:58:34,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 12:58:34,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 12:58:34,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 12:58:34,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 12:58:34,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 54: [2022-11-25 12:58:34,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 46: [2022-11-25 12:58:34,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt. 46: [2022-11-25 12:58:34,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt 46: [2022-11-25 12:58:34,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 60: [2022-11-25 12:58:34,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt. 60: [2022-11-25 12:58:34,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt 60: [2022-11-25 12:58:34,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 43: [2022-11-25 12:58:34,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt. 43: [2022-11-25 12:58:34,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt. 43: [2022-11-25 12:58:34,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt. 43: [2022-11-25 12:58:34,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt 43: [2022-11-25 12:58:34,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt. 43: [2022-11-25 12:58:34,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt. 43: [2022-11-25 12:58:34,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt. 43: [2022-11-25 12:58:34,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt. 43: [2022-11-25 12:58:34,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt 43: [2022-11-25 12:58:34,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 43: [2022-11-25 12:58:34,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt 43: [2022-11-25 12:58:34,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt 43: [2022-11-25 12:58:34,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt 43: [2022-11-25 12:58:34,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt 43: [2022-11-25 12:58:34,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt 43: [2022-11-25 12:58:34,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 43: [2022-11-25 12:58:34,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 43: [2022-11-25 12:58:34,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 43: [2022-11-25 12:58:34,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 43: [2022-11-25 12:58:34,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 43: [2022-11-25 12:58:34,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 43: [2022-11-25 12:58:34,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt. 43: [2022-11-25 12:58:34,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt 43: [2022-11-25 12:58:34,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 28: [2022-11-25 12:58:34,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 28: [2022-11-25 12:58:34,853] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 28: [2022-11-25 12:58:34,853] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 55: [2022-11-25 12:58:34,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt. 55: [2022-11-25 12:58:34,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt 39: [2022-11-25 12:58:34,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt. 39: [2022-11-25 12:58:34,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt 55: [2022-11-25 12:58:34,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 39: [2022-11-25 12:58:34,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 26: [2022-11-25 12:58:34,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 26: [2022-11-25 12:58:34,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 26: [2022-11-25 12:58:34,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 26: [2022-11-25 12:58:34,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 26: [2022-11-25 12:58:34,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 26: [2022-11-25 12:58:34,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 26: [2022-11-25 12:58:34,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 26: [2022-11-25 12:58:34,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 26: [2022-11-25 12:58:34,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 8: [2022-11-25 12:58:34,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 8: [2022-11-25 12:58:34,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 8: [2022-11-25 12:58:34,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 19: [2022-11-25 12:58:34,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 19: [2022-11-25 12:58:34,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 19: [2022-11-25 12:58:34,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 3: [2022-11-25 12:58:34,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 3: [2022-11-25 12:58:34,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 3: [2022-11-25 12:58:34,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 3: [2022-11-25 12:58:34,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 3: [2022-11-25 12:58:34,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 3: [2022-11-25 12:58:34,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 3: [2022-11-25 12:58:34,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 3: [2022-11-25 12:58:34,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 3: [2022-11-25 12:58:34,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 3: [2022-11-25 12:58:34,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 3: [2022-11-25 12:58:34,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 3: [2022-11-25 12:58:34,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 3: [2022-11-25 12:58:34,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 3: [2022-11-25 12:58:34,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 3: [2022-11-25 12:58:34,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 3: [2022-11-25 12:58:34,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 3: [2022-11-25 12:58:34,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 3: [2022-11-25 12:58:34,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 3: [2022-11-25 12:58:34,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 3: [2022-11-25 12:58:34,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 3: [2022-11-25 12:58:34,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 3: [2022-11-25 12:58:34,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 3: [2022-11-25 12:58:34,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 3: [2022-11-25 12:58:34,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 18: [2022-11-25 12:58:34,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 18: [2022-11-25 12:58:34,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 18: [2022-11-25 12:58:34,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 23: [2022-11-25 12:58:34,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 23: [2022-11-25 12:58:34,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 23: [2022-11-25 12:58:34,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 23: [2022-11-25 12:58:34,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 23: [2022-11-25 12:58:34,938] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 23: [2022-11-25 12:58:34,938] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 41: [2022-11-25 12:58:34,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt. 41: [2022-11-25 12:58:34,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt 41: [2022-11-25 12:58:34,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 29: [2022-11-25 12:58:34,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 29: [2022-11-25 12:58:34,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 29: [2022-11-25 12:58:34,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 44: [2022-11-25 12:58:34,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt. 44: [2022-11-25 12:58:34,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step2000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt 44: [2022-11-25 12:58:34,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! 0: successfully saved checkpoint at iteration 2000 to checkpoints_8b7 63: time (ms) | save-checkpoint: 6932.13 63: iteration 2010/ 5494 | consumed samples: 2058240 | consumed tokens: 4215275520 | elapsed time per iteration (s): 6.64 | learning rate: 1.485E-04 | global batch size: 1024 | lm loss: 2.519866E+00 | grad norm: 0.170 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 154.268 | TFLOPs: 34.49 | 63: iteration 2020/ 5494 | consumed samples: 2068480 | consumed tokens: 4236247040 | elapsed time per iteration (s): 5.75 | learning rate: 1.480E-04 | global batch size: 1024 | lm loss: 2.512274E+00 | grad norm: 0.191 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.050 | TFLOPs: 39.81 | 63: iteration 2030/ 5494 | consumed samples: 2078720 | consumed tokens: 4257218560 | elapsed time per iteration (s): 5.52 | learning rate: 1.475E-04 | global batch size: 1024 | lm loss: 2.501692E+00 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.501 | TFLOPs: 41.47 | 63: iteration 2040/ 5494 | consumed samples: 2088960 | consumed tokens: 4278190080 | elapsed time per iteration (s): 5.40 | learning rate: 1.470E-04 | global batch size: 1024 | lm loss: 2.505953E+00 | grad norm: 0.169 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.751 | TFLOPs: 42.42 | 63: iteration 2050/ 5494 | consumed samples: 2099200 | consumed tokens: 4299161600 | elapsed time per iteration (s): 5.52 | learning rate: 1.466E-04 | global batch size: 1024 | lm loss: 2.518596E+00 | grad norm: 0.203 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.359 | TFLOPs: 41.44 | 63: iteration 2060/ 5494 | consumed samples: 2109440 | consumed tokens: 4320133120 | elapsed time per iteration (s): 5.83 | learning rate: 1.461E-04 | global batch size: 1024 | lm loss: 2.498713E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.540 | TFLOPs: 39.24 | 63: iteration 2070/ 5494 | consumed samples: 2119680 | consumed tokens: 4341104640 | elapsed time per iteration (s): 5.52 | learning rate: 1.456E-04 | global batch size: 1024 | lm loss: 2.509071E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.589 | TFLOPs: 41.49 | 63: iteration 2080/ 5494 | consumed samples: 2129920 | consumed tokens: 4362076160 | elapsed time per iteration (s): 5.61 | learning rate: 1.451E-04 | global batch size: 1024 | lm loss: 2.522034E+00 | grad norm: 0.241 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.561 | TFLOPs: 40.81 | 63: iteration 2090/ 5494 | consumed samples: 2140160 | consumed tokens: 4383047680 | elapsed time per iteration (s): 5.51 | learning rate: 1.447E-04 | global batch size: 1024 | lm loss: 2.520369E+00 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.747 | TFLOPs: 41.53 | 63: iteration 2100/ 5494 | consumed samples: 2150400 | consumed tokens: 4404019200 | elapsed time per iteration (s): 5.76 | learning rate: 1.442E-04 | global batch size: 1024 | lm loss: 2.478450E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.751 | TFLOPs: 39.74 | 63: iteration 2110/ 5494 | consumed samples: 2160640 | consumed tokens: 4424990720 | elapsed time per iteration (s): 5.64 | learning rate: 1.437E-04 | global batch size: 1024 | lm loss: 2.476984E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.623 | TFLOPs: 40.60 | 63: iteration 2120/ 5494 | consumed samples: 2170880 | consumed tokens: 4445962240 | elapsed time per iteration (s): 5.59 | learning rate: 1.432E-04 | global batch size: 1024 | lm loss: 2.489861E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.150 | TFLOPs: 40.95 | 63: iteration 2130/ 5494 | consumed samples: 2181120 | consumed tokens: 4466933760 | elapsed time per iteration (s): 5.51 | learning rate: 1.427E-04 | global batch size: 1024 | lm loss: 2.477504E+00 | grad norm: 0.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.699 | TFLOPs: 41.52 | 63: iteration 2140/ 5494 | consumed samples: 2191360 | consumed tokens: 4487905280 | elapsed time per iteration (s): 6.06 | learning rate: 1.423E-04 | global batch size: 1024 | lm loss: 2.476510E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.894 | TFLOPs: 37.76 | 63: iteration 2150/ 5494 | consumed samples: 2201600 | consumed tokens: 4508876800 | elapsed time per iteration (s): 5.77 | learning rate: 1.418E-04 | global batch size: 1024 | lm loss: 2.493169E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.489 | TFLOPs: 39.68 | 63: iteration 2160/ 5494 | consumed samples: 2211840 | consumed tokens: 4529848320 | elapsed time per iteration (s): 5.50 | learning rate: 1.413E-04 | global batch size: 1024 | lm loss: 2.476371E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.094 | TFLOPs: 41.60 | 63: iteration 2170/ 5494 | consumed samples: 2222080 | consumed tokens: 4550819840 | elapsed time per iteration (s): 5.76 | learning rate: 1.408E-04 | global batch size: 1024 | lm loss: 2.468200E+00 | grad norm: 0.200 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.687 | TFLOPs: 39.72 | 63: iteration 2180/ 5494 | consumed samples: 2232320 | consumed tokens: 4571791360 | elapsed time per iteration (s): 5.66 | learning rate: 1.403E-04 | global batch size: 1024 | lm loss: 2.457799E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.862 | TFLOPs: 40.43 | 63: iteration 2190/ 5494 | consumed samples: 2242560 | consumed tokens: 4592762880 | elapsed time per iteration (s): 5.53 | learning rate: 1.398E-04 | global batch size: 1024 | lm loss: 2.472074E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.120 | TFLOPs: 41.39 | 63: iteration 2200/ 5494 | consumed samples: 2252800 | consumed tokens: 4613734400 | elapsed time per iteration (s): 5.73 | learning rate: 1.393E-04 | global batch size: 1024 | lm loss: 2.457623E+00 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.603 | TFLOPs: 39.93 | 63: iteration 2210/ 5494 | consumed samples: 2263040 | consumed tokens: 4634705920 | elapsed time per iteration (s): 6.03 | learning rate: 1.388E-04 | global batch size: 1024 | lm loss: 2.468433E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 169.923 | TFLOPs: 37.99 | 63: iteration 2220/ 5494 | consumed samples: 2273280 | consumed tokens: 4655677440 | elapsed time per iteration (s): 5.82 | learning rate: 1.383E-04 | global batch size: 1024 | lm loss: 2.457516E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.939 | TFLOPs: 39.33 | 63: iteration 2230/ 5494 | consumed samples: 2283520 | consumed tokens: 4676648960 | elapsed time per iteration (s): 5.69 | learning rate: 1.378E-04 | global batch size: 1024 | lm loss: 2.451455E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.844 | TFLOPs: 40.21 | 63: iteration 2240/ 5494 | consumed samples: 2293760 | consumed tokens: 4697620480 | elapsed time per iteration (s): 5.65 | learning rate: 1.373E-04 | global batch size: 1024 | lm loss: 2.449718E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.330 | TFLOPs: 40.54 | 63: iteration 2250/ 5494 | consumed samples: 2304000 | consumed tokens: 4718592000 | elapsed time per iteration (s): 5.76 | learning rate: 1.369E-04 | global batch size: 1024 | lm loss: 2.444774E+00 | grad norm: 0.174 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.638 | TFLOPs: 39.71 | 63: iteration 2260/ 5494 | consumed samples: 2314240 | consumed tokens: 4739563520 | elapsed time per iteration (s): 5.53 | learning rate: 1.364E-04 | global batch size: 1024 | lm loss: 2.459115E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.195 | TFLOPs: 41.40 | 63: iteration 2270/ 5494 | consumed samples: 2324480 | consumed tokens: 4760535040 | elapsed time per iteration (s): 5.84 | learning rate: 1.359E-04 | global batch size: 1024 | lm loss: 2.470336E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.242 | TFLOPs: 39.18 | 63: iteration 2280/ 5494 | consumed samples: 2334720 | consumed tokens: 4781506560 | elapsed time per iteration (s): 6.10 | learning rate: 1.354E-04 | global batch size: 1024 | lm loss: 2.446955E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 167.770 | TFLOPs: 37.51 | 63: iteration 2290/ 5494 | consumed samples: 2344960 | consumed tokens: 4802478080 | elapsed time per iteration (s): 5.66 | learning rate: 1.349E-04 | global batch size: 1024 | lm loss: 2.452139E+00 | grad norm: 0.222 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.871 | TFLOPs: 40.44 | 63: iteration 2300/ 5494 | consumed samples: 2355200 | consumed tokens: 4823449600 | elapsed time per iteration (s): 5.42 | learning rate: 1.344E-04 | global batch size: 1024 | lm loss: 2.463716E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.094 | TFLOPs: 42.28 | 63: iteration 2310/ 5494 | consumed samples: 2365440 | consumed tokens: 4844421120 | elapsed time per iteration (s): 6.73 | learning rate: 1.339E-04 | global batch size: 1024 | lm loss: 2.433751E+00 | grad norm: 0.199 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 152.238 | TFLOPs: 34.04 | 63: iteration 2320/ 5494 | consumed samples: 2375680 | consumed tokens: 4865392640 | elapsed time per iteration (s): 5.76 | learning rate: 1.334E-04 | global batch size: 1024 | lm loss: 2.456863E+00 | grad norm: 0.231 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.898 | TFLOPs: 39.77 | 63: iteration 2330/ 5494 | consumed samples: 2385920 | consumed tokens: 4886364160 | elapsed time per iteration (s): 5.65 | learning rate: 1.329E-04 | global batch size: 1024 | lm loss: 2.436933E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.223 | TFLOPs: 40.52 | 63: iteration 2340/ 5494 | consumed samples: 2396160 | consumed tokens: 4907335680 | elapsed time per iteration (s): 5.66 | learning rate: 1.324E-04 | global batch size: 1024 | lm loss: 2.443720E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.785 | TFLOPs: 40.42 | 63: iteration 2350/ 5494 | consumed samples: 2406400 | consumed tokens: 4928307200 | elapsed time per iteration (s): 5.40 | learning rate: 1.318E-04 | global batch size: 1024 | lm loss: 2.435389E+00 | grad norm: 0.218 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.469 | TFLOPs: 42.36 | 63: iteration 2360/ 5494 | consumed samples: 2416640 | consumed tokens: 4949278720 | elapsed time per iteration (s): 5.65 | learning rate: 1.313E-04 | global batch size: 1024 | lm loss: 2.430875E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.198 | TFLOPs: 40.51 | 63: iteration 2370/ 5494 | consumed samples: 2426880 | consumed tokens: 4970250240 | elapsed time per iteration (s): 5.86 | learning rate: 1.308E-04 | global batch size: 1024 | lm loss: 2.448102E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.713 | TFLOPs: 39.06 | 63: iteration 2380/ 5494 | consumed samples: 2437120 | consumed tokens: 4991221760 | elapsed time per iteration (s): 5.66 | learning rate: 1.303E-04 | global batch size: 1024 | lm loss: 2.424037E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.961 | TFLOPs: 40.46 | 63: iteration 2390/ 5494 | consumed samples: 2447360 | consumed tokens: 5012193280 | elapsed time per iteration (s): 5.53 | learning rate: 1.298E-04 | global batch size: 1024 | lm loss: 2.446300E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.249 | TFLOPs: 41.42 | 63: iteration 2400/ 5494 | consumed samples: 2457600 | consumed tokens: 5033164800 | elapsed time per iteration (s): 5.90 | learning rate: 1.293E-04 | global batch size: 1024 | lm loss: 2.406252E+00 | grad norm: 0.195 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.696 | TFLOPs: 38.83 | 63: iteration 2410/ 5494 | consumed samples: 2467840 | consumed tokens: 5054136320 | elapsed time per iteration (s): 5.78 | learning rate: 1.288E-04 | global batch size: 1024 | lm loss: 2.429000E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.269 | TFLOPs: 39.63 | 63: iteration 2420/ 5494 | consumed samples: 2478080 | consumed tokens: 5075107840 | elapsed time per iteration (s): 5.91 | learning rate: 1.283E-04 | global batch size: 1024 | lm loss: 2.421251E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.234 | TFLOPs: 38.73 | 63: iteration 2430/ 5494 | consumed samples: 2488320 | consumed tokens: 5096079360 | elapsed time per iteration (s): 5.84 | learning rate: 1.278E-04 | global batch size: 1024 | lm loss: 2.414101E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.198 | TFLOPs: 39.17 | 63: iteration 2440/ 5494 | consumed samples: 2498560 | consumed tokens: 5117050880 | elapsed time per iteration (s): 5.58 | learning rate: 1.273E-04 | global batch size: 1024 | lm loss: 2.428672E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.635 | TFLOPs: 41.05 | 63: iteration 2450/ 5494 | consumed samples: 2508800 | consumed tokens: 5138022400 | elapsed time per iteration (s): 5.50 | learning rate: 1.268E-04 | global batch size: 1024 | lm loss: 2.424636E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.350 | TFLOPs: 41.66 | 63: iteration 2460/ 5494 | consumed samples: 2519040 | consumed tokens: 5158993920 | elapsed time per iteration (s): 5.65 | learning rate: 1.263E-04 | global batch size: 1024 | lm loss: 2.407756E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.125 | TFLOPs: 40.49 | 63: iteration 2470/ 5494 | consumed samples: 2529280 | consumed tokens: 5179965440 | elapsed time per iteration (s): 5.76 | learning rate: 1.257E-04 | global batch size: 1024 | lm loss: 2.413894E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.712 | TFLOPs: 39.73 | 63: iteration 2480/ 5494 | consumed samples: 2539520 | consumed tokens: 5200936960 | elapsed time per iteration (s): 5.43 | learning rate: 1.252E-04 | global batch size: 1024 | lm loss: 2.440575E+00 | grad norm: 0.178 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 188.634 | TFLOPs: 42.17 | 63: iteration 2490/ 5494 | consumed samples: 2549760 | consumed tokens: 5221908480 | elapsed time per iteration (s): 5.83 | learning rate: 1.247E-04 | global batch size: 1024 | lm loss: 2.407724E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.575 | TFLOPs: 39.25 | 63: iteration 2500/ 5494 | consumed samples: 2560000 | consumed tokens: 5242880000 | elapsed time per iteration (s): 5.77 | learning rate: 1.242E-04 | global batch size: 1024 | lm loss: 2.419824E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.486 | TFLOPs: 39.68 | 63: iteration 2510/ 5494 | consumed samples: 2570240 | consumed tokens: 5263851520 | elapsed time per iteration (s): 5.83 | learning rate: 1.237E-04 | global batch size: 1024 | lm loss: 2.425478E+00 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.654 | TFLOPs: 39.27 | 63: iteration 2520/ 5494 | consumed samples: 2580480 | consumed tokens: 5284823040 | elapsed time per iteration (s): 5.52 | learning rate: 1.232E-04 | global batch size: 1024 | lm loss: 2.403011E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.520 | TFLOPs: 41.48 | 63: iteration 2530/ 5494 | consumed samples: 2590720 | consumed tokens: 5305794560 | elapsed time per iteration (s): 5.90 | learning rate: 1.227E-04 | global batch size: 1024 | lm loss: 2.394941E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.567 | TFLOPs: 38.80 | 63: iteration 2540/ 5494 | consumed samples: 2600960 | consumed tokens: 5326766080 | elapsed time per iteration (s): 5.55 | learning rate: 1.222E-04 | global batch size: 1024 | lm loss: 2.402253E+00 | grad norm: 0.187 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.523 | TFLOPs: 41.25 | 63: iteration 2550/ 5494 | consumed samples: 2611200 | consumed tokens: 5347737600 | elapsed time per iteration (s): 5.41 | learning rate: 1.216E-04 | global batch size: 1024 | lm loss: 2.408228E+00 | grad norm: 0.182 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.300 | TFLOPs: 42.32 | 63: iteration 2560/ 5494 | consumed samples: 2621440 | consumed tokens: 5368709120 | elapsed time per iteration (s): 5.54 | learning rate: 1.211E-04 | global batch size: 1024 | lm loss: 2.396873E+00 | grad norm: 0.184 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.691 | TFLOPs: 41.29 | 63: iteration 2570/ 5494 | consumed samples: 2631680 | consumed tokens: 5389680640 | elapsed time per iteration (s): 5.77 | learning rate: 1.206E-04 | global batch size: 1024 | lm loss: 2.397542E+00 | grad norm: 0.192 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.433 | TFLOPs: 39.67 | 63: iteration 2580/ 5494 | consumed samples: 2641920 | consumed tokens: 5410652160 | elapsed time per iteration (s): 5.54 | learning rate: 1.201E-04 | global batch size: 1024 | lm loss: 2.407163E+00 | grad norm: 0.185 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.830 | TFLOPs: 41.32 | 63: iteration 2590/ 5494 | consumed samples: 2652160 | consumed tokens: 5431623680 | elapsed time per iteration (s): 5.79 | learning rate: 1.196E-04 | global batch size: 1024 | lm loss: 2.401608E+00 | grad norm: 0.172 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.821 | TFLOPs: 39.53 | 63: iteration 2600/ 5494 | consumed samples: 2662400 | consumed tokens: 5452595200 | elapsed time per iteration (s): 5.51 | learning rate: 1.191E-04 | global batch size: 1024 | lm loss: 2.390104E+00 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.742 | TFLOPs: 41.53 | 63: iteration 2610/ 5494 | consumed samples: 2672640 | consumed tokens: 5473566720 | elapsed time per iteration (s): 5.65 | learning rate: 1.185E-04 | global batch size: 1024 | lm loss: 2.395837E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.128 | TFLOPs: 40.49 | 63: iteration 2620/ 5494 | consumed samples: 2682880 | consumed tokens: 5494538240 | elapsed time per iteration (s): 5.77 | learning rate: 1.180E-04 | global batch size: 1024 | lm loss: 2.385051E+00 | grad norm: 0.197 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.605 | TFLOPs: 39.71 | 63: iteration 2630/ 5494 | consumed samples: 2693120 | consumed tokens: 5515509760 | elapsed time per iteration (s): 5.52 | learning rate: 1.175E-04 | global batch size: 1024 | lm loss: 2.400160E+00 | grad norm: 0.202 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.511 | TFLOPs: 41.47 | 63: iteration 2640/ 5494 | consumed samples: 2703360 | consumed tokens: 5536481280 | elapsed time per iteration (s): 5.53 | learning rate: 1.170E-04 | global batch size: 1024 | lm loss: 2.378311E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.264 | TFLOPs: 41.42 | 63: iteration 2650/ 5494 | consumed samples: 2713600 | consumed tokens: 5557452800 | elapsed time per iteration (s): 5.79 | learning rate: 1.165E-04 | global batch size: 1024 | lm loss: 2.384426E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.930 | TFLOPs: 39.56 | 63: iteration 2660/ 5494 | consumed samples: 2723840 | consumed tokens: 5578424320 | elapsed time per iteration (s): 5.93 | learning rate: 1.159E-04 | global batch size: 1024 | lm loss: 2.368144E+00 | grad norm: 0.238 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.556 | TFLOPs: 38.58 | 63: iteration 2670/ 5494 | consumed samples: 2734080 | consumed tokens: 5599395840 | elapsed time per iteration (s): 5.53 | learning rate: 1.154E-04 | global batch size: 1024 | lm loss: 2.391520E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.215 | TFLOPs: 41.41 | 63: iteration 2680/ 5494 | consumed samples: 2744320 | consumed tokens: 5620367360 | elapsed time per iteration (s): 5.64 | learning rate: 1.149E-04 | global batch size: 1024 | lm loss: 2.395207E+00 | grad norm: 0.181 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.417 | TFLOPs: 40.56 | 63: iteration 2690/ 5494 | consumed samples: 2754560 | consumed tokens: 5641338880 | elapsed time per iteration (s): 5.44 | learning rate: 1.144E-04 | global batch size: 1024 | lm loss: 2.373984E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 188.064 | TFLOPs: 42.04 | 63: iteration 2700/ 5494 | consumed samples: 2764800 | consumed tokens: 5662310400 | elapsed time per iteration (s): 5.73 | learning rate: 1.139E-04 | global batch size: 1024 | lm loss: 2.355459E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.679 | TFLOPs: 39.95 | 63: iteration 2710/ 5494 | consumed samples: 2775040 | consumed tokens: 5683281920 | elapsed time per iteration (s): 6.01 | learning rate: 1.134E-04 | global batch size: 1024 | lm loss: 2.368085E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 170.466 | TFLOPs: 38.11 | 63: iteration 2720/ 5494 | consumed samples: 2785280 | consumed tokens: 5704253440 | elapsed time per iteration (s): 5.54 | learning rate: 1.128E-04 | global batch size: 1024 | lm loss: 2.375288E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.904 | TFLOPs: 41.34 | 63: iteration 2730/ 5494 | consumed samples: 2795520 | consumed tokens: 5725224960 | elapsed time per iteration (s): 5.60 | learning rate: 1.123E-04 | global batch size: 1024 | lm loss: 2.369791E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.799 | TFLOPs: 40.87 | 63: iteration 2740/ 5494 | consumed samples: 2805760 | consumed tokens: 5746196480 | elapsed time per iteration (s): 5.66 | learning rate: 1.118E-04 | global batch size: 1024 | lm loss: 2.357708E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.039 | TFLOPs: 40.47 | 63: iteration 2750/ 5494 | consumed samples: 2816000 | consumed tokens: 5767168000 | elapsed time per iteration (s): 5.43 | learning rate: 1.113E-04 | global batch size: 1024 | lm loss: 2.368623E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 188.456 | TFLOPs: 42.13 | 63: iteration 2760/ 5494 | consumed samples: 2826240 | consumed tokens: 5788139520 | elapsed time per iteration (s): 5.63 | learning rate: 1.108E-04 | global batch size: 1024 | lm loss: 2.363576E+00 | grad norm: 0.186 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.018 | TFLOPs: 40.69 | 63: iteration 2770/ 5494 | consumed samples: 2836480 | consumed tokens: 5809111040 | elapsed time per iteration (s): 5.66 | learning rate: 1.102E-04 | global batch size: 1024 | lm loss: 2.363386E+00 | grad norm: 0.188 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.763 | TFLOPs: 40.41 | 63: iteration 2780/ 5494 | consumed samples: 2846720 | consumed tokens: 5830082560 | elapsed time per iteration (s): 5.54 | learning rate: 1.097E-04 | global batch size: 1024 | lm loss: 2.373815E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.842 | TFLOPs: 41.32 | 63: iteration 2790/ 5494 | consumed samples: 2856960 | consumed tokens: 5851054080 | elapsed time per iteration (s): 5.88 | learning rate: 1.092E-04 | global batch size: 1024 | lm loss: 2.355998E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.135 | TFLOPs: 38.93 | 63: iteration 2800/ 5494 | consumed samples: 2867200 | consumed tokens: 5872025600 | elapsed time per iteration (s): 5.52 | learning rate: 1.087E-04 | global batch size: 1024 | lm loss: 2.358995E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.362 | TFLOPs: 41.44 | 63: iteration 2810/ 5494 | consumed samples: 2877440 | consumed tokens: 5892997120 | elapsed time per iteration (s): 5.54 | learning rate: 1.082E-04 | global batch size: 1024 | lm loss: 2.375492E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.810 | TFLOPs: 41.32 | 63: iteration 2820/ 5494 | consumed samples: 2887680 | consumed tokens: 5913968640 | elapsed time per iteration (s): 5.55 | learning rate: 1.076E-04 | global batch size: 1024 | lm loss: 2.369288E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.357 | TFLOPs: 41.22 | 63: iteration 2830/ 5494 | consumed samples: 2897920 | consumed tokens: 5934940160 | elapsed time per iteration (s): 5.67 | learning rate: 1.071E-04 | global batch size: 1024 | lm loss: 2.345364E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.654 | TFLOPs: 40.39 | 63: iteration 2840/ 5494 | consumed samples: 2908160 | consumed tokens: 5955911680 | elapsed time per iteration (s): 5.53 | learning rate: 1.066E-04 | global batch size: 1024 | lm loss: 2.362342E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.120 | TFLOPs: 41.39 | 63: iteration 2850/ 5494 | consumed samples: 2918400 | consumed tokens: 5976883200 | elapsed time per iteration (s): 5.62 | learning rate: 1.061E-04 | global batch size: 1024 | lm loss: 2.360204E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.354 | TFLOPs: 40.77 | 63: iteration 2860/ 5494 | consumed samples: 2928640 | consumed tokens: 5997854720 | elapsed time per iteration (s): 5.76 | learning rate: 1.056E-04 | global batch size: 1024 | lm loss: 2.340488E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.903 | TFLOPs: 39.77 | 63: iteration 2870/ 5494 | consumed samples: 2938880 | consumed tokens: 6018826240 | elapsed time per iteration (s): 5.40 | learning rate: 1.050E-04 | global batch size: 1024 | lm loss: 2.355475E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.758 | TFLOPs: 42.42 | 63: iteration 2880/ 5494 | consumed samples: 2949120 | consumed tokens: 6039797760 | elapsed time per iteration (s): 5.59 | learning rate: 1.045E-04 | global batch size: 1024 | lm loss: 2.364615E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.047 | TFLOPs: 40.92 | 63: iteration 2890/ 5494 | consumed samples: 2959360 | consumed tokens: 6060769280 | elapsed time per iteration (s): 5.53 | learning rate: 1.040E-04 | global batch size: 1024 | lm loss: 2.354544E+00 | grad norm: 0.177 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.256 | TFLOPs: 41.42 | 63: iteration 2900/ 5494 | consumed samples: 2969600 | consumed tokens: 6081740800 | elapsed time per iteration (s): 5.53 | learning rate: 1.035E-04 | global batch size: 1024 | lm loss: 2.345100E+00 | grad norm: 0.193 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.142 | TFLOPs: 41.39 | 63: iteration 2910/ 5494 | consumed samples: 2979840 | consumed tokens: 6102712320 | elapsed time per iteration (s): 5.51 | learning rate: 1.030E-04 | global batch size: 1024 | lm loss: 2.362236E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.736 | TFLOPs: 41.52 | 63: iteration 2920/ 5494 | consumed samples: 2990080 | consumed tokens: 6123683840 | elapsed time per iteration (s): 5.52 | learning rate: 1.024E-04 | global batch size: 1024 | lm loss: 2.343294E+00 | grad norm: 0.189 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.340 | TFLOPs: 41.44 | 63: iteration 2930/ 5494 | consumed samples: 3000320 | consumed tokens: 6144655360 | elapsed time per iteration (s): 6.07 | learning rate: 1.019E-04 | global batch size: 1024 | lm loss: 2.343387E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.633 | TFLOPs: 37.70 | 63: iteration 2940/ 5494 | consumed samples: 3010560 | consumed tokens: 6165626880 | elapsed time per iteration (s): 5.68 | learning rate: 1.014E-04 | global batch size: 1024 | lm loss: 2.338488E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.199 | TFLOPs: 40.29 | 63: iteration 2950/ 5494 | consumed samples: 3020800 | consumed tokens: 6186598400 | elapsed time per iteration (s): 5.74 | learning rate: 1.009E-04 | global batch size: 1024 | lm loss: 2.340277E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.295 | TFLOPs: 39.86 | 63: iteration 2960/ 5494 | consumed samples: 3031040 | consumed tokens: 6207569920 | elapsed time per iteration (s): 5.41 | learning rate: 1.004E-04 | global batch size: 1024 | lm loss: 2.321365E+00 | grad norm: 0.227 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.355 | TFLOPs: 42.33 | 63: iteration 2970/ 5494 | consumed samples: 3041280 | consumed tokens: 6228541440 | elapsed time per iteration (s): 5.56 | learning rate: 9.986E-05 | global batch size: 1024 | lm loss: 2.335113E+00 | grad norm: 0.207 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.150 | TFLOPs: 41.17 | 63: iteration 2980/ 5494 | consumed samples: 3051520 | consumed tokens: 6249512960 | elapsed time per iteration (s): 5.71 | learning rate: 9.934E-05 | global batch size: 1024 | lm loss: 2.348417E+00 | grad norm: 0.168 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.309 | TFLOPs: 40.09 | 63: iteration 2990/ 5494 | consumed samples: 3061760 | consumed tokens: 6270484480 | elapsed time per iteration (s): 5.67 | learning rate: 9.883E-05 | global batch size: 1024 | lm loss: 2.337943E+00 | grad norm: 0.169 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.497 | TFLOPs: 40.35 | 63: iteration 3000/ 5494 | consumed samples: 3072000 | consumed tokens: 6291456000 | elapsed time per iteration (s): 5.53 | learning rate: 9.831E-05 | global batch size: 1024 | lm loss: 2.349378E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.075 | TFLOPs: 41.38 | 63: ------------------------------------------------------------------------------------------ 63: valid loss at iteration 3000 | lm loss value: 2.279929E+00 | lm loss PPL: 9.775988E+00 | 63: ------------------------------------------------------------------------------------------ 0: saving checkpoint at iteration 3000 to checkpoints_8b7 0: [2022-11-25 14:33:05,106] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3000 is begin to save! 0: [2022-11-25 14:33:05,181] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_01-model_00-model_states.pt... 0: [2022-11-25 14:33:05,181] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_01-model_01-model_states.pt... 32: [2022-11-25 14:33:05,181] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_24-model_01-model_states.pt... 32: [2022-11-25 14:33:05,181] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_24-model_00-model_states.pt... 0: [2022-11-25 14:33:05,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_01-model_01-model_states.pt. 0: [2022-11-25 14:33:05,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_01-model_00-model_states.pt. 32: [2022-11-25 14:33:05,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_24-model_00-model_states.pt. 32: [2022-11-25 14:33:05,744] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_24-model_01-model_states.pt. 0: [2022-11-25 14:33:05,745] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_03-model_00-model_states.pt... 0: [2022-11-25 14:33:05,745] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_03-model_01-model_states.pt... 32: [2022-11-25 14:33:05,745] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_25-model_00-model_states.pt... 32: [2022-11-25 14:33:05,745] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_25-model_01-model_states.pt... 32: [2022-11-25 14:33:05,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_25-model_01-model_states.pt. 32: [2022-11-25 14:33:05,993] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_26-model_01-model_states.pt... 0: [2022-11-25 14:33:05,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_03-model_00-model_states.pt. 0: [2022-11-25 14:33:05,991] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_04-model_00-model_states.pt... 32: [2022-11-25 14:33:06,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_25-model_00-model_states.pt. 32: [2022-11-25 14:33:06,001] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_26-model_00-model_states.pt... 0: [2022-11-25 14:33:06,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_03-model_01-model_states.pt. 0: [2022-11-25 14:33:06,013] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_04-model_01-model_states.pt... 0: [2022-11-25 14:33:06,260] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_04-model_01-model_states.pt. 32: [2022-11-25 14:33:06,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_26-model_01-model_states.pt. 0: [2022-11-25 14:33:06,261] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_05-model_01-model_states.pt... 32: [2022-11-25 14:33:06,261] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_27-model_01-model_states.pt... 0: [2022-11-25 14:33:06,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_04-model_00-model_states.pt. 0: [2022-11-25 14:33:06,272] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_05-model_00-model_states.pt... 32: [2022-11-25 14:33:06,273] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_26-model_00-model_states.pt. 32: [2022-11-25 14:33:06,274] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_27-model_00-model_states.pt... 32: [2022-11-25 14:33:06,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_27-model_00-model_states.pt. 0: [2022-11-25 14:33:06,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_05-model_01-model_states.pt. 0: [2022-11-25 14:33:06,513] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_06-model_01-model_states.pt... 32: [2022-11-25 14:33:06,513] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_28-model_00-model_states.pt... 0: [2022-11-25 14:33:06,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_05-model_00-model_states.pt. 0: [2022-11-25 14:33:06,524] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_06-model_00-model_states.pt... 32: [2022-11-25 14:33:06,535] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_27-model_01-model_states.pt. 32: [2022-11-25 14:33:06,535] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_28-model_01-model_states.pt... 0: [2022-11-25 14:33:06,766] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_06-model_01-model_states.pt. 0: [2022-11-25 14:33:06,766] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_07-model_01-model_states.pt... 0: [2022-11-25 14:33:06,768] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_06-model_00-model_states.pt. 0: [2022-11-25 14:33:06,768] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_07-model_00-model_states.pt... 32: [2022-11-25 14:33:06,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_28-model_00-model_states.pt. 32: [2022-11-25 14:33:06,779] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_29-model_00-model_states.pt... 32: [2022-11-25 14:33:06,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_28-model_01-model_states.pt. 32: [2022-11-25 14:33:06,799] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_29-model_01-model_states.pt... 32: [2022-11-25 14:33:07,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_29-model_01-model_states.pt. 0: [2022-11-25 14:33:07,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_07-model_01-model_states.pt. 32: [2022-11-25 14:33:07,041] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_29-model_00-model_states.pt. 32: [2022-11-25 14:33:07,041] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_30-model_00-model_states.pt... 0: [2022-11-25 14:33:07,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_08-model_01-model_states.pt... 32: [2022-11-25 14:33:07,042] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_30-model_01-model_states.pt... 0: [2022-11-25 14:33:07,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_07-model_00-model_states.pt. 0: [2022-11-25 14:33:07,066] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_08-model_00-model_states.pt... 32: [2022-11-25 14:33:07,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_30-model_00-model_states.pt. 32: [2022-11-25 14:33:07,294] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_31-model_00-model_states.pt... 0: [2022-11-25 14:33:07,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_08-model_01-model_states.pt. 0: [2022-11-25 14:33:07,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_08-model_00-model_states.pt. 0: [2022-11-25 14:33:07,314] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_09-model_00-model_states.pt... 0: [2022-11-25 14:33:07,314] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_09-model_01-model_states.pt... 32: [2022-11-25 14:33:07,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_30-model_01-model_states.pt. 32: [2022-11-25 14:33:07,315] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_31-model_01-model_states.pt... 32: [2022-11-25 14:33:07,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_31-model_01-model_states.pt. 32: [2022-11-25 14:33:07,551] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_32-model_01-model_states.pt... 32: [2022-11-25 14:33:07,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_31-model_00-model_states.pt. 32: [2022-11-25 14:33:07,554] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_32-model_00-model_states.pt... 0: [2022-11-25 14:33:07,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_09-model_01-model_states.pt. 0: [2022-11-25 14:33:07,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_09-model_00-model_states.pt. 0: [2022-11-25 14:33:07,575] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_10-model_01-model_states.pt... 0: [2022-11-25 14:33:07,575] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_10-model_00-model_states.pt... 32: [2022-11-25 14:33:07,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_32-model_00-model_states.pt. 32: [2022-11-25 14:33:07,808] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_33-model_00-model_states.pt... 32: [2022-11-25 14:33:07,813] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_32-model_01-model_states.pt. 32: [2022-11-25 14:33:07,814] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_33-model_01-model_states.pt... 0: [2022-11-25 14:33:07,829] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_10-model_00-model_states.pt. 0: [2022-11-25 14:33:07,830] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_11-model_00-model_states.pt... 0: [2022-11-25 14:33:07,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_10-model_01-model_states.pt. 0: [2022-11-25 14:33:07,840] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_11-model_01-model_states.pt... 32: [2022-11-25 14:33:08,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_33-model_01-model_states.pt. 32: [2022-11-25 14:33:08,084] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_34-model_01-model_states.pt... 0: [2022-11-25 14:33:08,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_11-model_00-model_states.pt. 0: [2022-11-25 14:33:08,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_12-model_00-model_states.pt... 0: [2022-11-25 14:33:08,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_11-model_01-model_states.pt. 0: [2022-11-25 14:33:08,092] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_12-model_01-model_states.pt... 32: [2022-11-25 14:33:08,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_33-model_00-model_states.pt. 32: [2022-11-25 14:33:08,097] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_34-model_00-model_states.pt... 32: [2022-11-25 14:33:08,334] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_34-model_00-model_states.pt. 32: [2022-11-25 14:33:08,334] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_35-model_00-model_states.pt... 0: [2022-11-25 14:33:08,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_12-model_00-model_states.pt. 0: [2022-11-25 14:33:08,337] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_13-model_00-model_states.pt... 0: [2022-11-25 14:33:08,341] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_12-model_01-model_states.pt. 0: [2022-11-25 14:33:08,341] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_13-model_01-model_states.pt... 32: [2022-11-25 14:33:08,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_34-model_01-model_states.pt. 32: [2022-11-25 14:33:08,345] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_35-model_01-model_states.pt... 32: [2022-11-25 14:33:08,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_35-model_00-model_states.pt. 32: [2022-11-25 14:33:08,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_35-model_01-model_states.pt. 32: [2022-11-25 14:33:08,575] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_36-model_00-model_states.pt... 32: [2022-11-25 14:33:08,575] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_36-model_01-model_states.pt... 0: [2022-11-25 14:33:08,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_13-model_01-model_states.pt. 0: [2022-11-25 14:33:08,580] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_14-model_01-model_states.pt... 0: [2022-11-25 14:33:08,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_13-model_00-model_states.pt. 0: [2022-11-25 14:33:08,583] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_14-model_00-model_states.pt... 32: [2022-11-25 14:33:08,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_36-model_01-model_states.pt. 32: [2022-11-25 14:33:08,835] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_37-model_01-model_states.pt... 0: [2022-11-25 14:33:08,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_14-model_01-model_states.pt. 0: [2022-11-25 14:33:08,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_14-model_00-model_states.pt. 0: [2022-11-25 14:33:08,844] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_15-model_00-model_states.pt... 0: [2022-11-25 14:33:08,844] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_15-model_01-model_states.pt... 32: [2022-11-25 14:33:08,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_36-model_00-model_states.pt. 32: [2022-11-25 14:33:08,846] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_37-model_00-model_states.pt... 32: [2022-11-25 14:33:09,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_37-model_01-model_states.pt. 32: [2022-11-25 14:33:09,073] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_38-model_01-model_states.pt... 32: [2022-11-25 14:33:09,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_37-model_00-model_states.pt. 32: [2022-11-25 14:33:09,075] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_38-model_00-model_states.pt... 0: [2022-11-25 14:33:09,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_15-model_01-model_states.pt. 0: [2022-11-25 14:33:09,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_16-model_01-model_states.pt... 0: [2022-11-25 14:33:09,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_15-model_00-model_states.pt. 0: [2022-11-25 14:33:09,108] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_16-model_00-model_states.pt... 32: [2022-11-25 14:33:09,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_38-model_00-model_states.pt. 32: [2022-11-25 14:33:09,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_38-model_01-model_states.pt. 32: [2022-11-25 14:33:09,326] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_39-model_00-model_states.pt... 32: [2022-11-25 14:33:09,326] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_39-model_01-model_states.pt... 0: [2022-11-25 14:33:09,342] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_16-model_00-model_states.pt. 0: [2022-11-25 14:33:09,343] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_17-model_00-model_states.pt... 0: [2022-11-25 14:33:09,350] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_16-model_01-model_states.pt. 0: [2022-11-25 14:33:09,350] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_17-model_01-model_states.pt... 32: [2022-11-25 14:33:09,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_39-model_01-model_states.pt. 32: [2022-11-25 14:33:09,569] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_40-model_01-model_states.pt... 32: [2022-11-25 14:33:09,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_39-model_00-model_states.pt. 32: [2022-11-25 14:33:09,581] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_40-model_00-model_states.pt... 0: [2022-11-25 14:33:09,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_17-model_01-model_states.pt. 0: [2022-11-25 14:33:09,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_17-model_00-model_states.pt. 0: [2022-11-25 14:33:09,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_18-model_01-model_states.pt... 0: [2022-11-25 14:33:09,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_18-model_00-model_states.pt... 32: [2022-11-25 14:33:09,821] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_40-model_00-model_states.pt. 32: [2022-11-25 14:33:09,822] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_41-model_00-model_states.pt... 32: [2022-11-25 14:33:09,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_40-model_01-model_states.pt. 32: [2022-11-25 14:33:09,831] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_41-model_01-model_states.pt... 0: [2022-11-25 14:33:09,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_18-model_00-model_states.pt. 0: [2022-11-25 14:33:09,853] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_19-model_00-model_states.pt... 0: [2022-11-25 14:33:09,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_18-model_01-model_states.pt. 0: [2022-11-25 14:33:09,874] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_19-model_01-model_states.pt... 32: [2022-11-25 14:33:10,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_41-model_01-model_states.pt. 32: [2022-11-25 14:33:10,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_41-model_00-model_states.pt. 32: [2022-11-25 14:33:10,076] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_42-model_01-model_states.pt... 32: [2022-11-25 14:33:10,076] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_42-model_00-model_states.pt... 0: [2022-11-25 14:33:10,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_19-model_00-model_states.pt. 0: [2022-11-25 14:33:10,085] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_20-model_00-model_states.pt... 0: [2022-11-25 14:33:10,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_19-model_01-model_states.pt. 0: [2022-11-25 14:33:10,096] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_20-model_01-model_states.pt... 32: [2022-11-25 14:33:10,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_42-model_00-model_states.pt. 32: [2022-11-25 14:33:10,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_42-model_01-model_states.pt. 32: [2022-11-25 14:33:10,326] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_43-model_00-model_states.pt... 32: [2022-11-25 14:33:10,326] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_43-model_01-model_states.pt... 0: [2022-11-25 14:33:10,340] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_20-model_00-model_states.pt. 0: [2022-11-25 14:33:10,340] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_21-model_00-model_states.pt... 0: [2022-11-25 14:33:10,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_20-model_01-model_states.pt. 0: [2022-11-25 14:33:10,348] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_21-model_01-model_states.pt... 32: [2022-11-25 14:33:10,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_43-model_00-model_states.pt. 32: [2022-11-25 14:33:10,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_43-model_01-model_states.pt. 32: [2022-11-25 14:33:10,580] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_44-model_01-model_states.pt... 32: [2022-11-25 14:33:10,580] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_44-model_00-model_states.pt... 0: [2022-11-25 14:33:10,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_21-model_00-model_states.pt. 0: [2022-11-25 14:33:10,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_21-model_01-model_states.pt. 0: [2022-11-25 14:33:10,580] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_22-model_00-model_states.pt... 0: [2022-11-25 14:33:10,581] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_22-model_01-model_states.pt... 32: [2022-11-25 14:33:10,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_44-model_00-model_states.pt. 32: [2022-11-25 14:33:10,828] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_46-model_00-model_states.pt... 32: [2022-11-25 14:33:10,840] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_44-model_01-model_states.pt. 32: [2022-11-25 14:33:10,841] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_46-model_01-model_states.pt... 0: [2022-11-25 14:33:10,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_22-model_01-model_states.pt. 0: [2022-11-25 14:33:10,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_23-model_01-model_states.pt... 0: [2022-11-25 14:33:10,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_22-model_00-model_states.pt. 0: [2022-11-25 14:33:10,844] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/layer_23-model_00-model_states.pt... 32: [2022-11-25 14:33:10,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_46-model_00-model_states.pt. 32: [2022-11-25 14:33:10,848] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/mp_rank_02_model_states.pt... 32: [2022-11-25 14:33:10,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_46-model_01-model_states.pt. 32: [2022-11-25 14:33:10,851] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/mp_rank_03_model_states.pt... 32: [2022-11-25 14:33:10,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/mp_rank_02_model_states.pt. 32: [2022-11-25 14:33:10,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/mp_rank_03_model_states.pt. 0: [2022-11-25 14:33:11,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_23-model_00-model_states.pt. 0: [2022-11-25 14:33:11,083] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_8b7/global_step3000/mp_rank_00_model_states.pt 0: [2022-11-25 14:33:11,083] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/mp_rank_00_model_states.pt... 0: [2022-11-25 14:33:11,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/layer_23-model_01-model_states.pt. 0: [2022-11-25 14:33:11,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/mp_rank_00_model_states.pt. 0: [2022-11-25 14:33:11,104] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: checkpoints_8b7/global_step3000/mp_rank_01_model_states.pt 0: [2022-11-25 14:33:11,104] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/mp_rank_01_model_states.pt... 0: [2022-11-25 14:33:11,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/mp_rank_01_model_states.pt. 37: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt... 37: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt... 37: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt... 37: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt... 58: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt... 58: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt... 58: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt... 58: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt... 40: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt... 40: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt... 40: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt... 40: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt... 60: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt... 60: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt... 60: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt... 60: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt... 52: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt... 52: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt... 52: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt... 52: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt... 30: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 30: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 4: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 4: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 4: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 56: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt... 56: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt... 56: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt... 54: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt... 54: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt... 54: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt... 54: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt... 54: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt... 62: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt... 62: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt... 62: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt... 62: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt... 36: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt... 36: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt... 36: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt... 36: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt... 28: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 38: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt... 38: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt... 38: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt... 0: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 2: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 24: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 49: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt... 49: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt... 49: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt... 49: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt... 49: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt... 49: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt... 47: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt... 47: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt... 47: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt... 41: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt... 41: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt... 41: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt... 41: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt... 41: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt... 41: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt... 45: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt... 45: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt... 45: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt... 45: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt... 9: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 9: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 9: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 27: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 25: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 25: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 3: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 3: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 7: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 7: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 7: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 17: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 23: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 11: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 11: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 31: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 31: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 19: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 19: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 19: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 19: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 34: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt... 34: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt... 34: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt... 46: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt... 46: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt... 46: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt... 46: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt... 44: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt... 44: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt... 44: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt... 6: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 6: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 6: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 6: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 16: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 16: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 18: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 10: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 10: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 10: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 20: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 20: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 20: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 20: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 8: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 8: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 48: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt... 48: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt... 48: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt... 48: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt... 48: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt... 50: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt... 50: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt... 50: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt... 50: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt... 50: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt... 42: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt... 42: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt... 42: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt... 32: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt... 12: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 12: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 12: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 12: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 12: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 26: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 14: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 14: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 15: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 15: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 22: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 22: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 22: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 21: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 57: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt... 57: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt... 57: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt... 57: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt... 63: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt... 63: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt... 63: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt... 63: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt... 39: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt... 39: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt... 39: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt... 39: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt... 53: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt... 53: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt... 53: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt... 55: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt... 55: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt... 55: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt... 55: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt... 55: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt... 55: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt... 51: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt... 51: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt... 51: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt... 33: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt... 33: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt... 33: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt... 33: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt... 1: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 1: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 1: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 1: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 1: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 1: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 61: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt... 61: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt... 61: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt... 61: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt... 61: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt... 59: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt... 59: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt... 59: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt... 59: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt... 13: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 35: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt... 35: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt... 35: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt... 35: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt... 35: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt... 35: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt... 5: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 29: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 29: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 29: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 37: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt... 37: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt... 37: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt... 58: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt... 58: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt... 58: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt... 58: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt... 40: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt... 40: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt... 40: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt... 40: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt... 60: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt... 60: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt... 60: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt... 60: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt... 52: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt... 52: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt... 30: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 30: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 30: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 30: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 4: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 4: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 56: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt... 56: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt... 56: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt... 56: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt... 56: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt... 54: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt... 54: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt... 54: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt... 62: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt... 62: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt... 62: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt... 62: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt... 36: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt... 36: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt... 36: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt... 36: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt... 28: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 28: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 38: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt... 38: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt... 38: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt... 38: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt... 0: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 2: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 2: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 24: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 24: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 49: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt... 47: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt... 47: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt... 47: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt... 47: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt... 41: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt... 41: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt... 45: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt... 45: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt... 45: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt... 45: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt... 9: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 43: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt... 43: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt... 27: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 27: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 25: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 25: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 3: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 3: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 3: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 7: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 7: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 17: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 17: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 23: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 23: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 11: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 11: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 11: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 11: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 11: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 31: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 19: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 34: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt... 34: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt... 34: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt... 46: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt... 46: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt... 46: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt... 44: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt... 44: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt... 44: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt... 44: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt... 6: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 6: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 6: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 16: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 18: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 18: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 18: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 18: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 10: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 10: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 10: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 10: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 20: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 20: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 8: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 8: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 8: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 48: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt... 48: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt... 50: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt... 50: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt... 50: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt... 42: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt... 42: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt... 42: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt... 32: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt... 12: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 26: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 26: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 14: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 14: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 14: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 14: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 15: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 15: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 15: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 15: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 22: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 21: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 57: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt... 57: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt... 57: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt... 57: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt... 63: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt... 63: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt... 63: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt... 63: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt... 39: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt... 39: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt... 39: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt... 53: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt... 53: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt... 53: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt... 53: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt... 53: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt... 55: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt... 55: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt... 51: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt... 51: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt... 51: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt... 51: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt... 33: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt... 33: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt... 1: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 61: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt... 61: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt... 61: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt... 59: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt... 59: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt... 59: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt... 13: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 35: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt... 35: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt... 5: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 29: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 29: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 29: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 29: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 37: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt... 52: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt... 30: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 30: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 4: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 4: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 28: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 28: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 28: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 28: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 38: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt... 0: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 2: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 2: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 2: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 2: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 24: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 24: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 24: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 49: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt... 47: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt... 9: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 43: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt... 43: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt... 43: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt... 43: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt... 43: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt... 43: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt... 27: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 27: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 25: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 25: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 25: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 3: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 3: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 7: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 7: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 17: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 17: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 23: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 31: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 31: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 19: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 34: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt... 46: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt... 44: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt... 6: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 16: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 16: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 16: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 18: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 10: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 20: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 8: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 48: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt... 42: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt... 32: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt... 32: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt... 32: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt... 32: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt... 32: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt... 12: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 26: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 14: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 15: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 15: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 22: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 22: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 21: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 39: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt... 51: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt... 33: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt... 33: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt... 1: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 59: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt... 13: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 5: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 5: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 29: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 52: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt... 4: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 28: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 0: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 2: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 24: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 24: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 9: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 9: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 27: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 27: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 25: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 3: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 7: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 17: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 23: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 31: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 19: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 34: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt... 16: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 18: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 20: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 42: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt... 32: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt... 12: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 26: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 26: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 14: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 21: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 21: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 13: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 13: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 5: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 0: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 0: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 9: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 27: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 17: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 23: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 23: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 31: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 19: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 16: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 18: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 8: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 26: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 26: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 22: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 21: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 13: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 13: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 5: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 5: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 0: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 17: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 23: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 31: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 22: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 21: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 13: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 5: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 0: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 21: [2022-11-25 14:33:11,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step3000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 32: [2022-11-25 14:33:11,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt. 32: [2022-11-25 14:33:11,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt 32: [2022-11-25 14:33:11,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 14:33:11,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 0: [2022-11-25 14:33:11,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-25 14:33:11,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-25 14:33:11,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 14:33:11,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 26: [2022-11-25 14:33:11,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 26: [2022-11-25 14:33:11,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 26: [2022-11-25 14:33:11,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 32: [2022-11-25 14:33:11,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt. 32: [2022-11-25 14:33:11,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt. 32: [2022-11-25 14:33:11,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt 32: [2022-11-25 14:33:11,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt 32: [2022-11-25 14:33:11,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 32: [2022-11-25 14:33:11,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 14:33:11,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 19: [2022-11-25 14:33:11,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 30: [2022-11-25 14:33:11,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 30: [2022-11-25 14:33:11,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 25: [2022-11-25 14:33:11,545] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 30: [2022-11-25 14:33:11,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 6: [2022-11-25 14:33:11,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 6: [2022-11-25 14:33:11,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 6: [2022-11-25 14:33:11,546] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 60: [2022-11-25 14:33:11,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt. 60: [2022-11-25 14:33:11,549] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt 60: [2022-11-25 14:33:11,549] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 6: [2022-11-25 14:33:11,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 6: [2022-11-25 14:33:11,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 6: [2022-11-25 14:33:11,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 6: [2022-11-25 14:33:11,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 6: [2022-11-25 14:33:11,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 0: [2022-11-25 14:33:11,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 6: [2022-11-25 14:33:11,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 14:33:11,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 0: [2022-11-25 14:33:11,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 25: [2022-11-25 14:33:11,545] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 25: [2022-11-25 14:33:11,545] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 6: [2022-11-25 14:33:11,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 6: [2022-11-25 14:33:11,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 6: [2022-11-25 14:33:11,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 1: [2022-11-25 14:33:11,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 1: [2022-11-25 14:33:11,553] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 1: [2022-11-25 14:33:11,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 1: [2022-11-25 14:33:11,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 1: [2022-11-25 14:33:11,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 1: [2022-11-25 14:33:11,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 30: [2022-11-25 14:33:11,555] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 30: [2022-11-25 14:33:11,555] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 30: [2022-11-25 14:33:11,555] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 60: [2022-11-25 14:33:11,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt. 60: [2022-11-25 14:33:11,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt 60: [2022-11-25 14:33:11,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 1: [2022-11-25 14:33:11,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 1: [2022-11-25 14:33:11,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 1: [2022-11-25 14:33:11,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 60: [2022-11-25 14:33:11,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt. 60: [2022-11-25 14:33:11,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt 60: [2022-11-25 14:33:11,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 26: [2022-11-25 14:33:11,557] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 26: [2022-11-25 14:33:11,557] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 26: [2022-11-25 14:33:11,557] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 14:33:11,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-25 14:33:11,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-25 14:33:11,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 14:33:11,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 0: [2022-11-25 14:33:11,559] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 0: [2022-11-25 14:33:11,559] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 20: [2022-11-25 14:33:11,559] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 20: [2022-11-25 14:33:11,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 20: [2022-11-25 14:33:11,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 20: [2022-11-25 14:33:11,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 20: [2022-11-25 14:33:11,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 20: [2022-11-25 14:33:11,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 57: [2022-11-25 14:33:11,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt. 57: [2022-11-25 14:33:11,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt 57: [2022-11-25 14:33:11,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 10: [2022-11-25 14:33:11,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 10: [2022-11-25 14:33:11,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 57: [2022-11-25 14:33:11,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt. 10: [2022-11-25 14:33:11,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 57: [2022-11-25 14:33:11,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt 57: [2022-11-25 14:33:11,561] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 57: [2022-11-25 14:33:11,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt. 57: [2022-11-25 14:33:11,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt 57: [2022-11-25 14:33:11,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 10: [2022-11-25 14:33:11,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 10: [2022-11-25 14:33:11,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 10: [2022-11-25 14:33:11,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 60: [2022-11-25 14:33:11,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt. 60: [2022-11-25 14:33:11,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt 60: [2022-11-25 14:33:11,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 1: [2022-11-25 14:33:11,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 1: [2022-11-25 14:33:11,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 1: [2022-11-25 14:33:11,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 14:33:11,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 32: [2022-11-25 14:33:11,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt. 32: [2022-11-25 14:33:11,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt 32: [2022-11-25 14:33:11,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 25: [2022-11-25 14:33:11,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 60: [2022-11-25 14:33:11,572] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt. 60: [2022-11-25 14:33:11,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt 60: [2022-11-25 14:33:11,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 60: [2022-11-25 14:33:11,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt. 60: [2022-11-25 14:33:11,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt 60: [2022-11-25 14:33:11,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 14:33:11,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 19: [2022-11-25 14:33:11,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 14:33:11,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 19: [2022-11-25 14:33:11,574] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 19: [2022-11-25 14:33:11,574] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 30: [2022-11-25 14:33:11,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 30: [2022-11-25 14:33:11,574] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 30: [2022-11-25 14:33:11,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 30: [2022-11-25 14:33:11,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 30: [2022-11-25 14:33:11,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 30: [2022-11-25 14:33:11,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 32: [2022-11-25 14:33:11,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt. 32: [2022-11-25 14:33:11,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt 32: [2022-11-25 14:33:11,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 14:33:11,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-25 14:33:11,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-25 14:33:11,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 28: [2022-11-25 14:33:11,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 28: [2022-11-25 14:33:11,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 28: [2022-11-25 14:33:11,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 28: [2022-11-25 14:33:11,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 28: [2022-11-25 14:33:11,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 28: [2022-11-25 14:33:11,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 28: [2022-11-25 14:33:11,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 28: [2022-11-25 14:33:11,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 28: [2022-11-25 14:33:11,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 28: [2022-11-25 14:33:11,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 28: [2022-11-25 14:33:11,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 28: [2022-11-25 14:33:11,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 14:33:11,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 3: [2022-11-25 14:33:11,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 3: [2022-11-25 14:33:11,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 3: [2022-11-25 14:33:11,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 3: [2022-11-25 14:33:11,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 14:33:11,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 57: [2022-11-25 14:33:11,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt. 57: [2022-11-25 14:33:11,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt 57: [2022-11-25 14:33:11,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 57: [2022-11-25 14:33:11,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt. 57: [2022-11-25 14:33:11,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt 57: [2022-11-25 14:33:11,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 20: [2022-11-25 14:33:11,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 20: [2022-11-25 14:33:11,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 20: [2022-11-25 14:33:11,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 20: [2022-11-25 14:33:11,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 20: [2022-11-25 14:33:11,587] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 20: [2022-11-25 14:33:11,587] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 10: [2022-11-25 14:33:11,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 10: [2022-11-25 14:33:11,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 10: [2022-11-25 14:33:11,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 10: [2022-11-25 14:33:11,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 10: [2022-11-25 14:33:11,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 10: [2022-11-25 14:33:11,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 32: [2022-11-25 14:33:11,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt. 32: [2022-11-25 14:33:11,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt 32: [2022-11-25 14:33:11,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 14:33:11,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 3: [2022-11-25 14:33:11,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 3: [2022-11-25 14:33:11,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 3: [2022-11-25 14:33:11,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 3: [2022-11-25 14:33:11,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 14:33:11,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 28: [2022-11-25 14:33:11,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 28: [2022-11-25 14:33:11,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 28: [2022-11-25 14:33:11,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 4: [2022-11-25 14:33:11,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 4: [2022-11-25 14:33:11,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 4: [2022-11-25 14:33:11,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 4: [2022-11-25 14:33:11,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 4: [2022-11-25 14:33:11,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 4: [2022-11-25 14:33:11,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 26: [2022-11-25 14:33:11,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 26: [2022-11-25 14:33:11,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 26: [2022-11-25 14:33:11,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 4: [2022-11-25 14:33:11,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 4: [2022-11-25 14:33:11,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 4: [2022-11-25 14:33:11,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 4: [2022-11-25 14:33:11,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 4: [2022-11-25 14:33:11,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 4: [2022-11-25 14:33:11,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 1: [2022-11-25 14:33:11,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 1: [2022-11-25 14:33:11,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 1: [2022-11-25 14:33:11,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 14:33:11,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 3: [2022-11-25 14:33:11,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 3: [2022-11-25 14:33:11,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 25: [2022-11-25 14:33:11,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 25: [2022-11-25 14:33:11,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 25: [2022-11-25 14:33:11,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 25: [2022-11-25 14:33:11,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 25: [2022-11-25 14:33:11,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 1: [2022-11-25 14:33:11,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 1: [2022-11-25 14:33:11,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 1: [2022-11-25 14:33:11,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 54: [2022-11-25 14:33:11,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt. 54: [2022-11-25 14:33:11,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt. 54: [2022-11-25 14:33:11,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt 54: [2022-11-25 14:33:11,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt 54: [2022-11-25 14:33:11,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 54: [2022-11-25 14:33:11,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 29: [2022-11-25 14:33:11,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 29: [2022-11-25 14:33:11,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 29: [2022-11-25 14:33:11,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 29: [2022-11-25 14:33:11,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 29: [2022-11-25 14:33:11,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 29: [2022-11-25 14:33:11,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 29: [2022-11-25 14:33:11,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 29: [2022-11-25 14:33:11,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 29: [2022-11-25 14:33:11,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 30: [2022-11-25 14:33:11,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 26: [2022-11-25 14:33:11,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 30: [2022-11-25 14:33:11,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 30: [2022-11-25 14:33:11,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 26: [2022-11-25 14:33:11,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 26: [2022-11-25 14:33:11,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 4: [2022-11-25 14:33:11,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 4: [2022-11-25 14:33:11,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 4: [2022-11-25 14:33:11,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 5: [2022-11-25 14:33:11,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 5: [2022-11-25 14:33:11,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 5: [2022-11-25 14:33:11,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 5: [2022-11-25 14:33:11,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 5: [2022-11-25 14:33:11,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 5: [2022-11-25 14:33:11,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 27: [2022-11-25 14:33:11,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 27: [2022-11-25 14:33:11,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 27: [2022-11-25 14:33:11,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 27: [2022-11-25 14:33:11,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 27: [2022-11-25 14:33:11,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 27: [2022-11-25 14:33:11,629] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 27: [2022-11-25 14:33:11,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 27: [2022-11-25 14:33:11,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 27: [2022-11-25 14:33:11,629] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 57: [2022-11-25 14:33:11,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt. 57: [2022-11-25 14:33:11,630] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt 57: [2022-11-25 14:33:11,630] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 14:33:11,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 5: [2022-11-25 14:33:11,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 5: [2022-11-25 14:33:11,633] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 5: [2022-11-25 14:33:11,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 47: [2022-11-25 14:33:11,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt. 47: [2022-11-25 14:33:11,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt. 47: [2022-11-25 14:33:11,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt 47: [2022-11-25 14:33:11,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt 47: [2022-11-25 14:33:11,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 47: [2022-11-25 14:33:11,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 5: [2022-11-25 14:33:11,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 5: [2022-11-25 14:33:11,634] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 5: [2022-11-25 14:33:11,634] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 10: [2022-11-25 14:33:11,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 28: [2022-11-25 14:33:11,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 10: [2022-11-25 14:33:11,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 28: [2022-11-25 14:33:11,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 10: [2022-11-25 14:33:11,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 28: [2022-11-25 14:33:11,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 14:33:11,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 19: [2022-11-25 14:33:11,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 27: [2022-11-25 14:33:11,638] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 27: [2022-11-25 14:33:11,638] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 27: [2022-11-25 14:33:11,638] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 26: [2022-11-25 14:33:11,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 26: [2022-11-25 14:33:11,640] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 26: [2022-11-25 14:33:11,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 25: [2022-11-25 14:33:11,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 31: [2022-11-25 14:33:11,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 31: [2022-11-25 14:33:11,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 31: [2022-11-25 14:33:11,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 31: [2022-11-25 14:33:11,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 31: [2022-11-25 14:33:11,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 31: [2022-11-25 14:33:11,642] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 31: [2022-11-25 14:33:11,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 31: [2022-11-25 14:33:11,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 31: [2022-11-25 14:33:11,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 31: [2022-11-25 14:33:11,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 31: [2022-11-25 14:33:11,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 31: [2022-11-25 14:33:11,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 31: [2022-11-25 14:33:11,642] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 31: [2022-11-25 14:33:11,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 31: [2022-11-25 14:33:11,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 31: [2022-11-25 14:33:11,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 31: [2022-11-25 14:33:11,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 31: [2022-11-25 14:33:11,642] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 25: [2022-11-25 14:33:11,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 25: [2022-11-25 14:33:11,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 27: [2022-11-25 14:33:11,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 27: [2022-11-25 14:33:11,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 27: [2022-11-25 14:33:11,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 27: [2022-11-25 14:33:11,648] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 27: [2022-11-25 14:33:11,648] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 27: [2022-11-25 14:33:11,648] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 10: [2022-11-25 14:33:11,649] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 10: [2022-11-25 14:33:11,649] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 10: [2022-11-25 14:33:11,649] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 14:33:11,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 30: [2022-11-25 14:33:11,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 11: [2022-11-25 14:33:11,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 30: [2022-11-25 14:33:11,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 30: [2022-11-25 14:33:11,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 11: [2022-11-25 14:33:11,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 11: [2022-11-25 14:33:11,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 55: [2022-11-25 14:33:11,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt. 55: [2022-11-25 14:33:11,663] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt. 55: [2022-11-25 14:33:11,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt 55: [2022-11-25 14:33:11,663] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt 55: [2022-11-25 14:33:11,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 55: [2022-11-25 14:33:11,663] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 20: [2022-11-25 14:33:11,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 20: [2022-11-25 14:33:11,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 17: [2022-11-25 14:33:11,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 17: [2022-11-25 14:33:11,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 20: [2022-11-25 14:33:11,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 17: [2022-11-25 14:33:11,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 18: [2022-11-25 14:33:11,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 18: [2022-11-25 14:33:11,664] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 18: [2022-11-25 14:33:11,664] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 5: [2022-11-25 14:33:11,664] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 5: [2022-11-25 14:33:11,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 5: [2022-11-25 14:33:11,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 52: [2022-11-25 14:33:11,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt. 52: [2022-11-25 14:33:11,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt. 52: [2022-11-25 14:33:11,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt 52: [2022-11-25 14:33:11,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt 52: [2022-11-25 14:33:11,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 52: [2022-11-25 14:33:11,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 41: [2022-11-25 14:33:11,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt. 41: [2022-11-25 14:33:11,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt 41: [2022-11-25 14:33:11,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt. 41: [2022-11-25 14:33:11,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 41: [2022-11-25 14:33:11,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt 41: [2022-11-25 14:33:11,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 37: [2022-11-25 14:33:11,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt. 37: [2022-11-25 14:33:11,667] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt. 37: [2022-11-25 14:33:11,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt 37: [2022-11-25 14:33:11,667] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt 37: [2022-11-25 14:33:11,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 37: [2022-11-25 14:33:11,667] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 11: [2022-11-25 14:33:11,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 11: [2022-11-25 14:33:11,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 11: [2022-11-25 14:33:11,668] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 14:33:11,654] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 19: [2022-11-25 14:33:11,654] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 14:33:11,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 19: [2022-11-25 14:33:11,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 19: [2022-11-25 14:33:11,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 55: [2022-11-25 14:33:11,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt. 55: [2022-11-25 14:33:11,668] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt 55: [2022-11-25 14:33:11,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 55: [2022-11-25 14:33:11,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt. 55: [2022-11-25 14:33:11,668] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt. 55: [2022-11-25 14:33:11,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt 55: [2022-11-25 14:33:11,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt 55: [2022-11-25 14:33:11,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 55: [2022-11-25 14:33:11,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 14:33:11,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 21: [2022-11-25 14:33:11,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 21: [2022-11-25 14:33:11,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 21: [2022-11-25 14:33:11,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 21: [2022-11-25 14:33:11,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 14:33:11,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 63: [2022-11-25 14:33:11,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt. 63: [2022-11-25 14:33:11,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt. 63: [2022-11-25 14:33:11,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt 63: [2022-11-25 14:33:11,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt 63: [2022-11-25 14:33:11,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 63: [2022-11-25 14:33:11,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 17: [2022-11-25 14:33:11,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 17: [2022-11-25 14:33:11,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 17: [2022-11-25 14:33:11,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 12: [2022-11-25 14:33:11,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 12: [2022-11-25 14:33:11,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 12: [2022-11-25 14:33:11,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 12: [2022-11-25 14:33:11,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 12: [2022-11-25 14:33:11,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 12: [2022-11-25 14:33:11,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 5: [2022-11-25 14:33:11,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 8: [2022-11-25 14:33:11,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 8: [2022-11-25 14:33:11,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 5: [2022-11-25 14:33:11,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 8: [2022-11-25 14:33:11,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 8: [2022-11-25 14:33:11,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 8: [2022-11-25 14:33:11,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 8: [2022-11-25 14:33:11,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 5: [2022-11-25 14:33:11,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 40: [2022-11-25 14:33:11,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt. 40: [2022-11-25 14:33:11,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt. 40: [2022-11-25 14:33:11,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt 40: [2022-11-25 14:33:11,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt 40: [2022-11-25 14:33:11,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 40: [2022-11-25 14:33:11,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 14: [2022-11-25 14:33:11,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 14: [2022-11-25 14:33:11,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 14: [2022-11-25 14:33:11,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 14: [2022-11-25 14:33:11,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 14: [2022-11-25 14:33:11,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 14: [2022-11-25 14:33:11,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 20: [2022-11-25 14:33:11,673] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 7: [2022-11-25 14:33:11,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 20: [2022-11-25 14:33:11,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 20: [2022-11-25 14:33:11,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 7: [2022-11-25 14:33:11,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 7: [2022-11-25 14:33:11,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 18: [2022-11-25 14:33:11,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 18: [2022-11-25 14:33:11,675] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 18: [2022-11-25 14:33:11,675] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 36: [2022-11-25 14:33:11,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt. 36: [2022-11-25 14:33:11,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt 34: [2022-11-25 14:33:11,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt. 36: [2022-11-25 14:33:11,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt. 36: [2022-11-25 14:33:11,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 14:33:11,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt 36: [2022-11-25 14:33:11,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt 36: [2022-11-25 14:33:11,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 14:33:11,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 14:33:11,676] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt. 34: [2022-11-25 14:33:11,676] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt 34: [2022-11-25 14:33:11,676] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 14:33:11,677] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 3: [2022-11-25 14:33:11,677] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 3: [2022-11-25 14:33:11,677] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 14:33:11,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt. 48: [2022-11-25 14:33:11,678] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt. 48: [2022-11-25 14:33:11,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt 48: [2022-11-25 14:33:11,678] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt 48: [2022-11-25 14:33:11,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 14:33:11,678] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 15: [2022-11-25 14:33:11,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 15: [2022-11-25 14:33:11,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 15: [2022-11-25 14:33:11,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 15: [2022-11-25 14:33:11,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 15: [2022-11-25 14:33:11,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 15: [2022-11-25 14:33:11,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 55: [2022-11-25 14:33:11,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt. 55: [2022-11-25 14:33:11,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt 55: [2022-11-25 14:33:11,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 13: [2022-11-25 14:33:11,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 13: [2022-11-25 14:33:11,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 24: [2022-11-25 14:33:11,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 24: [2022-11-25 14:33:11,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 24: [2022-11-25 14:33:11,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 24: [2022-11-25 14:33:11,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 24: [2022-11-25 14:33:11,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 24: [2022-11-25 14:33:11,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 13: [2022-11-25 14:33:11,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 13: [2022-11-25 14:33:11,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 24: [2022-11-25 14:33:11,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 24: [2022-11-25 14:33:11,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 24: [2022-11-25 14:33:11,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 24: [2022-11-25 14:33:11,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 24: [2022-11-25 14:33:11,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 13: [2022-11-25 14:33:11,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 13: [2022-11-25 14:33:11,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 24: [2022-11-25 14:33:11,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 24: [2022-11-25 14:33:11,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 24: [2022-11-25 14:33:11,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 24: [2022-11-25 14:33:11,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 24: [2022-11-25 14:33:11,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 24: [2022-11-25 14:33:11,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 24: [2022-11-25 14:33:11,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 7: [2022-11-25 14:33:11,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 7: [2022-11-25 14:33:11,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 7: [2022-11-25 14:33:11,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 59: [2022-11-25 14:33:11,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt. 59: [2022-11-25 14:33:11,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt. 59: [2022-11-25 14:33:11,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt 59: [2022-11-25 14:33:11,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt 59: [2022-11-25 14:33:11,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 59: [2022-11-25 14:33:11,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 9: [2022-11-25 14:33:11,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 9: [2022-11-25 14:33:11,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 9: [2022-11-25 14:33:11,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 9: [2022-11-25 14:33:11,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 9: [2022-11-25 14:33:11,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 9: [2022-11-25 14:33:11,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 45: [2022-11-25 14:33:11,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt. 45: [2022-11-25 14:33:11,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt. 45: [2022-11-25 14:33:11,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt 45: [2022-11-25 14:33:11,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt 45: [2022-11-25 14:33:11,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 45: [2022-11-25 14:33:11,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 26: [2022-11-25 14:33:11,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 26: [2022-11-25 14:33:11,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 50: [2022-11-25 14:33:11,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt. 50: [2022-11-25 14:33:11,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt. 26: [2022-11-25 14:33:11,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 50: [2022-11-25 14:33:11,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt 50: [2022-11-25 14:33:11,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt 50: [2022-11-25 14:33:11,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 50: [2022-11-25 14:33:11,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 22: [2022-11-25 14:33:11,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 22: [2022-11-25 14:33:11,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 22: [2022-11-25 14:33:11,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 22: [2022-11-25 14:33:11,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 22: [2022-11-25 14:33:11,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 22: [2022-11-25 14:33:11,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 51: [2022-11-25 14:33:11,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt. 51: [2022-11-25 14:33:11,692] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt. 51: [2022-11-25 14:33:11,692] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt 51: [2022-11-25 14:33:11,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 51: [2022-11-25 14:33:11,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt 51: [2022-11-25 14:33:11,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 33: [2022-11-25 14:33:11,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt. 33: [2022-11-25 14:33:11,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt. 33: [2022-11-25 14:33:11,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt 33: [2022-11-25 14:33:11,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt 33: [2022-11-25 14:33:11,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 33: [2022-11-25 14:33:11,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 25: [2022-11-25 14:33:11,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 25: [2022-11-25 14:33:11,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 25: [2022-11-25 14:33:11,697] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 62: [2022-11-25 14:33:11,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt. 62: [2022-11-25 14:33:11,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt. 38: [2022-11-25 14:33:11,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt. 62: [2022-11-25 14:33:11,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt 62: [2022-11-25 14:33:11,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt 38: [2022-11-25 14:33:11,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt 62: [2022-11-25 14:33:11,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 62: [2022-11-25 14:33:11,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 38: [2022-11-25 14:33:11,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 44: [2022-11-25 14:33:11,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt. 44: [2022-11-25 14:33:11,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt 44: [2022-11-25 14:33:11,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt. 44: [2022-11-25 14:33:11,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 44: [2022-11-25 14:33:11,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt 44: [2022-11-25 14:33:11,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 23: [2022-11-25 14:33:11,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 23: [2022-11-25 14:33:11,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 23: [2022-11-25 14:33:11,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 23: [2022-11-25 14:33:11,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 23: [2022-11-25 14:33:11,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 23: [2022-11-25 14:33:11,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 38: [2022-11-25 14:33:11,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt. 38: [2022-11-25 14:33:11,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt 38: [2022-11-25 14:33:11,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 14:33:11,722] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-25 14:33:11,722] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 55: [2022-11-25 14:33:11,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt. 55: [2022-11-25 14:33:11,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt 55: [2022-11-25 14:33:11,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 61: [2022-11-25 14:33:11,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt. 61: [2022-11-25 14:33:11,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt. 61: [2022-11-25 14:33:11,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt. 61: [2022-11-25 14:33:11,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt. 61: [2022-11-25 14:33:11,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt 61: [2022-11-25 14:33:11,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt. 61: [2022-11-25 14:33:11,730] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt. 61: [2022-11-25 14:33:11,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 61: [2022-11-25 14:33:11,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt 61: [2022-11-25 14:33:11,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt 61: [2022-11-25 14:33:11,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt 61: [2022-11-25 14:33:11,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt 61: [2022-11-25 14:33:11,730] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt 61: [2022-11-25 14:33:11,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 61: [2022-11-25 14:33:11,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 61: [2022-11-25 14:33:11,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 61: [2022-11-25 14:33:11,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 61: [2022-11-25 14:33:11,730] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 46: [2022-11-25 14:33:11,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt. 46: [2022-11-25 14:33:11,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt. 46: [2022-11-25 14:33:11,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt. 46: [2022-11-25 14:33:11,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt. 46: [2022-11-25 14:33:11,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt. 46: [2022-11-25 14:33:11,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt 46: [2022-11-25 14:33:11,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt 46: [2022-11-25 14:33:11,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt 46: [2022-11-25 14:33:11,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 46: [2022-11-25 14:33:11,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt 46: [2022-11-25 14:33:11,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt 46: [2022-11-25 14:33:11,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 46: [2022-11-25 14:33:11,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 46: [2022-11-25 14:33:11,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 46: [2022-11-25 14:33:11,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 46: [2022-11-25 14:33:11,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt. 46: [2022-11-25 14:33:11,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt 46: [2022-11-25 14:33:11,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 25: [2022-11-25 14:33:11,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 25: [2022-11-25 14:33:11,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 25: [2022-11-25 14:33:11,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 6: [2022-11-25 14:33:11,735] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 6: [2022-11-25 14:33:11,735] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 6: [2022-11-25 14:33:11,735] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 39: [2022-11-25 14:33:11,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt. 39: [2022-11-25 14:33:11,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt. 39: [2022-11-25 14:33:11,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt. 39: [2022-11-25 14:33:11,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt. 39: [2022-11-25 14:33:11,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt 39: [2022-11-25 14:33:11,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt 39: [2022-11-25 14:33:11,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt 39: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 39: [2022-11-25 14:33:11,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt 39: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 39: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 39: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 2: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 2: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 2: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 2: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 2: [2022-11-25 14:33:11,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 2: [2022-11-25 14:33:11,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 39: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt. 2: [2022-11-25 14:33:11,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 2: [2022-11-25 14:33:11,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 39: [2022-11-25 14:33:11,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt. 2: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 2: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 6: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 39: [2022-11-25 14:33:11,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt 39: [2022-11-25 14:33:11,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt 2: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 39: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 39: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 2: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 2: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 6: [2022-11-25 14:33:11,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 2: [2022-11-25 14:33:11,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 6: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 2: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 2: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 2: [2022-11-25 14:33:11,738] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 2: [2022-11-25 14:33:11,738] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 49: [2022-11-25 14:33:11,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt. 49: [2022-11-25 14:33:11,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt. 49: [2022-11-25 14:33:11,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt. 49: [2022-11-25 14:33:11,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt 49: [2022-11-25 14:33:11,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt 49: [2022-11-25 14:33:11,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt 49: [2022-11-25 14:33:11,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 49: [2022-11-25 14:33:11,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 49: [2022-11-25 14:33:11,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 49: [2022-11-25 14:33:11,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt. 49: [2022-11-25 14:33:11,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt. 49: [2022-11-25 14:33:11,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt. 49: [2022-11-25 14:33:11,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt 49: [2022-11-25 14:33:11,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt 49: [2022-11-25 14:33:11,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 49: [2022-11-25 14:33:11,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt 49: [2022-11-25 14:33:11,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 49: [2022-11-25 14:33:11,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 42: [2022-11-25 14:33:11,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt. 42: [2022-11-25 14:33:11,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt. 42: [2022-11-25 14:33:11,741] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt. 42: [2022-11-25 14:33:11,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt 42: [2022-11-25 14:33:11,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt 42: [2022-11-25 14:33:11,741] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt 42: [2022-11-25 14:33:11,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 42: [2022-11-25 14:33:11,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 42: [2022-11-25 14:33:11,741] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 25: [2022-11-25 14:33:11,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 25: [2022-11-25 14:33:11,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 25: [2022-11-25 14:33:11,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 16: [2022-11-25 14:33:11,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 16: [2022-11-25 14:33:11,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 16: [2022-11-25 14:33:11,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 16: [2022-11-25 14:33:11,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 16: [2022-11-25 14:33:11,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 16: [2022-11-25 14:33:11,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 16: [2022-11-25 14:33:11,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 16: [2022-11-25 14:33:11,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 16: [2022-11-25 14:33:11,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 16: [2022-11-25 14:33:11,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 16: [2022-11-25 14:33:11,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 16: [2022-11-25 14:33:11,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 16: [2022-11-25 14:33:11,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 16: [2022-11-25 14:33:11,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 16: [2022-11-25 14:33:11,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 16: [2022-11-25 14:33:11,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 16: [2022-11-25 14:33:11,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 16: [2022-11-25 14:33:11,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 42: [2022-11-25 14:33:11,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt. 42: [2022-11-25 14:33:11,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt. 42: [2022-11-25 14:33:11,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt 42: [2022-11-25 14:33:11,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt 42: [2022-11-25 14:33:11,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 42: [2022-11-25 14:33:11,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 53: [2022-11-25 14:33:11,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt. 53: [2022-11-25 14:33:11,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt 53: [2022-11-25 14:33:11,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 53: [2022-11-25 14:33:11,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt. 53: [2022-11-25 14:33:11,764] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt 53: [2022-11-25 14:33:11,764] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 10: [2022-11-25 14:33:11,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 10: [2022-11-25 14:33:11,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 10: [2022-11-25 14:33:11,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 6: [2022-11-25 14:33:11,772] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 6: [2022-11-25 14:33:11,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 6: [2022-11-25 14:33:11,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 42: [2022-11-25 14:33:11,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt. 42: [2022-11-25 14:33:11,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt 42: [2022-11-25 14:33:11,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 4: [2022-11-25 14:33:11,774] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 4: [2022-11-25 14:33:11,774] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 4: [2022-11-25 14:33:11,774] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 14:33:11,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt. 56: [2022-11-25 14:33:11,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt. 56: [2022-11-25 14:33:11,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt. 56: [2022-11-25 14:33:11,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt. 56: [2022-11-25 14:33:11,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt. 56: [2022-11-25 14:33:11,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt 56: [2022-11-25 14:33:11,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt 56: [2022-11-25 14:33:11,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt 56: [2022-11-25 14:33:11,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt 56: [2022-11-25 14:33:11,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 14:33:11,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 14:33:11,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 14:33:11,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 14:33:11,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt 56: [2022-11-25 14:33:11,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 14:33:11,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt. 56: [2022-11-25 14:33:11,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt 56: [2022-11-25 14:33:11,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 24: [2022-11-25 14:33:11,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 24: [2022-11-25 14:33:11,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 24: [2022-11-25 14:33:11,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 14:33:11,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 19: [2022-11-25 14:33:11,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 19: [2022-11-25 14:33:11,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 49: [2022-11-25 14:33:11,892] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt. 49: [2022-11-25 14:33:11,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt 49: [2022-11-25 14:33:11,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 20: [2022-11-25 14:33:11,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 20: [2022-11-25 14:33:11,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 20: [2022-11-25 14:33:11,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 14:33:11,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 3: [2022-11-25 14:33:11,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 3: [2022-11-25 14:33:11,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 26: [2022-11-25 14:33:11,937] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 26: [2022-11-25 14:33:11,937] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 26: [2022-11-25 14:33:11,937] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 30: [2022-11-25 14:33:11,949] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 30: [2022-11-25 14:33:11,949] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 30: [2022-11-25 14:33:11,949] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 2: [2022-11-25 14:33:11,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 2: [2022-11-25 14:33:11,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 2: [2022-11-25 14:33:11,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 27: [2022-11-25 14:33:11,978] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 27: [2022-11-25 14:33:11,979] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 27: [2022-11-25 14:33:11,979] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 4: [2022-11-25 14:33:11,984] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 4: [2022-11-25 14:33:11,984] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 4: [2022-11-25 14:33:11,984] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 29: [2022-11-25 14:33:11,987] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 29: [2022-11-25 14:33:11,987] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 29: [2022-11-25 14:33:11,987] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 28: [2022-11-25 14:33:11,988] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 28: [2022-11-25 14:33:11,988] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 28: [2022-11-25 14:33:11,988] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 60: [2022-11-25 14:33:11,989] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt. 60: [2022-11-25 14:33:11,989] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt 60: [2022-11-25 14:33:11,989] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 39: [2022-11-25 14:33:11,990] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt. 39: [2022-11-25 14:33:11,990] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt 39: [2022-11-25 14:33:11,990] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 1: [2022-11-25 14:33:11,991] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 1: [2022-11-25 14:33:11,991] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 1: [2022-11-25 14:33:11,991] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 5: [2022-11-25 14:33:11,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 5: [2022-11-25 14:33:11,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 5: [2022-11-25 14:33:11,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 14:33:11,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 0: [2022-11-25 14:33:11,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 0: [2022-11-25 14:33:11,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 51: [2022-11-25 14:33:11,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt. 51: [2022-11-25 14:33:11,994] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt 51: [2022-11-25 14:33:11,994] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 37: [2022-11-25 14:33:11,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt. 37: [2022-11-25 14:33:11,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt 38: [2022-11-25 14:33:11,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt. 38: [2022-11-25 14:33:11,999] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt 38: [2022-11-25 14:33:11,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 37: [2022-11-25 14:33:11,999] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 32: [2022-11-25 14:33:12,000] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt. 32: [2022-11-25 14:33:12,000] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt 32: [2022-11-25 14:33:12,000] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 13: [2022-11-25 14:33:12,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 13: [2022-11-25 14:33:12,002] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 13: [2022-11-25 14:33:12,002] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 24: [2022-11-25 14:33:12,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 24: [2022-11-25 14:33:12,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 24: [2022-11-25 14:33:12,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 14:33:12,004] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt. 52: [2022-11-25 14:33:12,005] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt. 52: [2022-11-25 14:33:12,005] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt 52: [2022-11-25 14:33:12,005] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: [2022-11-25 14:33:12,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 0: [2022-11-25 14:33:12,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 0: [2022-11-25 14:33:12,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 57: [2022-11-25 14:33:12,006] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt. 57: [2022-11-25 14:33:12,006] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt 57: [2022-11-25 14:33:12,006] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 14:33:12,004] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt 56: [2022-11-25 14:33:12,004] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 41: [2022-11-25 14:33:12,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt. 41: [2022-11-25 14:33:12,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt 41: [2022-11-25 14:33:12,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 63: [2022-11-25 14:33:12,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt. 63: [2022-11-25 14:33:12,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt 63: [2022-11-25 14:33:12,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 7: [2022-11-25 14:33:12,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 7: [2022-11-25 14:33:12,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 7: [2022-11-25 14:33:12,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 10: [2022-11-25 14:33:12,010] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 10: [2022-11-25 14:33:12,010] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 10: [2022-11-25 14:33:12,010] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 20: [2022-11-25 14:33:12,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 20: [2022-11-25 14:33:12,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 20: [2022-11-25 14:33:12,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 25: [2022-11-25 14:33:12,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 42: [2022-11-25 14:33:12,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt. 42: [2022-11-25 14:33:12,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt 25: [2022-11-25 14:33:12,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 48: [2022-11-25 14:33:12,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt. 42: [2022-11-25 14:33:12,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 14:33:12,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt 25: [2022-11-25 14:33:12,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 14:33:12,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 55: [2022-11-25 14:33:12,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt. 55: [2022-11-25 14:33:12,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt 3: [2022-11-25 14:33:12,013] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 55: [2022-11-25 14:33:12,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 3: [2022-11-25 14:33:12,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 3: [2022-11-25 14:33:12,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 45: [2022-11-25 14:33:12,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt. 45: [2022-11-25 14:33:12,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt 45: [2022-11-25 14:33:12,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 8: [2022-11-25 14:33:12,014] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 8: [2022-11-25 14:33:12,014] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 8: [2022-11-25 14:33:12,014] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 15: [2022-11-25 14:33:12,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 15: [2022-11-25 14:33:12,012] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 15: [2022-11-25 14:33:12,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 14: [2022-11-25 14:33:12,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 14: [2022-11-25 14:33:12,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 14: [2022-11-25 14:33:12,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 59: [2022-11-25 14:33:12,016] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt. 59: [2022-11-25 14:33:12,016] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt 59: [2022-11-25 14:33:12,016] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 62: [2022-11-25 14:33:12,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt. 62: [2022-11-25 14:33:12,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt 62: [2022-11-25 14:33:12,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 16: [2022-11-25 14:33:12,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 16: [2022-11-25 14:33:12,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 16: [2022-11-25 14:33:12,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 61: [2022-11-25 14:33:12,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt. 61: [2022-11-25 14:33:12,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt 61: [2022-11-25 14:33:12,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 40: [2022-11-25 14:33:12,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt. 40: [2022-11-25 14:33:12,017] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt 40: [2022-11-25 14:33:12,017] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 46: [2022-11-25 14:33:12,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt. 46: [2022-11-25 14:33:12,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt 46: [2022-11-25 14:33:12,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 47: [2022-11-25 14:33:12,018] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt. 47: [2022-11-25 14:33:12,018] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt 47: [2022-11-25 14:33:12,018] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 17: [2022-11-25 14:33:12,019] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 17: [2022-11-25 14:33:12,019] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 17: [2022-11-25 14:33:12,019] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 30: [2022-11-25 14:33:12,021] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 30: [2022-11-25 14:33:12,021] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 30: [2022-11-25 14:33:12,021] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 60: [2022-11-25 14:33:12,023] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt. 60: [2022-11-25 14:33:12,023] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt 60: [2022-11-25 14:33:12,023] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 14:33:12,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt. 34: [2022-11-25 14:33:12,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt 33: [2022-11-25 14:33:12,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt. 33: [2022-11-25 14:33:12,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt 33: [2022-11-25 14:33:12,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 14:33:12,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 50: [2022-11-25 14:33:12,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt. 19: [2022-11-25 14:33:12,025] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 50: [2022-11-25 14:33:12,025] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt 50: [2022-11-25 14:33:12,025] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 19: [2022-11-25 14:33:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 19: [2022-11-25 14:33:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 18: [2022-11-25 14:33:12,026] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 18: [2022-11-25 14:33:12,026] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 18: [2022-11-25 14:33:12,026] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 53: [2022-11-25 14:33:12,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt. 53: [2022-11-25 14:33:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt 36: [2022-11-25 14:33:12,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt. 53: [2022-11-25 14:33:12,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 36: [2022-11-25 14:33:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt 36: [2022-11-25 14:33:12,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 6: [2022-11-25 14:33:12,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 6: [2022-11-25 14:33:12,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 6: [2022-11-25 14:33:12,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 22: [2022-11-25 14:33:12,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 22: [2022-11-25 14:33:12,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 22: [2022-11-25 14:33:12,029] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 11: [2022-11-25 14:33:12,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 11: [2022-11-25 14:33:12,030] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 11: [2022-11-25 14:33:12,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 44: [2022-11-25 14:33:12,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt. 44: [2022-11-25 14:33:12,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt 44: [2022-11-25 14:33:12,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 9: [2022-11-25 14:33:12,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 9: [2022-11-25 14:33:12,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 9: [2022-11-25 14:33:12,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 26: [2022-11-25 14:33:12,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 2: [2022-11-25 14:33:12,032] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 26: [2022-11-25 14:33:12,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 26: [2022-11-25 14:33:12,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 2: [2022-11-25 14:33:12,032] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 2: [2022-11-25 14:33:12,032] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 12: [2022-11-25 14:33:12,033] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 12: [2022-11-25 14:33:12,033] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 12: [2022-11-25 14:33:12,034] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 49: [2022-11-25 14:33:12,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt. 49: [2022-11-25 14:33:12,035] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt 49: [2022-11-25 14:33:12,035] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 5: [2022-11-25 14:33:12,035] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 5: [2022-11-25 14:33:12,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 5: [2022-11-25 14:33:12,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 4: [2022-11-25 14:33:12,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 4: [2022-11-25 14:33:12,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 4: [2022-11-25 14:33:12,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 14:33:12,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 39: [2022-11-25 14:33:12,038] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt. 39: [2022-11-25 14:33:12,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt 39: [2022-11-25 14:33:12,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 31: [2022-11-25 14:33:12,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 31: [2022-11-25 14:33:12,039] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 31: [2022-11-25 14:33:12,039] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 27: [2022-11-25 14:33:12,040] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 27: [2022-11-25 14:33:12,040] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 27: [2022-11-25 14:33:12,040] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 1: [2022-11-25 14:33:12,042] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 1: [2022-11-25 14:33:12,042] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 1: [2022-11-25 14:33:12,042] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 28: [2022-11-25 14:33:12,044] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 28: [2022-11-25 14:33:12,044] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 28: [2022-11-25 14:33:12,044] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 51: [2022-11-25 14:33:12,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt. 51: [2022-11-25 14:33:12,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt 51: [2022-11-25 14:33:12,049] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 54: [2022-11-25 14:33:12,049] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt. 54: [2022-11-25 14:33:12,049] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt 54: [2022-11-25 14:33:12,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 14:33:12,038] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 21: [2022-11-25 14:33:12,038] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 23: [2022-11-25 14:33:12,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 23: [2022-11-25 14:33:12,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 23: [2022-11-25 14:33:12,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 32: [2022-11-25 14:33:12,052] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt. 32: [2022-11-25 14:33:12,052] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt 32: [2022-11-25 14:33:12,052] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 37: [2022-11-25 14:33:12,053] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt. 37: [2022-11-25 14:33:12,053] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt 37: [2022-11-25 14:33:12,053] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 41: [2022-11-25 14:33:12,056] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt. 41: [2022-11-25 14:33:12,056] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt 41: [2022-11-25 14:33:12,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 56: [2022-11-25 14:33:12,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt. 56: [2022-11-25 14:33:12,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt 46: [2022-11-25 14:33:12,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt. 46: [2022-11-25 14:33:12,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt 56: [2022-11-25 14:33:12,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 46: [2022-11-25 14:33:12,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 42: [2022-11-25 14:33:12,057] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt. 42: [2022-11-25 14:33:12,057] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt 42: [2022-11-25 14:33:12,057] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 50: [2022-11-25 14:33:12,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt. 50: [2022-11-25 14:33:12,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt 50: [2022-11-25 14:33:12,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 16: [2022-11-25 14:33:12,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 63: [2022-11-25 14:33:12,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt. 16: [2022-11-25 14:33:12,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 63: [2022-11-25 14:33:12,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt 63: [2022-11-25 14:33:12,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 52: [2022-11-25 14:33:12,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt. 16: [2022-11-25 14:33:12,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 52: [2022-11-25 14:33:12,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt 52: [2022-11-25 14:33:12,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 29: [2022-11-25 14:33:12,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 29: [2022-11-25 14:33:12,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 29: [2022-11-25 14:33:12,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 61: [2022-11-25 14:33:12,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt. 61: [2022-11-25 14:33:12,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt 48: [2022-11-25 14:33:12,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt. 61: [2022-11-25 14:33:12,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 14:33:12,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt 48: [2022-11-25 14:33:12,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 14: [2022-11-25 14:33:12,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 62: [2022-11-25 14:33:12,059] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt. 14: [2022-11-25 14:33:12,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 14: [2022-11-25 14:33:12,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 62: [2022-11-25 14:33:12,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt 62: [2022-11-25 14:33:12,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 17: [2022-11-25 14:33:12,060] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 17: [2022-11-25 14:33:12,060] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 17: [2022-11-25 14:33:12,060] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 57: [2022-11-25 14:33:12,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt. 57: [2022-11-25 14:33:12,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt 57: [2022-11-25 14:33:12,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 33: [2022-11-25 14:33:12,063] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt. 33: [2022-11-25 14:33:12,063] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt 33: [2022-11-25 14:33:12,063] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 59: [2022-11-25 14:33:12,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt. 53: [2022-11-25 14:33:12,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt. 53: [2022-11-25 14:33:12,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt 53: [2022-11-25 14:33:12,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 59: [2022-11-25 14:33:12,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt 59: [2022-11-25 14:33:12,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 11: [2022-11-25 14:33:12,064] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 11: [2022-11-25 14:33:12,064] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 11: [2022-11-25 14:33:12,064] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 47: [2022-11-25 14:33:12,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt. 47: [2022-11-25 14:33:12,065] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt 47: [2022-11-25 14:33:12,065] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 13: [2022-11-25 14:33:12,065] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 13: [2022-11-25 14:33:12,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 13: [2022-11-25 14:33:12,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 23: [2022-11-25 14:33:12,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 23: [2022-11-25 14:33:12,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 23: [2022-11-25 14:33:12,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 44: [2022-11-25 14:33:12,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt. 44: [2022-11-25 14:33:12,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt 44: [2022-11-25 14:33:12,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 29: [2022-11-25 14:33:12,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 29: [2022-11-25 14:33:12,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 29: [2022-11-25 14:33:12,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 22: [2022-11-25 14:33:12,071] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 22: [2022-11-25 14:33:12,071] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 22: [2022-11-25 14:33:12,071] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 51: [2022-11-25 14:33:12,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt. 51: [2022-11-25 14:33:12,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt 45: [2022-11-25 14:33:12,073] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt. 51: [2022-11-25 14:33:12,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 45: [2022-11-25 14:33:12,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt 45: [2022-11-25 14:33:12,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 12: [2022-11-25 14:33:12,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 12: [2022-11-25 14:33:12,075] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 12: [2022-11-25 14:33:12,075] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 53: [2022-11-25 14:33:12,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt. 53: [2022-11-25 14:33:12,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt 53: [2022-11-25 14:33:12,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 36: [2022-11-25 14:33:12,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt. 36: [2022-11-25 14:33:12,078] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt 36: [2022-11-25 14:33:12,078] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 54: [2022-11-25 14:33:12,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt. 54: [2022-11-25 14:33:12,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt 54: [2022-11-25 14:33:12,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 31: [2022-11-25 14:33:12,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 31: [2022-11-25 14:33:12,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 31: [2022-11-25 14:33:12,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 8: [2022-11-25 14:33:12,080] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 8: [2022-11-25 14:33:12,080] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 8: [2022-11-25 14:33:12,080] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 7: [2022-11-25 14:33:12,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 40: [2022-11-25 14:33:12,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt. 7: [2022-11-25 14:33:12,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 40: [2022-11-25 14:33:12,081] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt 40: [2022-11-25 14:33:12,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 7: [2022-11-25 14:33:12,081] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 38: [2022-11-25 14:33:12,081] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt. 38: [2022-11-25 14:33:12,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt 38: [2022-11-25 14:33:12,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 41: [2022-11-25 14:33:12,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt. 41: [2022-11-25 14:33:12,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt 41: [2022-11-25 14:33:12,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 17: [2022-11-25 14:33:12,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 17: [2022-11-25 14:33:12,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 17: [2022-11-25 14:33:12,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 18: [2022-11-25 14:33:12,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 18: [2022-11-25 14:33:12,085] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 18: [2022-11-25 14:33:12,085] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 37: [2022-11-25 14:33:12,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt. 37: [2022-11-25 14:33:12,086] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt 37: [2022-11-25 14:33:12,086] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 50: [2022-11-25 14:33:12,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt. 50: [2022-11-25 14:33:12,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt 50: [2022-11-25 14:33:12,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 14:33:12,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt. 34: [2022-11-25 14:33:12,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt 34: [2022-11-25 14:33:12,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 35: [2022-11-25 14:33:12,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt. 35: [2022-11-25 14:33:12,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt. 35: [2022-11-25 14:33:12,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt 35: [2022-11-25 14:33:12,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt 35: [2022-11-25 14:33:12,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 35: [2022-11-25 14:33:12,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 35: [2022-11-25 14:33:12,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt. 35: [2022-11-25 14:33:12,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt. 35: [2022-11-25 14:33:12,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt 15: [2022-11-25 14:33:12,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 35: [2022-11-25 14:33:12,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt 15: [2022-11-25 14:33:12,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 15: [2022-11-25 14:33:12,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 35: [2022-11-25 14:33:12,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 35: [2022-11-25 14:33:12,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 52: [2022-11-25 14:33:12,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt. 52: [2022-11-25 14:33:12,092] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt 52: [2022-11-25 14:33:12,092] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 14:33:12,094] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt. 48: [2022-11-25 14:33:12,094] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt 48: [2022-11-25 14:33:12,094] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 58: [2022-11-25 14:33:12,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt. 58: [2022-11-25 14:33:12,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt. 58: [2022-11-25 14:33:12,100] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt 58: [2022-11-25 14:33:12,100] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt 58: [2022-11-25 14:33:12,100] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 58: [2022-11-25 14:33:12,100] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 14:33:12,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 21: [2022-11-25 14:33:12,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 21: [2022-11-25 14:33:12,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 9: [2022-11-25 14:33:12,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 9: [2022-11-25 14:33:12,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 9: [2022-11-25 14:33:12,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 63: [2022-11-25 14:33:12,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt. 63: [2022-11-25 14:33:12,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt 63: [2022-11-25 14:33:12,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 40: [2022-11-25 14:33:12,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt. 7: [2022-11-25 14:33:12,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 40: [2022-11-25 14:33:12,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt 40: [2022-11-25 14:33:12,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 7: [2022-11-25 14:33:12,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 7: [2022-11-25 14:33:12,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 12: [2022-11-25 14:33:12,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 12: [2022-11-25 14:33:12,114] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 12: [2022-11-25 14:33:12,114] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 45: [2022-11-25 14:33:12,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt. 45: [2022-11-25 14:33:12,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt 22: [2022-11-25 14:33:12,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 45: [2022-11-25 14:33:12,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 22: [2022-11-25 14:33:12,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 22: [2022-11-25 14:33:12,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 14:33:12,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt. 18: [2022-11-25 14:33:12,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 34: [2022-11-25 14:33:12,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt 18: [2022-11-25 14:33:12,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 33: [2022-11-25 14:33:12,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt. 34: [2022-11-25 14:33:12,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 18: [2022-11-25 14:33:12,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 33: [2022-11-25 14:33:12,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt 33: [2022-11-25 14:33:12,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 11: [2022-11-25 14:33:12,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 11: [2022-11-25 14:33:12,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 11: [2022-11-25 14:33:12,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 14: [2022-11-25 14:33:12,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 14: [2022-11-25 14:33:12,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 14: [2022-11-25 14:33:12,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 44: [2022-11-25 14:33:12,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt. 44: [2022-11-25 14:33:12,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt 44: [2022-11-25 14:33:12,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 62: [2022-11-25 14:33:12,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt. 62: [2022-11-25 14:33:12,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt 62: [2022-11-25 14:33:12,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 36: [2022-11-25 14:33:12,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt. 38: [2022-11-25 14:33:12,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt. 36: [2022-11-25 14:33:12,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt 36: [2022-11-25 14:33:12,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 38: [2022-11-25 14:33:12,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt 38: [2022-11-25 14:33:12,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 14:33:12,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 21: [2022-11-25 14:33:12,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 21: [2022-11-25 14:33:12,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 9: [2022-11-25 14:33:12,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 9: [2022-11-25 14:33:12,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 9: [2022-11-25 14:33:12,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 59: [2022-11-25 14:33:12,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt. 59: [2022-11-25 14:33:12,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt 59: [2022-11-25 14:33:12,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 8: [2022-11-25 14:33:12,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 8: [2022-11-25 14:33:12,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 8: [2022-11-25 14:33:12,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 23: [2022-11-25 14:33:12,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 23: [2022-11-25 14:33:12,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 23: [2022-11-25 14:33:12,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 54: [2022-11-25 14:33:12,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt. 54: [2022-11-25 14:33:12,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt 54: [2022-11-25 14:33:12,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 58: [2022-11-25 14:33:12,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt. 58: [2022-11-25 14:33:12,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt 58: [2022-11-25 14:33:12,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 15: [2022-11-25 14:33:12,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 15: [2022-11-25 14:33:12,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 15: [2022-11-25 14:33:12,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 51: [2022-11-25 14:33:12,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt. 51: [2022-11-25 14:33:12,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt 51: [2022-11-25 14:33:12,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 29: [2022-11-25 14:33:12,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 29: [2022-11-25 14:33:12,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 29: [2022-11-25 14:33:12,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 35: [2022-11-25 14:33:12,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt. 35: [2022-11-25 14:33:12,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt 35: [2022-11-25 14:33:12,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 58: [2022-11-25 14:33:12,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt. 58: [2022-11-25 14:33:12,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt 58: [2022-11-25 14:33:12,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 47: [2022-11-25 14:33:12,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt. 47: [2022-11-25 14:33:12,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt 47: [2022-11-25 14:33:12,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 58: [2022-11-25 14:33:12,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt. 58: [2022-11-25 14:33:12,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt 58: [2022-11-25 14:33:12,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 38: [2022-11-25 14:33:12,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt. 38: [2022-11-25 14:33:12,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt 38: [2022-11-25 14:33:12,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 37: [2022-11-25 14:33:12,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt. 37: [2022-11-25 14:33:12,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt 37: [2022-11-25 14:33:12,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 52: [2022-11-25 14:33:12,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt. 52: [2022-11-25 14:33:12,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt 52: [2022-11-25 14:33:12,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 14:33:12,148] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt. 48: [2022-11-25 14:33:12,148] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt 48: [2022-11-25 14:33:12,148] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 63: [2022-11-25 14:33:12,151] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt. 63: [2022-11-25 14:33:12,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt 41: [2022-11-25 14:33:12,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt. 63: [2022-11-25 14:33:12,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 41: [2022-11-25 14:33:12,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt 41: [2022-11-25 14:33:12,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 17: [2022-11-25 14:33:12,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 17: [2022-11-25 14:33:12,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 17: [2022-11-25 14:33:12,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 8: [2022-11-25 14:33:12,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 53: [2022-11-25 14:33:12,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt. 8: [2022-11-25 14:33:12,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 8: [2022-11-25 14:33:12,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 53: [2022-11-25 14:33:12,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt 53: [2022-11-25 14:33:12,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 40: [2022-11-25 14:33:12,165] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt. 40: [2022-11-25 14:33:12,165] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt 40: [2022-11-25 14:33:12,165] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 58: [2022-11-25 14:33:12,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt. 58: [2022-11-25 14:33:12,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt 58: [2022-11-25 14:33:12,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 44: [2022-11-25 14:33:12,168] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt. 44: [2022-11-25 14:33:12,168] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt 44: [2022-11-25 14:33:12,168] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 13: [2022-11-25 14:33:12,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 13: [2022-11-25 14:33:12,169] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 13: [2022-11-25 14:33:12,169] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 9: [2022-11-25 14:33:12,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 9: [2022-11-25 14:33:12,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 9: [2022-11-25 14:33:12,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 36: [2022-11-25 14:33:12,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt. 36: [2022-11-25 14:33:12,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt 36: [2022-11-25 14:33:12,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 12: [2022-11-25 14:33:12,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 12: [2022-11-25 14:33:12,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 12: [2022-11-25 14:33:12,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 15: [2022-11-25 14:33:12,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 15: [2022-11-25 14:33:12,200] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 15: [2022-11-25 14:33:12,200] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 45: [2022-11-25 14:33:12,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt. 45: [2022-11-25 14:33:12,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt 45: [2022-11-25 14:33:12,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 11: [2022-11-25 14:33:12,202] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 11: [2022-11-25 14:33:12,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 11: [2022-11-25 14:33:12,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 62: [2022-11-25 14:33:12,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt. 62: [2022-11-25 14:33:12,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt 62: [2022-11-25 14:33:12,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 13: [2022-11-25 14:33:12,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 13: [2022-11-25 14:33:12,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 13: [2022-11-25 14:33:12,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 50: [2022-11-25 14:33:12,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt. 50: [2022-11-25 14:33:12,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt 50: [2022-11-25 14:33:12,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 7: [2022-11-25 14:33:12,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 7: [2022-11-25 14:33:12,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 7: [2022-11-25 14:33:12,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 59: [2022-11-25 14:33:12,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt. 59: [2022-11-25 14:33:12,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt 59: [2022-11-25 14:33:12,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 14:33:12,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 21: [2022-11-25 14:33:12,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 21: [2022-11-25 14:33:12,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 23: [2022-11-25 14:33:12,223] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 23: [2022-11-25 14:33:12,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 23: [2022-11-25 14:33:12,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 22: [2022-11-25 14:33:12,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 22: [2022-11-25 14:33:12,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 22: [2022-11-25 14:33:12,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 33: [2022-11-25 14:33:12,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt. 14: [2022-11-25 14:33:12,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 33: [2022-11-25 14:33:12,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt 14: [2022-11-25 14:33:12,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 33: [2022-11-25 14:33:12,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 14: [2022-11-25 14:33:12,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 18: [2022-11-25 14:33:12,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 18: [2022-11-25 14:33:12,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 18: [2022-11-25 14:33:12,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 43: [2022-11-25 14:33:12,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt. 43: [2022-11-25 14:33:12,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt. 43: [2022-11-25 14:33:12,234] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt. 43: [2022-11-25 14:33:12,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt 43: [2022-11-25 14:33:12,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt 43: [2022-11-25 14:33:12,235] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt 43: [2022-11-25 14:33:12,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 43: [2022-11-25 14:33:12,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 43: [2022-11-25 14:33:12,235] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 14:33:12,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt. 34: [2022-11-25 14:33:12,238] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt 34: [2022-11-25 14:33:12,238] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 43: [2022-11-25 14:33:12,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt. 43: [2022-11-25 14:33:12,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt 43: [2022-11-25 14:33:12,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 43: [2022-11-25 14:33:12,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt. 43: [2022-11-25 14:33:12,239] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt 43: [2022-11-25 14:33:12,239] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 29: [2022-11-25 14:33:12,243] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 29: [2022-11-25 14:33:12,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 29: [2022-11-25 14:33:12,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 47: [2022-11-25 14:33:12,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt. 47: [2022-11-25 14:33:12,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt 47: [2022-11-25 14:33:12,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 54: [2022-11-25 14:33:12,253] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt. 54: [2022-11-25 14:33:12,253] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt 54: [2022-11-25 14:33:12,253] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 51: [2022-11-25 14:33:12,263] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt. 51: [2022-11-25 14:33:12,263] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt 51: [2022-11-25 14:33:12,263] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 53: [2022-11-25 14:33:12,264] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt. 53: [2022-11-25 14:33:12,264] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt 53: [2022-11-25 14:33:12,264] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 38: [2022-11-25 14:33:12,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt. 38: [2022-11-25 14:33:12,270] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt 38: [2022-11-25 14:33:12,270] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 41: [2022-11-25 14:33:12,271] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt. 41: [2022-11-25 14:33:12,271] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt 41: [2022-11-25 14:33:12,271] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 37: [2022-11-25 14:33:12,272] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt. 37: [2022-11-25 14:33:12,272] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt 37: [2022-11-25 14:33:12,272] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 35: [2022-11-25 14:33:12,291] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt. 35: [2022-11-25 14:33:12,291] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt 35: [2022-11-25 14:33:12,291] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 52: [2022-11-25 14:33:12,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt. 52: [2022-11-25 14:33:12,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt 52: [2022-11-25 14:33:12,294] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 58: [2022-11-25 14:33:12,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt. 58: [2022-11-25 14:33:12,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt 58: [2022-11-25 14:33:12,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 63: [2022-11-25 14:33:12,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt. 63: [2022-11-25 14:33:12,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt 63: [2022-11-25 14:33:12,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 14:33:12,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt. 48: [2022-11-25 14:33:12,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt 48: [2022-11-25 14:33:12,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 17: [2022-11-25 14:33:12,316] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 17: [2022-11-25 14:33:12,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 17: [2022-11-25 14:33:12,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 44: [2022-11-25 14:33:12,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt. 44: [2022-11-25 14:33:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt 44: [2022-11-25 14:33:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 40: [2022-11-25 14:33:12,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt. 40: [2022-11-25 14:33:12,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt 40: [2022-11-25 14:33:12,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 50: [2022-11-25 14:33:12,327] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt. 50: [2022-11-25 14:33:12,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt 50: [2022-11-25 14:33:12,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 8: [2022-11-25 14:33:12,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 8: [2022-11-25 14:33:12,331] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 8: [2022-11-25 14:33:12,331] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 11: [2022-11-25 14:33:12,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 11: [2022-11-25 14:33:12,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 11: [2022-11-25 14:33:12,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 9: [2022-11-25 14:33:12,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 9: [2022-11-25 14:33:12,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 9: [2022-11-25 14:33:12,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 43: [2022-11-25 14:33:12,366] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt. 43: [2022-11-25 14:33:12,366] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt 43: [2022-11-25 14:33:12,366] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 45: [2022-11-25 14:33:12,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt. 45: [2022-11-25 14:33:12,380] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt 45: [2022-11-25 14:33:12,380] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 62: [2022-11-25 14:33:12,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt. 62: [2022-11-25 14:33:12,383] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt 62: [2022-11-25 14:33:12,383] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 12: [2022-11-25 14:33:12,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 12: [2022-11-25 14:33:12,388] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 12: [2022-11-25 14:33:12,388] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 14:33:12,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 15: [2022-11-25 14:33:12,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 15: [2022-11-25 14:33:12,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 15: [2022-11-25 14:33:12,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 14:33:12,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 21: [2022-11-25 14:33:12,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 33: [2022-11-25 14:33:12,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt. 33: [2022-11-25 14:33:12,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt 33: [2022-11-25 14:33:12,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 43: [2022-11-25 14:33:12,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt. 43: [2022-11-25 14:33:12,393] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt 43: [2022-11-25 14:33:12,393] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 18: [2022-11-25 14:33:12,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 18: [2022-11-25 14:33:12,396] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 18: [2022-11-25 14:33:12,396] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 23: [2022-11-25 14:33:12,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 23: [2022-11-25 14:33:12,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 23: [2022-11-25 14:33:12,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 14: [2022-11-25 14:33:12,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 14: [2022-11-25 14:33:12,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 14: [2022-11-25 14:33:12,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 37: [2022-11-25 14:33:12,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt. 37: [2022-11-25 14:33:12,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt 37: [2022-11-25 14:33:12,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 7: [2022-11-25 14:33:12,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 63: [2022-11-25 14:33:12,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt. 41: [2022-11-25 14:33:12,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt. 7: [2022-11-25 14:33:12,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 63: [2022-11-25 14:33:12,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt 7: [2022-11-25 14:33:12,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 63: [2022-11-25 14:33:12,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 41: [2022-11-25 14:33:12,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt 41: [2022-11-25 14:33:12,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 62: [2022-11-25 14:33:12,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt. 62: [2022-11-25 14:33:12,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt 62: [2022-11-25 14:33:12,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 45: [2022-11-25 14:33:12,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt. 45: [2022-11-25 14:33:12,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt 45: [2022-11-25 14:33:12,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 13: [2022-11-25 14:33:12,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 13: [2022-11-25 14:33:12,403] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 13: [2022-11-25 14:33:12,403] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 51: [2022-11-25 14:33:12,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt. 51: [2022-11-25 14:33:12,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt 51: [2022-11-25 14:33:12,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 53: [2022-11-25 14:33:12,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt. 53: [2022-11-25 14:33:12,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt 53: [2022-11-25 14:33:12,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 38: [2022-11-25 14:33:12,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt. 38: [2022-11-25 14:33:12,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt 38: [2022-11-25 14:33:12,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 15: [2022-11-25 14:33:12,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 59: [2022-11-25 14:33:12,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt. 15: [2022-11-25 14:33:12,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 15: [2022-11-25 14:33:12,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 59: [2022-11-25 14:33:12,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt 59: [2022-11-25 14:33:12,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 12: [2022-11-25 14:33:12,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 12: [2022-11-25 14:33:12,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 12: [2022-11-25 14:33:12,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 47: [2022-11-25 14:33:12,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt. 47: [2022-11-25 14:33:12,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt 47: [2022-11-25 14:33:12,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 18: [2022-11-25 14:33:12,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 18: [2022-11-25 14:33:12,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 18: [2022-11-25 14:33:12,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 14:33:12,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt. 36: [2022-11-25 14:33:12,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt. 36: [2022-11-25 14:33:12,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt 36: [2022-11-25 14:33:12,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 48: [2022-11-25 14:33:12,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt 48: [2022-11-25 14:33:12,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 21: [2022-11-25 14:33:12,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 21: [2022-11-25 14:33:12,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 21: [2022-11-25 14:33:12,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 36: [2022-11-25 14:33:12,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt. 36: [2022-11-25 14:33:12,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt 36: [2022-11-25 14:33:12,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 11: [2022-11-25 14:33:12,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 11: [2022-11-25 14:33:12,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 11: [2022-11-25 14:33:12,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 8: [2022-11-25 14:33:12,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 8: [2022-11-25 14:33:12,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 8: [2022-11-25 14:33:12,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 40: [2022-11-25 14:33:12,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt. 40: [2022-11-25 14:33:12,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt 40: [2022-11-25 14:33:12,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 14: [2022-11-25 14:33:12,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 14: [2022-11-25 14:33:12,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 14: [2022-11-25 14:33:12,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 13: [2022-11-25 14:33:12,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 13: [2022-11-25 14:33:12,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 13: [2022-11-25 14:33:12,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 33: [2022-11-25 14:33:12,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt. 33: [2022-11-25 14:33:12,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt 33: [2022-11-25 14:33:12,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 14:33:12,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt. 34: [2022-11-25 14:33:12,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt 34: [2022-11-25 14:33:12,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 52: [2022-11-25 14:33:12,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt. 52: [2022-11-25 14:33:12,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt 52: [2022-11-25 14:33:12,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 59: [2022-11-25 14:33:12,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt. 59: [2022-11-25 14:33:12,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt 59: [2022-11-25 14:33:12,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 44: [2022-11-25 14:33:12,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt. 44: [2022-11-25 14:33:12,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt 44: [2022-11-25 14:33:12,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 47: [2022-11-25 14:33:12,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt. 47: [2022-11-25 14:33:12,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt 47: [2022-11-25 14:33:12,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 23: [2022-11-25 14:33:12,416] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 23: [2022-11-25 14:33:12,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 23: [2022-11-25 14:33:12,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 22: [2022-11-25 14:33:12,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 22: [2022-11-25 14:33:12,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 22: [2022-11-25 14:33:12,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 22: [2022-11-25 14:33:12,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 22: [2022-11-25 14:33:12,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 22: [2022-11-25 14:33:12,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 7: [2022-11-25 14:33:12,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 7: [2022-11-25 14:33:12,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 7: [2022-11-25 14:33:12,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 50: [2022-11-25 14:33:12,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt. 34: [2022-11-25 14:33:12,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt. 50: [2022-11-25 14:33:12,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt 50: [2022-11-25 14:33:12,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 34: [2022-11-25 14:33:12,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt 34: [2022-11-25 14:33:12,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 9: [2022-11-25 14:33:12,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 9: [2022-11-25 14:33:12,421] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 9: [2022-11-25 14:33:12,421] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 35: [2022-11-25 14:33:12,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt. 35: [2022-11-25 14:33:12,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt. 35: [2022-11-25 14:33:12,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt 35: [2022-11-25 14:33:12,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt 35: [2022-11-25 14:33:12,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 35: [2022-11-25 14:33:12,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 43: [2022-11-25 14:33:12,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt. 43: [2022-11-25 14:33:12,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt 43: [2022-11-25 14:33:12,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 58: [2022-11-25 14:33:12,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt. 58: [2022-11-25 14:33:12,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt 58: [2022-11-25 14:33:12,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 17: [2022-11-25 14:33:12,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 17: [2022-11-25 14:33:12,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 17: [2022-11-25 14:33:12,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 54: [2022-11-25 14:33:12,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt. 54: [2022-11-25 14:33:12,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt. 54: [2022-11-25 14:33:12,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt 54: [2022-11-25 14:33:12,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step3000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt 54: [2022-11-25 14:33:12,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 54: [2022-11-25 14:33:12,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! 0: successfully saved checkpoint at iteration 3000 to checkpoints_8b7 63: time (ms) | save-checkpoint: 7411.09 63: iteration 3010/ 5494 | consumed samples: 3082240 | consumed tokens: 6312427520 | elapsed time per iteration (s): 6.29 | learning rate: 9.780E-05 | global batch size: 1024 | lm loss: 2.317165E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 162.759 | TFLOPs: 36.39 | 63: iteration 3020/ 5494 | consumed samples: 3092480 | consumed tokens: 6333399040 | elapsed time per iteration (s): 5.55 | learning rate: 9.728E-05 | global batch size: 1024 | lm loss: 2.322562E+00 | grad norm: 0.179 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.404 | TFLOPs: 41.23 | 63: iteration 3030/ 5494 | consumed samples: 3102720 | consumed tokens: 6354370560 | elapsed time per iteration (s): 5.40 | learning rate: 9.677E-05 | global batch size: 1024 | lm loss: 2.316456E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.703 | TFLOPs: 42.41 | 63: iteration 3040/ 5494 | consumed samples: 3112960 | consumed tokens: 6375342080 | elapsed time per iteration (s): 5.54 | learning rate: 9.625E-05 | global batch size: 1024 | lm loss: 2.339428E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.725 | TFLOPs: 41.30 | 63: iteration 3050/ 5494 | consumed samples: 3123200 | consumed tokens: 6396313600 | elapsed time per iteration (s): 5.63 | learning rate: 9.574E-05 | global batch size: 1024 | lm loss: 2.311751E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.000 | TFLOPs: 40.69 | 63: iteration 3060/ 5494 | consumed samples: 3133440 | consumed tokens: 6417285120 | elapsed time per iteration (s): 5.68 | learning rate: 9.523E-05 | global batch size: 1024 | lm loss: 2.325647E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.260 | TFLOPs: 40.30 | 63: iteration 3070/ 5494 | consumed samples: 3143680 | consumed tokens: 6438256640 | elapsed time per iteration (s): 5.64 | learning rate: 9.472E-05 | global batch size: 1024 | lm loss: 2.329883E+00 | grad norm: 0.176 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.504 | TFLOPs: 40.58 | 63: iteration 3080/ 5494 | consumed samples: 3153920 | consumed tokens: 6459228160 | elapsed time per iteration (s): 5.41 | learning rate: 9.420E-05 | global batch size: 1024 | lm loss: 2.328629E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.197 | TFLOPs: 42.30 | 63: iteration 3090/ 5494 | consumed samples: 3164160 | consumed tokens: 6480199680 | elapsed time per iteration (s): 5.90 | learning rate: 9.369E-05 | global batch size: 1024 | lm loss: 2.334631E+00 | grad norm: 0.159 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.629 | TFLOPs: 38.82 | 63: iteration 3100/ 5494 | consumed samples: 3174400 | consumed tokens: 6501171200 | elapsed time per iteration (s): 5.64 | learning rate: 9.318E-05 | global batch size: 1024 | lm loss: 2.323568E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.720 | TFLOPs: 40.63 | 63: iteration 3110/ 5494 | consumed samples: 3184640 | consumed tokens: 6522142720 | elapsed time per iteration (s): 5.66 | learning rate: 9.267E-05 | global batch size: 1024 | lm loss: 2.326986E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.954 | TFLOPs: 40.46 | 63: iteration 3120/ 5494 | consumed samples: 3194880 | consumed tokens: 6543114240 | elapsed time per iteration (s): 6.09 | learning rate: 9.216E-05 | global batch size: 1024 | lm loss: 2.323442E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.130 | TFLOPs: 37.59 | 63: iteration 3130/ 5494 | consumed samples: 3205120 | consumed tokens: 6564085760 | elapsed time per iteration (s): 5.54 | learning rate: 9.165E-05 | global batch size: 1024 | lm loss: 2.306409E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.001 | TFLOPs: 41.36 | 63: iteration 3140/ 5494 | consumed samples: 3215360 | consumed tokens: 6585057280 | elapsed time per iteration (s): 5.63 | learning rate: 9.114E-05 | global batch size: 1024 | lm loss: 2.317699E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.790 | TFLOPs: 40.64 | 63: iteration 3150/ 5494 | consumed samples: 3225600 | consumed tokens: 6606028800 | elapsed time per iteration (s): 6.09 | learning rate: 9.063E-05 | global batch size: 1024 | lm loss: 2.323162E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 168.261 | TFLOPs: 37.62 | 63: iteration 3160/ 5494 | consumed samples: 3235840 | consumed tokens: 6627000320 | elapsed time per iteration (s): 5.77 | learning rate: 9.013E-05 | global batch size: 1024 | lm loss: 2.315002E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.577 | TFLOPs: 39.70 | 63: iteration 3170/ 5494 | consumed samples: 3246080 | consumed tokens: 6647971840 | elapsed time per iteration (s): 5.66 | learning rate: 8.962E-05 | global batch size: 1024 | lm loss: 2.313418E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.939 | TFLOPs: 40.45 | 63: iteration 3180/ 5494 | consumed samples: 3256320 | consumed tokens: 6668943360 | elapsed time per iteration (s): 5.91 | learning rate: 8.911E-05 | global batch size: 1024 | lm loss: 2.303514E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.400 | TFLOPs: 38.77 | 63: iteration 3190/ 5494 | consumed samples: 3266560 | consumed tokens: 6689914880 | elapsed time per iteration (s): 5.79 | learning rate: 8.861E-05 | global batch size: 1024 | lm loss: 2.309588E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.987 | TFLOPs: 39.57 | 63: iteration 3200/ 5494 | consumed samples: 3276800 | consumed tokens: 6710886400 | elapsed time per iteration (s): 5.76 | learning rate: 8.810E-05 | global batch size: 1024 | lm loss: 2.308530E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.647 | TFLOPs: 39.72 | 63: iteration 3210/ 5494 | consumed samples: 3287040 | consumed tokens: 6731857920 | elapsed time per iteration (s): 6.05 | learning rate: 8.760E-05 | global batch size: 1024 | lm loss: 2.306970E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 169.292 | TFLOPs: 37.85 | 63: iteration 3220/ 5494 | consumed samples: 3297280 | consumed tokens: 6752829440 | elapsed time per iteration (s): 5.60 | learning rate: 8.710E-05 | global batch size: 1024 | lm loss: 2.293784E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.730 | TFLOPs: 40.85 | 63: iteration 3230/ 5494 | consumed samples: 3307520 | consumed tokens: 6773800960 | elapsed time per iteration (s): 5.57 | learning rate: 8.660E-05 | global batch size: 1024 | lm loss: 2.319479E+00 | grad norm: 0.180 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.942 | TFLOPs: 41.12 | 63: iteration 3240/ 5494 | consumed samples: 3317760 | consumed tokens: 6794772480 | elapsed time per iteration (s): 5.78 | learning rate: 8.609E-05 | global batch size: 1024 | lm loss: 2.302942E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.274 | TFLOPs: 39.63 | 63: iteration 3250/ 5494 | consumed samples: 3328000 | consumed tokens: 6815744000 | elapsed time per iteration (s): 5.53 | learning rate: 8.559E-05 | global batch size: 1024 | lm loss: 2.298755E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.089 | TFLOPs: 41.38 | 63: iteration 3260/ 5494 | consumed samples: 3338240 | consumed tokens: 6836715520 | elapsed time per iteration (s): 5.76 | learning rate: 8.509E-05 | global batch size: 1024 | lm loss: 2.302052E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.828 | TFLOPs: 39.76 | 63: iteration 3270/ 5494 | consumed samples: 3348480 | consumed tokens: 6857687040 | elapsed time per iteration (s): 5.73 | learning rate: 8.459E-05 | global batch size: 1024 | lm loss: 2.299782E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.666 | TFLOPs: 39.94 | 63: iteration 3280/ 5494 | consumed samples: 3358720 | consumed tokens: 6878658560 | elapsed time per iteration (s): 5.61 | learning rate: 8.410E-05 | global batch size: 1024 | lm loss: 2.305755E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.524 | TFLOPs: 40.81 | 63: iteration 3290/ 5494 | consumed samples: 3368960 | consumed tokens: 6899630080 | elapsed time per iteration (s): 5.97 | learning rate: 8.360E-05 | global batch size: 1024 | lm loss: 2.289078E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.562 | TFLOPs: 38.36 | 63: iteration 3300/ 5494 | consumed samples: 3379200 | consumed tokens: 6920601600 | elapsed time per iteration (s): 5.40 | learning rate: 8.310E-05 | global batch size: 1024 | lm loss: 2.293840E+00 | grad norm: 0.167 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.592 | TFLOPs: 42.39 | 63: iteration 3310/ 5494 | consumed samples: 3389440 | consumed tokens: 6941573120 | elapsed time per iteration (s): 5.63 | learning rate: 8.261E-05 | global batch size: 1024 | lm loss: 2.300018E+00 | grad norm: 0.164 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.842 | TFLOPs: 40.65 | 63: iteration 3320/ 5494 | consumed samples: 3399680 | consumed tokens: 6962544640 | elapsed time per iteration (s): 14.94 | learning rate: 8.211E-05 | global batch size: 1024 | lm loss: 2.303198E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 68.558 | TFLOPs: 15.33 | 63: iteration 3330/ 5494 | consumed samples: 3409920 | consumed tokens: 6983516160 | elapsed time per iteration (s): 5.51 | learning rate: 8.162E-05 | global batch size: 1024 | lm loss: 2.306894E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.919 | TFLOPs: 41.57 | 63: iteration 3340/ 5494 | consumed samples: 3420160 | consumed tokens: 7004487680 | elapsed time per iteration (s): 5.66 | learning rate: 8.113E-05 | global batch size: 1024 | lm loss: 2.292338E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.966 | TFLOPs: 40.46 | 63: iteration 3350/ 5494 | consumed samples: 3430400 | consumed tokens: 7025459200 | elapsed time per iteration (s): 5.60 | learning rate: 8.063E-05 | global batch size: 1024 | lm loss: 2.296746E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.894 | TFLOPs: 40.89 | 63: iteration 3360/ 5494 | consumed samples: 3440640 | consumed tokens: 7046430720 | elapsed time per iteration (s): 6.12 | learning rate: 8.014E-05 | global batch size: 1024 | lm loss: 2.307060E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 167.297 | TFLOPs: 37.40 | 63: iteration 3370/ 5494 | consumed samples: 3450880 | consumed tokens: 7067402240 | elapsed time per iteration (s): 5.68 | learning rate: 7.965E-05 | global batch size: 1024 | lm loss: 2.293987E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.234 | TFLOPs: 40.29 | 63: iteration 3380/ 5494 | consumed samples: 3461120 | consumed tokens: 7088373760 | elapsed time per iteration (s): 5.62 | learning rate: 7.916E-05 | global batch size: 1024 | lm loss: 2.292796E+00 | grad norm: 0.165 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.346 | TFLOPs: 40.77 | 63: iteration 3390/ 5494 | consumed samples: 3471360 | consumed tokens: 7109345280 | elapsed time per iteration (s): 5.67 | learning rate: 7.868E-05 | global batch size: 1024 | lm loss: 2.294478E+00 | grad norm: 0.169 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.518 | TFLOPs: 40.36 | 63: iteration 3400/ 5494 | consumed samples: 3481600 | consumed tokens: 7130316800 | elapsed time per iteration (s): 5.49 | learning rate: 7.819E-05 | global batch size: 1024 | lm loss: 2.301873E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.520 | TFLOPs: 41.70 | 63: iteration 3410/ 5494 | consumed samples: 3491840 | consumed tokens: 7151288320 | elapsed time per iteration (s): 5.86 | learning rate: 7.770E-05 | global batch size: 1024 | lm loss: 2.285900E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.657 | TFLOPs: 39.05 | 63: iteration 3420/ 5494 | consumed samples: 3502080 | consumed tokens: 7172259840 | elapsed time per iteration (s): 5.52 | learning rate: 7.722E-05 | global batch size: 1024 | lm loss: 2.296226E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.393 | TFLOPs: 41.45 | 63: iteration 3430/ 5494 | consumed samples: 3512320 | consumed tokens: 7193231360 | elapsed time per iteration (s): 5.85 | learning rate: 7.674E-05 | global batch size: 1024 | lm loss: 2.292828E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.069 | TFLOPs: 39.14 | 63: iteration 3440/ 5494 | consumed samples: 3522560 | consumed tokens: 7214202880 | elapsed time per iteration (s): 5.63 | learning rate: 7.625E-05 | global batch size: 1024 | lm loss: 2.289887E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.008 | TFLOPs: 40.69 | 63: iteration 3450/ 5494 | consumed samples: 3532800 | consumed tokens: 7235174400 | elapsed time per iteration (s): 5.77 | learning rate: 7.577E-05 | global batch size: 1024 | lm loss: 2.299403E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.440 | TFLOPs: 39.67 | 63: iteration 3460/ 5494 | consumed samples: 3543040 | consumed tokens: 7256145920 | elapsed time per iteration (s): 5.61 | learning rate: 7.529E-05 | global batch size: 1024 | lm loss: 2.282303E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.687 | TFLOPs: 40.84 | 63: iteration 3470/ 5494 | consumed samples: 3553280 | consumed tokens: 7277117440 | elapsed time per iteration (s): 5.79 | learning rate: 7.481E-05 | global batch size: 1024 | lm loss: 2.295016E+00 | grad norm: 0.163 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.871 | TFLOPs: 39.54 | 63: iteration 3480/ 5494 | consumed samples: 3563520 | consumed tokens: 7298088960 | elapsed time per iteration (s): 5.51 | learning rate: 7.433E-05 | global batch size: 1024 | lm loss: 2.279744E+00 | grad norm: 0.156 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.746 | TFLOPs: 41.53 | 63: iteration 3490/ 5494 | consumed samples: 3573760 | consumed tokens: 7319060480 | elapsed time per iteration (s): 5.91 | learning rate: 7.386E-05 | global batch size: 1024 | lm loss: 2.295111E+00 | grad norm: 0.166 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.322 | TFLOPs: 38.75 | 63: iteration 3500/ 5494 | consumed samples: 3584000 | consumed tokens: 7340032000 | elapsed time per iteration (s): 5.41 | learning rate: 7.338E-05 | global batch size: 1024 | lm loss: 2.269073E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.304 | TFLOPs: 42.32 | 63: iteration 3510/ 5494 | consumed samples: 3594240 | consumed tokens: 7361003520 | elapsed time per iteration (s): 5.62 | learning rate: 7.291E-05 | global batch size: 1024 | lm loss: 2.297261E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.111 | TFLOPs: 40.71 | 63: iteration 3520/ 5494 | consumed samples: 3604480 | consumed tokens: 7381975040 | elapsed time per iteration (s): 5.77 | learning rate: 7.243E-05 | global batch size: 1024 | lm loss: 2.267759E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.548 | TFLOPs: 39.69 | 63: iteration 3530/ 5494 | consumed samples: 3614720 | consumed tokens: 7402946560 | elapsed time per iteration (s): 5.52 | learning rate: 7.196E-05 | global batch size: 1024 | lm loss: 2.280539E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.349 | TFLOPs: 41.44 | 63: iteration 3540/ 5494 | consumed samples: 3624960 | consumed tokens: 7423918080 | elapsed time per iteration (s): 5.79 | learning rate: 7.149E-05 | global batch size: 1024 | lm loss: 2.294359E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.952 | TFLOPs: 39.56 | 63: iteration 3550/ 5494 | consumed samples: 3635200 | consumed tokens: 7444889600 | elapsed time per iteration (s): 5.65 | learning rate: 7.102E-05 | global batch size: 1024 | lm loss: 2.287841E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.243 | TFLOPs: 40.52 | 63: iteration 3560/ 5494 | consumed samples: 3645440 | consumed tokens: 7465861120 | elapsed time per iteration (s): 5.76 | learning rate: 7.056E-05 | global batch size: 1024 | lm loss: 2.278815E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.673 | TFLOPs: 39.72 | 63: iteration 3570/ 5494 | consumed samples: 3655680 | consumed tokens: 7486832640 | elapsed time per iteration (s): 5.54 | learning rate: 7.009E-05 | global batch size: 1024 | lm loss: 2.280932E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.844 | TFLOPs: 41.33 | 63: iteration 3580/ 5494 | consumed samples: 3665920 | consumed tokens: 7507804160 | elapsed time per iteration (s): 5.62 | learning rate: 6.962E-05 | global batch size: 1024 | lm loss: 2.251561E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.168 | TFLOPs: 40.73 | 63: iteration 3590/ 5494 | consumed samples: 3676160 | consumed tokens: 7528775680 | elapsed time per iteration (s): 5.43 | learning rate: 6.916E-05 | global batch size: 1024 | lm loss: 2.284701E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 188.754 | TFLOPs: 42.20 | 63: iteration 3600/ 5494 | consumed samples: 3686400 | consumed tokens: 7549747200 | elapsed time per iteration (s): 5.66 | learning rate: 6.870E-05 | global batch size: 1024 | lm loss: 2.272101E+00 | grad norm: 0.160 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.840 | TFLOPs: 40.43 | 63: iteration 3610/ 5494 | consumed samples: 3696640 | consumed tokens: 7570718720 | elapsed time per iteration (s): 5.69 | learning rate: 6.824E-05 | global batch size: 1024 | lm loss: 2.255931E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.122 | TFLOPs: 40.27 | 63: iteration 3620/ 5494 | consumed samples: 3706880 | consumed tokens: 7591690240 | elapsed time per iteration (s): 5.74 | learning rate: 6.778E-05 | global batch size: 1024 | lm loss: 2.272362E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.451 | TFLOPs: 39.90 | 63: iteration 3630/ 5494 | consumed samples: 3717120 | consumed tokens: 7612661760 | elapsed time per iteration (s): 5.81 | learning rate: 6.732E-05 | global batch size: 1024 | lm loss: 2.282742E+00 | grad norm: 0.162 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.274 | TFLOPs: 39.41 | 63: iteration 3640/ 5494 | consumed samples: 3727360 | consumed tokens: 7633633280 | elapsed time per iteration (s): 5.86 | learning rate: 6.686E-05 | global batch size: 1024 | lm loss: 2.262251E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.653 | TFLOPs: 39.05 | 63: iteration 3650/ 5494 | consumed samples: 3737600 | consumed tokens: 7654604800 | elapsed time per iteration (s): 5.51 | learning rate: 6.641E-05 | global batch size: 1024 | lm loss: 2.268705E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.757 | TFLOPs: 41.53 | 63: iteration 3660/ 5494 | consumed samples: 3747840 | consumed tokens: 7675576320 | elapsed time per iteration (s): 5.85 | learning rate: 6.595E-05 | global batch size: 1024 | lm loss: 2.263137E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.091 | TFLOPs: 39.14 | 63: iteration 3670/ 5494 | consumed samples: 3758080 | consumed tokens: 7696547840 | elapsed time per iteration (s): 5.69 | learning rate: 6.550E-05 | global batch size: 1024 | lm loss: 2.264649E+00 | grad norm: 0.169 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.956 | TFLOPs: 40.23 | 63: iteration 3680/ 5494 | consumed samples: 3768320 | consumed tokens: 7717519360 | elapsed time per iteration (s): 5.80 | learning rate: 6.505E-05 | global batch size: 1024 | lm loss: 2.265941E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.483 | TFLOPs: 39.46 | 63: iteration 3690/ 5494 | consumed samples: 3778560 | consumed tokens: 7738490880 | elapsed time per iteration (s): 5.64 | learning rate: 6.460E-05 | global batch size: 1024 | lm loss: 2.265310E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.569 | TFLOPs: 40.59 | 63: iteration 3700/ 5494 | consumed samples: 3788800 | consumed tokens: 7759462400 | elapsed time per iteration (s): 5.63 | learning rate: 6.415E-05 | global batch size: 1024 | lm loss: 2.269809E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.984 | TFLOPs: 40.69 | 63: iteration 3710/ 5494 | consumed samples: 3799040 | consumed tokens: 7780433920 | elapsed time per iteration (s): 5.66 | learning rate: 6.370E-05 | global batch size: 1024 | lm loss: 2.264246E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.981 | TFLOPs: 40.46 | 63: iteration 3720/ 5494 | consumed samples: 3809280 | consumed tokens: 7801405440 | elapsed time per iteration (s): 5.52 | learning rate: 6.326E-05 | global batch size: 1024 | lm loss: 2.257488E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.342 | TFLOPs: 41.44 | 63: iteration 3730/ 5494 | consumed samples: 3819520 | consumed tokens: 7822376960 | elapsed time per iteration (s): 5.69 | learning rate: 6.282E-05 | global batch size: 1024 | lm loss: 2.247926E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.939 | TFLOPs: 40.23 | 63: iteration 3740/ 5494 | consumed samples: 3829760 | consumed tokens: 7843348480 | elapsed time per iteration (s): 5.40 | learning rate: 6.237E-05 | global batch size: 1024 | lm loss: 2.240571E+00 | grad norm: 0.171 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.751 | TFLOPs: 42.42 | 63: iteration 3750/ 5494 | consumed samples: 3840000 | consumed tokens: 7864320000 | elapsed time per iteration (s): 5.74 | learning rate: 6.193E-05 | global batch size: 1024 | lm loss: 2.266223E+00 | grad norm: 0.150 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.338 | TFLOPs: 39.87 | 63: iteration 3760/ 5494 | consumed samples: 3850240 | consumed tokens: 7885291520 | elapsed time per iteration (s): 5.41 | learning rate: 6.149E-05 | global batch size: 1024 | lm loss: 2.269914E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.393 | TFLOPs: 42.34 | 63: iteration 3770/ 5494 | consumed samples: 3860480 | consumed tokens: 7906263040 | elapsed time per iteration (s): 5.64 | learning rate: 6.106E-05 | global batch size: 1024 | lm loss: 2.242963E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.639 | TFLOPs: 40.61 | 63: iteration 3780/ 5494 | consumed samples: 3870720 | consumed tokens: 7927234560 | elapsed time per iteration (s): 5.46 | learning rate: 6.062E-05 | global batch size: 1024 | lm loss: 2.250921E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 187.490 | TFLOPs: 41.92 | 63: iteration 3790/ 5494 | consumed samples: 3880960 | consumed tokens: 7948206080 | elapsed time per iteration (s): 5.93 | learning rate: 6.019E-05 | global batch size: 1024 | lm loss: 2.249605E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.743 | TFLOPs: 38.62 | 63: iteration 3800/ 5494 | consumed samples: 3891200 | consumed tokens: 7969177600 | elapsed time per iteration (s): 5.67 | learning rate: 5.976E-05 | global batch size: 1024 | lm loss: 2.258531E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.514 | TFLOPs: 40.36 | 63: iteration 3810/ 5494 | consumed samples: 3901440 | consumed tokens: 7990149120 | elapsed time per iteration (s): 5.51 | learning rate: 5.933E-05 | global batch size: 1024 | lm loss: 2.258613E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.727 | TFLOPs: 41.52 | 63: iteration 3820/ 5494 | consumed samples: 3911680 | consumed tokens: 8011120640 | elapsed time per iteration (s): 5.63 | learning rate: 5.890E-05 | global batch size: 1024 | lm loss: 2.255753E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.846 | TFLOPs: 40.65 | 63: iteration 3830/ 5494 | consumed samples: 3921920 | consumed tokens: 8032092160 | elapsed time per iteration (s): 5.53 | learning rate: 5.847E-05 | global batch size: 1024 | lm loss: 2.247771E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.338 | TFLOPs: 41.44 | 63: iteration 3840/ 5494 | consumed samples: 3932160 | consumed tokens: 8053063680 | elapsed time per iteration (s): 5.76 | learning rate: 5.804E-05 | global batch size: 1024 | lm loss: 2.251501E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.632 | TFLOPs: 39.71 | 63: iteration 3850/ 5494 | consumed samples: 3942400 | consumed tokens: 8074035200 | elapsed time per iteration (s): 5.63 | learning rate: 5.762E-05 | global batch size: 1024 | lm loss: 2.249227E+00 | grad norm: 0.158 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.839 | TFLOPs: 40.65 | 63: iteration 3860/ 5494 | consumed samples: 3952640 | consumed tokens: 8095006720 | elapsed time per iteration (s): 5.63 | learning rate: 5.720E-05 | global batch size: 1024 | lm loss: 2.260895E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.768 | TFLOPs: 40.64 | 63: iteration 3870/ 5494 | consumed samples: 3962880 | consumed tokens: 8115978240 | elapsed time per iteration (s): 6.25 | learning rate: 5.678E-05 | global batch size: 1024 | lm loss: 2.257469E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 163.971 | TFLOPs: 36.66 | 63: iteration 3880/ 5494 | consumed samples: 3973120 | consumed tokens: 8136949760 | elapsed time per iteration (s): 5.92 | learning rate: 5.636E-05 | global batch size: 1024 | lm loss: 2.253302E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.034 | TFLOPs: 38.68 | 63: iteration 3890/ 5494 | consumed samples: 3983360 | consumed tokens: 8157921280 | elapsed time per iteration (s): 6.13 | learning rate: 5.594E-05 | global batch size: 1024 | lm loss: 2.253806E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 167.153 | TFLOPs: 37.37 | 63: iteration 3900/ 5494 | consumed samples: 3993600 | consumed tokens: 8178892800 | elapsed time per iteration (s): 5.64 | learning rate: 5.553E-05 | global batch size: 1024 | lm loss: 2.252903E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.651 | TFLOPs: 40.61 | 63: iteration 3910/ 5494 | consumed samples: 4003840 | consumed tokens: 8199864320 | elapsed time per iteration (s): 6.01 | learning rate: 5.512E-05 | global batch size: 1024 | lm loss: 2.252629E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 170.384 | TFLOPs: 38.09 | 63: iteration 3920/ 5494 | consumed samples: 4014080 | consumed tokens: 8220835840 | elapsed time per iteration (s): 5.60 | learning rate: 5.471E-05 | global batch size: 1024 | lm loss: 2.247765E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.715 | TFLOPs: 40.85 | 63: iteration 3930/ 5494 | consumed samples: 4024320 | consumed tokens: 8241807360 | elapsed time per iteration (s): 5.55 | learning rate: 5.430E-05 | global batch size: 1024 | lm loss: 2.264535E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.533 | TFLOPs: 41.26 | 63: iteration 3940/ 5494 | consumed samples: 4034560 | consumed tokens: 8262778880 | elapsed time per iteration (s): 5.65 | learning rate: 5.389E-05 | global batch size: 1024 | lm loss: 2.254543E+00 | grad norm: 0.151 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.385 | TFLOPs: 40.55 | 63: iteration 3950/ 5494 | consumed samples: 4044800 | consumed tokens: 8283750400 | elapsed time per iteration (s): 5.52 | learning rate: 5.348E-05 | global batch size: 1024 | lm loss: 2.236345E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.464 | TFLOPs: 41.46 | 63: iteration 3960/ 5494 | consumed samples: 4055040 | consumed tokens: 8304721920 | elapsed time per iteration (s): 5.63 | learning rate: 5.308E-05 | global batch size: 1024 | lm loss: 2.247398E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.730 | TFLOPs: 40.63 | 63: iteration 3970/ 5494 | consumed samples: 4065280 | consumed tokens: 8325693440 | elapsed time per iteration (s): 5.73 | learning rate: 5.268E-05 | global batch size: 1024 | lm loss: 2.242032E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.838 | TFLOPs: 39.98 | 63: iteration 3980/ 5494 | consumed samples: 4075520 | consumed tokens: 8346664960 | elapsed time per iteration (s): 5.62 | learning rate: 5.228E-05 | global batch size: 1024 | lm loss: 2.241974E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.045 | TFLOPs: 40.70 | 63: iteration 3990/ 5494 | consumed samples: 4085760 | consumed tokens: 8367636480 | elapsed time per iteration (s): 5.63 | learning rate: 5.188E-05 | global batch size: 1024 | lm loss: 2.241450E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.820 | TFLOPs: 40.65 | 0: [2022-11-25 16:09:24,601] [INFO] [logging.py:68:log_dist] [Rank 0] step=4000, skipped=0, lr=[5.148437789292599e-05, 5.148437789292599e-05, 5.148437789292599e-05], mom=[(0.9, 0.999), (0.9, 0.999), (0.9, 0.999)] 63: iteration 4000/ 5494 | consumed samples: 4096000 | consumed tokens: 8388608000 | elapsed time per iteration (s): 5.54 | learning rate: 5.148E-05 | global batch size: 1024 | lm loss: 2.254141E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.985 | TFLOPs: 41.36 | 0: steps: 4000 loss: 2.2625 iter time (s): 5.719 samples/sec: 179.067 63: ------------------------------------------------------------------------------------------ 63: valid loss at iteration 4000 | lm loss value: 2.198155E+00 | lm loss PPL: 9.008375E+00 | 63: ------------------------------------------------------------------------------------------ 0: saving checkpoint at iteration 4000 to checkpoints_8b7 0: [2022-11-25 16:09:26,501] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step4000 is begin to save! 0: [2022-11-25 16:09:26,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_01-model_01-model_states.pt... 32: [2022-11-25 16:09:26,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_24-model_01-model_states.pt... 0: [2022-11-25 16:09:26,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_01-model_00-model_states.pt... 32: [2022-11-25 16:09:26,556] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_24-model_00-model_states.pt... 0: [2022-11-25 16:09:27,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_01-model_00-model_states.pt. 0: [2022-11-25 16:09:27,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_01-model_01-model_states.pt. 32: [2022-11-25 16:09:27,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_24-model_01-model_states.pt. 32: [2022-11-25 16:09:27,002] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_24-model_00-model_states.pt. 0: [2022-11-25 16:09:27,002] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_03-model_01-model_states.pt... 32: [2022-11-25 16:09:27,002] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_25-model_01-model_states.pt... 0: [2022-11-25 16:09:27,002] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_03-model_00-model_states.pt... 32: [2022-11-25 16:09:27,002] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_25-model_00-model_states.pt... 0: [2022-11-25 16:09:27,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_03-model_01-model_states.pt. 0: [2022-11-25 16:09:27,221] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_04-model_01-model_states.pt... 0: [2022-11-25 16:09:27,238] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_03-model_00-model_states.pt. 0: [2022-11-25 16:09:27,238] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_04-model_00-model_states.pt... 32: [2022-11-25 16:09:27,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_25-model_00-model_states.pt. 32: [2022-11-25 16:09:27,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_25-model_01-model_states.pt. 32: [2022-11-25 16:09:27,252] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_26-model_01-model_states.pt... 32: [2022-11-25 16:09:27,252] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_26-model_00-model_states.pt... 0: [2022-11-25 16:09:27,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_04-model_01-model_states.pt. 0: [2022-11-25 16:09:27,476] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_05-model_01-model_states.pt... 0: [2022-11-25 16:09:27,480] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_04-model_00-model_states.pt. 0: [2022-11-25 16:09:27,480] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_05-model_00-model_states.pt... 32: [2022-11-25 16:09:27,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_26-model_01-model_states.pt. 32: [2022-11-25 16:09:27,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_26-model_00-model_states.pt. 32: [2022-11-25 16:09:27,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_27-model_01-model_states.pt... 32: [2022-11-25 16:09:27,518] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_27-model_00-model_states.pt... 0: [2022-11-25 16:09:27,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_05-model_00-model_states.pt. 0: [2022-11-25 16:09:27,719] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_06-model_00-model_states.pt... 0: [2022-11-25 16:09:27,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_05-model_01-model_states.pt. 0: [2022-11-25 16:09:27,724] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_06-model_01-model_states.pt... 32: [2022-11-25 16:09:27,764] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_27-model_00-model_states.pt. 32: [2022-11-25 16:09:27,765] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_28-model_00-model_states.pt... 32: [2022-11-25 16:09:27,779] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_27-model_01-model_states.pt. 32: [2022-11-25 16:09:27,779] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_28-model_01-model_states.pt... 0: [2022-11-25 16:09:27,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_06-model_01-model_states.pt. 0: [2022-11-25 16:09:27,959] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_07-model_01-model_states.pt... 0: [2022-11-25 16:09:27,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_06-model_00-model_states.pt. 0: [2022-11-25 16:09:27,961] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_07-model_00-model_states.pt... 32: [2022-11-25 16:09:28,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_28-model_00-model_states.pt. 32: [2022-11-25 16:09:28,017] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_29-model_00-model_states.pt... 32: [2022-11-25 16:09:28,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_28-model_01-model_states.pt. 32: [2022-11-25 16:09:28,028] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_29-model_01-model_states.pt... 0: [2022-11-25 16:09:28,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_07-model_01-model_states.pt. 0: [2022-11-25 16:09:28,204] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_08-model_01-model_states.pt... 0: [2022-11-25 16:09:28,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_07-model_00-model_states.pt. 0: [2022-11-25 16:09:28,205] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_08-model_00-model_states.pt... 32: [2022-11-25 16:09:28,279] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_29-model_01-model_states.pt. 32: [2022-11-25 16:09:28,279] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_30-model_01-model_states.pt... 32: [2022-11-25 16:09:28,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_29-model_00-model_states.pt. 32: [2022-11-25 16:09:28,305] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_30-model_00-model_states.pt... 0: [2022-11-25 16:09:28,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_08-model_00-model_states.pt. 0: [2022-11-25 16:09:28,441] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_09-model_00-model_states.pt... 0: [2022-11-25 16:09:28,462] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_08-model_01-model_states.pt. 0: [2022-11-25 16:09:28,463] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_09-model_01-model_states.pt... 32: [2022-11-25 16:09:28,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_30-model_00-model_states.pt. 32: [2022-11-25 16:09:28,572] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_31-model_00-model_states.pt... 32: [2022-11-25 16:09:28,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_30-model_01-model_states.pt. 32: [2022-11-25 16:09:28,574] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_31-model_01-model_states.pt... 0: [2022-11-25 16:09:28,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_09-model_00-model_states.pt. 0: [2022-11-25 16:09:28,669] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_10-model_00-model_states.pt... 0: [2022-11-25 16:09:28,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_09-model_01-model_states.pt. 0: [2022-11-25 16:09:28,690] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_10-model_01-model_states.pt... 32: [2022-11-25 16:09:28,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_31-model_00-model_states.pt. 32: [2022-11-25 16:09:28,825] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_32-model_00-model_states.pt... 32: [2022-11-25 16:09:28,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_31-model_01-model_states.pt. 32: [2022-11-25 16:09:28,850] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_32-model_01-model_states.pt... 0: [2022-11-25 16:09:28,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_10-model_01-model_states.pt. 0: [2022-11-25 16:09:28,910] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_11-model_01-model_states.pt... 0: [2022-11-25 16:09:28,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_10-model_00-model_states.pt. 0: [2022-11-25 16:09:28,914] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_11-model_00-model_states.pt... 32: [2022-11-25 16:09:29,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_32-model_01-model_states.pt. 32: [2022-11-25 16:09:29,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_32-model_00-model_states.pt. 32: [2022-11-25 16:09:29,089] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_33-model_01-model_states.pt... 32: [2022-11-25 16:09:29,089] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_33-model_00-model_states.pt... 0: [2022-11-25 16:09:29,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_11-model_01-model_states.pt. 0: [2022-11-25 16:09:29,156] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_12-model_01-model_states.pt... 0: [2022-11-25 16:09:29,169] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_11-model_00-model_states.pt. 0: [2022-11-25 16:09:29,169] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_12-model_00-model_states.pt... 32: [2022-11-25 16:09:29,352] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_33-model_00-model_states.pt. 32: [2022-11-25 16:09:29,353] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_34-model_00-model_states.pt... 32: [2022-11-25 16:09:29,374] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_33-model_01-model_states.pt. 32: [2022-11-25 16:09:29,374] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_34-model_01-model_states.pt... 0: [2022-11-25 16:09:29,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_12-model_01-model_states.pt. 0: [2022-11-25 16:09:29,388] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_13-model_01-model_states.pt... 0: [2022-11-25 16:09:29,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_12-model_00-model_states.pt. 0: [2022-11-25 16:09:29,404] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_13-model_00-model_states.pt... 32: [2022-11-25 16:09:29,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_34-model_00-model_states.pt. 32: [2022-11-25 16:09:29,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_35-model_00-model_states.pt... 0: [2022-11-25 16:09:29,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_13-model_01-model_states.pt. 0: [2022-11-25 16:09:29,625] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_14-model_01-model_states.pt... 32: [2022-11-25 16:09:29,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_34-model_01-model_states.pt. 32: [2022-11-25 16:09:29,632] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_35-model_01-model_states.pt... 0: [2022-11-25 16:09:29,633] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_13-model_00-model_states.pt. 0: [2022-11-25 16:09:29,633] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_14-model_00-model_states.pt... 0: [2022-11-25 16:09:29,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_14-model_01-model_states.pt. 0: [2022-11-25 16:09:29,851] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_15-model_01-model_states.pt... 0: [2022-11-25 16:09:29,861] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_14-model_00-model_states.pt. 0: [2022-11-25 16:09:29,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_15-model_00-model_states.pt... 32: [2022-11-25 16:09:29,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_35-model_00-model_states.pt. 32: [2022-11-25 16:09:29,867] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_36-model_00-model_states.pt... 32: [2022-11-25 16:09:29,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_35-model_01-model_states.pt. 32: [2022-11-25 16:09:29,868] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_36-model_01-model_states.pt... 0: [2022-11-25 16:09:30,068] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_15-model_01-model_states.pt. 0: [2022-11-25 16:09:30,069] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_16-model_01-model_states.pt... 0: [2022-11-25 16:09:30,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_15-model_00-model_states.pt. 0: [2022-11-25 16:09:30,089] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_16-model_00-model_states.pt... 32: [2022-11-25 16:09:30,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_36-model_00-model_states.pt. 32: [2022-11-25 16:09:30,121] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_37-model_00-model_states.pt... 32: [2022-11-25 16:09:30,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_36-model_01-model_states.pt. 32: [2022-11-25 16:09:30,130] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_37-model_01-model_states.pt... 0: [2022-11-25 16:09:30,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_16-model_01-model_states.pt. 0: [2022-11-25 16:09:30,297] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_17-model_01-model_states.pt... 0: [2022-11-25 16:09:30,323] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_16-model_00-model_states.pt. 0: [2022-11-25 16:09:30,323] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_17-model_00-model_states.pt... 32: [2022-11-25 16:09:30,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_37-model_00-model_states.pt. 32: [2022-11-25 16:09:30,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_38-model_00-model_states.pt... 32: [2022-11-25 16:09:30,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_37-model_01-model_states.pt. 32: [2022-11-25 16:09:30,388] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_38-model_01-model_states.pt... 0: [2022-11-25 16:09:30,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_17-model_01-model_states.pt. 0: [2022-11-25 16:09:30,541] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_18-model_01-model_states.pt... 0: [2022-11-25 16:09:30,544] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_17-model_00-model_states.pt. 0: [2022-11-25 16:09:30,545] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_18-model_00-model_states.pt... 32: [2022-11-25 16:09:30,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_38-model_01-model_states.pt. 32: [2022-11-25 16:09:30,620] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_39-model_01-model_states.pt... 32: [2022-11-25 16:09:30,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_38-model_00-model_states.pt. 32: [2022-11-25 16:09:30,623] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_39-model_00-model_states.pt... 0: [2022-11-25 16:09:30,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_18-model_01-model_states.pt. 0: [2022-11-25 16:09:30,776] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_19-model_01-model_states.pt... 0: [2022-11-25 16:09:30,784] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_18-model_00-model_states.pt. 0: [2022-11-25 16:09:30,784] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_19-model_00-model_states.pt... 32: [2022-11-25 16:09:30,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_39-model_00-model_states.pt. 32: [2022-11-25 16:09:30,869] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_40-model_00-model_states.pt... 32: [2022-11-25 16:09:30,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_39-model_01-model_states.pt. 32: [2022-11-25 16:09:30,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_40-model_01-model_states.pt... 0: [2022-11-25 16:09:31,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_19-model_01-model_states.pt. 0: [2022-11-25 16:09:31,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_20-model_01-model_states.pt... 0: [2022-11-25 16:09:31,017] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_19-model_00-model_states.pt. 0: [2022-11-25 16:09:31,017] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_20-model_00-model_states.pt... 32: [2022-11-25 16:09:31,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_40-model_01-model_states.pt. 32: [2022-11-25 16:09:31,136] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_41-model_01-model_states.pt... 32: [2022-11-25 16:09:31,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_40-model_00-model_states.pt. 32: [2022-11-25 16:09:31,148] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_41-model_00-model_states.pt... 0: [2022-11-25 16:09:31,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_20-model_01-model_states.pt. 0: [2022-11-25 16:09:31,253] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_21-model_01-model_states.pt... 0: [2022-11-25 16:09:31,270] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_20-model_00-model_states.pt. 0: [2022-11-25 16:09:31,270] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_21-model_00-model_states.pt... 32: [2022-11-25 16:09:31,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_41-model_01-model_states.pt. 32: [2022-11-25 16:09:31,429] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_42-model_01-model_states.pt... 32: [2022-11-25 16:09:31,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_41-model_00-model_states.pt. 32: [2022-11-25 16:09:31,444] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_42-model_00-model_states.pt... 0: [2022-11-25 16:09:31,512] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_21-model_00-model_states.pt. 0: [2022-11-25 16:09:31,513] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_22-model_00-model_states.pt... 0: [2022-11-25 16:09:31,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_21-model_01-model_states.pt. 0: [2022-11-25 16:09:31,531] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_22-model_01-model_states.pt... 32: [2022-11-25 16:09:31,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_42-model_01-model_states.pt. 32: [2022-11-25 16:09:31,689] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_43-model_01-model_states.pt... 32: [2022-11-25 16:09:31,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_42-model_00-model_states.pt. 32: [2022-11-25 16:09:31,696] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_43-model_00-model_states.pt... 0: [2022-11-25 16:09:31,745] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_22-model_01-model_states.pt. 0: [2022-11-25 16:09:31,745] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_23-model_01-model_states.pt... 0: [2022-11-25 16:09:31,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_22-model_00-model_states.pt. 0: [2022-11-25 16:09:31,749] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_23-model_00-model_states.pt... 32: [2022-11-25 16:09:31,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_43-model_01-model_states.pt. 32: [2022-11-25 16:09:31,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_43-model_00-model_states.pt. 32: [2022-11-25 16:09:31,947] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_44-model_01-model_states.pt... 32: [2022-11-25 16:09:31,947] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_44-model_00-model_states.pt... 0: [2022-11-25 16:09:31,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_23-model_00-model_states.pt. 0: [2022-11-25 16:09:31,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_23-model_01-model_states.pt. 0: [2022-11-25 16:09:31,975] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: checkpoints_8b7/global_step4000/mp_rank_01_model_states.pt 0: [2022-11-25 16:09:31,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/mp_rank_01_model_states.pt... 0: [2022-11-25 16:09:31,975] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_8b7/global_step4000/mp_rank_00_model_states.pt 0: [2022-11-25 16:09:31,975] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/mp_rank_00_model_states.pt... 0: [2022-11-25 16:09:31,993] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/mp_rank_01_model_states.pt. 0: [2022-11-25 16:09:31,994] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/mp_rank_00_model_states.pt. 32: [2022-11-25 16:09:32,221] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_44-model_01-model_states.pt. 32: [2022-11-25 16:09:32,222] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_46-model_01-model_states.pt... 32: [2022-11-25 16:09:32,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_44-model_00-model_states.pt. 32: [2022-11-25 16:09:32,232] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/layer_46-model_00-model_states.pt... 32: [2022-11-25 16:09:32,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_46-model_01-model_states.pt. 32: [2022-11-25 16:09:32,234] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/mp_rank_03_model_states.pt... 32: [2022-11-25 16:09:32,235] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/layer_46-model_00-model_states.pt. 32: [2022-11-25 16:09:32,236] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/mp_rank_02_model_states.pt... 32: [2022-11-25 16:09:32,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/mp_rank_02_model_states.pt. 32: [2022-11-25 16:09:32,239] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/mp_rank_03_model_states.pt. 32: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt... 63: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt... 63: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt... 63: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt... 63: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt... 53: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt... 53: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt... 53: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt... 53: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt... 34: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt... 34: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt... 34: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt... 34: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt... 42: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt... 42: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt... 42: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt... 42: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt... 32: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt... 32: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt... 32: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt... 57: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt... 57: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt... 57: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt... 57: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt... 63: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt... 39: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt... 39: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt... 39: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt... 39: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt... 53: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt... 53: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt... 53: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt... 53: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt... 55: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt... 55: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt... 55: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt... 55: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt... 51: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt... 51: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt... 51: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt... 51: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt... 33: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt... 33: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt... 33: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt... 33: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt... 61: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt... 61: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt... 61: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt... 61: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt... 61: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt... 59: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt... 59: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt... 59: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt... 59: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt... 35: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt... 35: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt... 35: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt... 35: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt... 37: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt... 37: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt... 37: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt... 37: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt... 58: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt... 58: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt... 58: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt... 58: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt... 40: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt... 40: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt... 40: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt... 40: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt... 60: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt... 60: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt... 60: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt... 60: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt... 52: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt... 52: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt... 52: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt... 52: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt... 56: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt... 56: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt... 56: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt... 56: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt... 54: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt... 54: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt... 54: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt... 54: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt... 54: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt... 54: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt... 62: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt... 62: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt... 62: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt... 62: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt... 36: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt... 36: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt... 36: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt... 28: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 38: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt... 38: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt... 38: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt... 0: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 24: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 49: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt... 49: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt... 49: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt... 47: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt... 47: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt... 47: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt... 47: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt... 41: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt... 41: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt... 41: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt... 41: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt... 45: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt... 45: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt... 45: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt... 45: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt... 45: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt... 43: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt... 43: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt... 43: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt... 27: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 27: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 27: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 27: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 3: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 17: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 23: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 34: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt... 46: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt... 46: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt... 46: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt... 44: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt... 44: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt... 44: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt... 44: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt... 10: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 48: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt... 48: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt... 48: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt... 48: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt... 48: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt... 50: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt... 50: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt... 50: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt... 50: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt... 42: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt... 32: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt... 12: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 12: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 14: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 15: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 15: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 15: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 15: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 57: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt... 63: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt... 63: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt... 63: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt... 39: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt... 39: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt... 39: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt... 55: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt... 51: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt... 51: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt... 51: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt... 33: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt... 33: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt... 33: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt... 33: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt... 1: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 61: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt... 61: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt... 59: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt... 59: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt... 59: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt... 59: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt... 13: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 13: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 13: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 35: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt... 5: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 29: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 29: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 37: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt... 37: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt... 37: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt... 37: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt... 58: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt... 58: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt... 58: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt... 40: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt... 40: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt... 40: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt... 40: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt... 60: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt... 60: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt... 60: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt... 52: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt... 52: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt... 52: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt... 30: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 30: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 4: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 4: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 56: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt... 56: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt... 56: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt... 56: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt... 54: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt... 62: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt... 62: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt... 62: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt... 62: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt... 36: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt... 36: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt... 36: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt... 36: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt... 28: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 28: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 38: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt... 0: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 2: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 2: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 2: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 24: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 24: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 49: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt... 49: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt... 47: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt... 47: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt... 47: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt... 41: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt... 41: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt... 41: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt... 45: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt... 45: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt... 9: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 9: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 9: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 43: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt... 27: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 27: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 25: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 3: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 3: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 3: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 7: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 7: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 17: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 23: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 23: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 11: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 11: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 11: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 11: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 11: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 31: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 31: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 31: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 31: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 31: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 19: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 19: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 19: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 34: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt... 34: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt... 46: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt... 46: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt... 46: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt... 46: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt... 44: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt... 44: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt... 6: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 6: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 16: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 16: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 16: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 16: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 18: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 18: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 10: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 20: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 20: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 20: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 20: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 8: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 8: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 8: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 8: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 48: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt... 48: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt... 50: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt... 50: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt... 42: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt... 42: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt... 32: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt... 32: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt... 12: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 12: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 26: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 26: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 26: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 14: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 15: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 22: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 21: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 21: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 21: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 21: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 21: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 21: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 57: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt... 57: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt... 39: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt... 55: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt... 55: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt... 55: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt... 51: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt... 1: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 1: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 1: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 1: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 1: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 61: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt... 13: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 35: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt... 35: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt... 35: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt... 5: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 5: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 5: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 29: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 58: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt... 60: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt... 52: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt... 30: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 30: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 4: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 4: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 4: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 54: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt... 36: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt... 28: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 28: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 38: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt... 38: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt... 38: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt... 38: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt... 0: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 2: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 2: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 2: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 24: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 24: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 49: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt... 49: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt... 49: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt... 47: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt... 41: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt... 45: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt... 9: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 43: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt... 43: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt... 43: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt... 27: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 25: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 25: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 3: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 3: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 7: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 17: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 23: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 11: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 31: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 19: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 34: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt... 46: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt... 44: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt... 44: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt... 6: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 6: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 6: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 16: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 18: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 18: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 10: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 10: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 10: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 20: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 8: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 48: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt... 50: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt... 42: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt... 32: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt... 12: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 12: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 26: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 26: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 26: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 26: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 26: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 14: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 14: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 15: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 22: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 22: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 21: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 57: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt... 1: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 1: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 13: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 13: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 5: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 5: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 29: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 30: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 30: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 4: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 28: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 0: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 2: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 2: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 24: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 9: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 9: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 43: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt... 27: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 25: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 25: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 25: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 25: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 25: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 3: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 3: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 7: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 7: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 17: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 17: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 17: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 17: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 23: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 11: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 11: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 31: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 31: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 19: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 6: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 6: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 16: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 16: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 18: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 10: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 20: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 20: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 8: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 8: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 50: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt... 12: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 14: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 14: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 22: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 22: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 21: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 13: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 5: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 29: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 30: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 4: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 28: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 0: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 0: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 24: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 9: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 7: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 17: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 23: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 19: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 19: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 6: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 16: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 18: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 10: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 20: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 14: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 15: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 22: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 13: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 5: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 29: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 30: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 4: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 28: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 0: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 0: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 24: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 9: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 7: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 23: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 19: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 18: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 10: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 12: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 14: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 15: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 22: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 22: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 29: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 7: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 23: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 18: [2022-11-25 16:09:32,399] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 29: [2022-11-25 16:09:32,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step4000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 32: [2022-11-25 16:09:32,645] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt. 32: [2022-11-25 16:09:32,645] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt 32: [2022-11-25 16:09:32,645] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 32: [2022-11-25 16:09:32,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt. 32: [2022-11-25 16:09:32,657] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt 32: [2022-11-25 16:09:32,657] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: [2022-11-25 16:09:32,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-25 16:09:32,659] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-25 16:09:32,659] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: [2022-11-25 16:09:32,662] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 0: [2022-11-25 16:09:32,662] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 0: [2022-11-25 16:09:32,662] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 16: [2022-11-25 16:09:32,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 16: [2022-11-25 16:09:32,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 16: [2022-11-25 16:09:32,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 32: [2022-11-25 16:09:32,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt. 32: [2022-11-25 16:09:32,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt 32: [2022-11-25 16:09:32,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 16: [2022-11-25 16:09:32,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 16: [2022-11-25 16:09:32,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 16: [2022-11-25 16:09:32,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 57: [2022-11-25 16:09:32,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt. 57: [2022-11-25 16:09:32,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt 57: [2022-11-25 16:09:32,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 23: [2022-11-25 16:09:32,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 23: [2022-11-25 16:09:32,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 23: [2022-11-25 16:09:32,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 60: [2022-11-25 16:09:32,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt. 60: [2022-11-25 16:09:32,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt 60: [2022-11-25 16:09:32,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 28: [2022-11-25 16:09:32,679] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 28: [2022-11-25 16:09:32,679] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 28: [2022-11-25 16:09:32,679] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 60: [2022-11-25 16:09:32,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt. 60: [2022-11-25 16:09:32,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt 60: [2022-11-25 16:09:32,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 23: [2022-11-25 16:09:32,681] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 23: [2022-11-25 16:09:32,681] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 23: [2022-11-25 16:09:32,681] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 60: [2022-11-25 16:09:32,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt. 60: [2022-11-25 16:09:32,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt 25: [2022-11-25 16:09:32,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 60: [2022-11-25 16:09:32,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 25: [2022-11-25 16:09:32,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 25: [2022-11-25 16:09:32,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 44: [2022-11-25 16:09:32,682] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt. 44: [2022-11-25 16:09:32,682] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt 44: [2022-11-25 16:09:32,682] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 25: [2022-11-25 16:09:32,683] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 25: [2022-11-25 16:09:32,683] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 25: [2022-11-25 16:09:32,683] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 16:09:32,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt. 45: [2022-11-25 16:09:32,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt. 41: [2022-11-25 16:09:32,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt 45: [2022-11-25 16:09:32,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt 41: [2022-11-25 16:09:32,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 45: [2022-11-25 16:09:32,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 23: [2022-11-25 16:09:32,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 23: [2022-11-25 16:09:32,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 23: [2022-11-25 16:09:32,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 20: [2022-11-25 16:09:32,685] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 20: [2022-11-25 16:09:32,685] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 20: [2022-11-25 16:09:32,685] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 25: [2022-11-25 16:09:32,686] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 25: [2022-11-25 16:09:32,686] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 25: [2022-11-25 16:09:32,686] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 20: [2022-11-25 16:09:32,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 30: [2022-11-25 16:09:32,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 30: [2022-11-25 16:09:32,687] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 20: [2022-11-25 16:09:32,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 30: [2022-11-25 16:09:32,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 20: [2022-11-25 16:09:32,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 30: [2022-11-25 16:09:32,687] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 30: [2022-11-25 16:09:32,687] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 23: [2022-11-25 16:09:32,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 30: [2022-11-25 16:09:32,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 16:09:32,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt. 23: [2022-11-25 16:09:32,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 23: [2022-11-25 16:09:32,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 16:09:32,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt 41: [2022-11-25 16:09:32,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 26: [2022-11-25 16:09:32,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 26: [2022-11-25 16:09:32,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 26: [2022-11-25 16:09:32,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 62: [2022-11-25 16:09:32,688] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt. 62: [2022-11-25 16:09:32,688] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt 62: [2022-11-25 16:09:32,688] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 51: [2022-11-25 16:09:32,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt. 51: [2022-11-25 16:09:32,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt. 51: [2022-11-25 16:09:32,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt 51: [2022-11-25 16:09:32,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt 51: [2022-11-25 16:09:32,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 51: [2022-11-25 16:09:32,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: [2022-11-25 16:09:32,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-25 16:09:32,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 57: [2022-11-25 16:09:32,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt. 0: [2022-11-25 16:09:32,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 57: [2022-11-25 16:09:32,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt 57: [2022-11-25 16:09:32,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 28: [2022-11-25 16:09:32,689] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 28: [2022-11-25 16:09:32,689] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 28: [2022-11-25 16:09:32,689] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 57: [2022-11-25 16:09:32,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt. 57: [2022-11-25 16:09:32,690] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt 57: [2022-11-25 16:09:32,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: [2022-11-25 16:09:32,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 53: [2022-11-25 16:09:32,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt. 53: [2022-11-25 16:09:32,690] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt 53: [2022-11-25 16:09:32,690] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 37: [2022-11-25 16:09:32,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt. 9: [2022-11-25 16:09:32,690] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 37: [2022-11-25 16:09:32,691] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt 9: [2022-11-25 16:09:32,690] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 37: [2022-11-25 16:09:32,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 16:09:32,691] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 50: [2022-11-25 16:09:32,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt. 50: [2022-11-25 16:09:32,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt 45: [2022-11-25 16:09:32,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt. 50: [2022-11-25 16:09:32,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 45: [2022-11-25 16:09:32,693] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt 45: [2022-11-25 16:09:32,693] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 53: [2022-11-25 16:09:32,693] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt. 53: [2022-11-25 16:09:32,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt 53: [2022-11-25 16:09:32,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 32: [2022-11-25 16:09:32,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt. 32: [2022-11-25 16:09:32,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt 5: [2022-11-25 16:09:32,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 5: [2022-11-25 16:09:32,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 32: [2022-11-25 16:09:32,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 5: [2022-11-25 16:09:32,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 5: [2022-11-25 16:09:32,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 5: [2022-11-25 16:09:32,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 5: [2022-11-25 16:09:32,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 16:09:32,694] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 9: [2022-11-25 16:09:32,694] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 9: [2022-11-25 16:09:32,694] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 20: [2022-11-25 16:09:32,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 20: [2022-11-25 16:09:32,695] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 20: [2022-11-25 16:09:32,695] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 37: [2022-11-25 16:09:32,695] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt. 37: [2022-11-25 16:09:32,696] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt 37: [2022-11-25 16:09:32,696] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 30: [2022-11-25 16:09:32,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 30: [2022-11-25 16:09:32,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 21: [2022-11-25 16:09:32,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 21: [2022-11-25 16:09:32,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 30: [2022-11-25 16:09:32,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 28: [2022-11-25 16:09:32,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 21: [2022-11-25 16:09:32,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 21: [2022-11-25 16:09:32,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 21: [2022-11-25 16:09:32,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 21: [2022-11-25 16:09:32,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 28: [2022-11-25 16:09:32,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 28: [2022-11-25 16:09:32,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 61: [2022-11-25 16:09:32,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt. 61: [2022-11-25 16:09:32,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt 61: [2022-11-25 16:09:32,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 53: [2022-11-25 16:09:32,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt. 30: [2022-11-25 16:09:32,698] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 53: [2022-11-25 16:09:32,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt 53: [2022-11-25 16:09:32,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 30: [2022-11-25 16:09:32,698] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 30: [2022-11-25 16:09:32,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 16: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 33: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt. 33: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt. 16: [2022-11-25 16:09:32,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 33: [2022-11-25 16:09:32,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt 33: [2022-11-25 16:09:32,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt 16: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 33: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 33: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 16: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 2: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 2: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 2: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 35: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt. 35: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt. 16: [2022-11-25 16:09:32,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 35: [2022-11-25 16:09:32,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt 35: [2022-11-25 16:09:32,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt 16: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 2: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 2: [2022-11-25 16:09:32,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 2: [2022-11-25 16:09:32,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 2: [2022-11-25 16:09:32,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 35: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 35: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 2: [2022-11-25 16:09:32,699] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 2: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 2: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 2: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 2: [2022-11-25 16:09:32,699] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 50: [2022-11-25 16:09:32,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt. 50: [2022-11-25 16:09:32,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt 50: [2022-11-25 16:09:32,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 18: [2022-11-25 16:09:32,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 18: [2022-11-25 16:09:32,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 18: [2022-11-25 16:09:32,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 34: [2022-11-25 16:09:32,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt. 34: [2022-11-25 16:09:32,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt. 34: [2022-11-25 16:09:32,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt 34: [2022-11-25 16:09:32,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 18: [2022-11-25 16:09:32,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 34: [2022-11-25 16:09:32,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt 34: [2022-11-25 16:09:32,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 18: [2022-11-25 16:09:32,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 57: [2022-11-25 16:09:32,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt. 57: [2022-11-25 16:09:32,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt 18: [2022-11-25 16:09:32,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 57: [2022-11-25 16:09:32,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 44: [2022-11-25 16:09:32,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt. 62: [2022-11-25 16:09:32,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt. 44: [2022-11-25 16:09:32,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt 62: [2022-11-25 16:09:32,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt 44: [2022-11-25 16:09:32,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 20: [2022-11-25 16:09:32,701] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 62: [2022-11-25 16:09:32,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 20: [2022-11-25 16:09:32,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 20: [2022-11-25 16:09:32,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 25: [2022-11-25 16:09:32,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 25: [2022-11-25 16:09:32,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 19: [2022-11-25 16:09:32,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 25: [2022-11-25 16:09:32,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 19: [2022-11-25 16:09:32,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 19: [2022-11-25 16:09:32,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 19: [2022-11-25 16:09:32,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 16:09:32,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt. 47: [2022-11-25 16:09:32,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt. 47: [2022-11-25 16:09:32,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt 19: [2022-11-25 16:09:32,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 47: [2022-11-25 16:09:32,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt 19: [2022-11-25 16:09:32,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 16:09:32,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 16:09:32,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 50: [2022-11-25 16:09:32,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt. 50: [2022-11-25 16:09:32,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt 50: [2022-11-25 16:09:32,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 62: [2022-11-25 16:09:32,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt. 31: [2022-11-25 16:09:32,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 31: [2022-11-25 16:09:32,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 31: [2022-11-25 16:09:32,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 31: [2022-11-25 16:09:32,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 62: [2022-11-25 16:09:32,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt 31: [2022-11-25 16:09:32,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 31: [2022-11-25 16:09:32,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 62: [2022-11-25 16:09:32,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 31: [2022-11-25 16:09:32,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 31: [2022-11-25 16:09:32,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 31: [2022-11-25 16:09:32,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 31: [2022-11-25 16:09:32,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 31: [2022-11-25 16:09:32,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 31: [2022-11-25 16:09:32,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 53: [2022-11-25 16:09:32,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt. 53: [2022-11-25 16:09:32,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt 53: [2022-11-25 16:09:32,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 28: [2022-11-25 16:09:32,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 28: [2022-11-25 16:09:32,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 28: [2022-11-25 16:09:32,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 28: [2022-11-25 16:09:32,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 28: [2022-11-25 16:09:32,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 28: [2022-11-25 16:09:32,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 58: [2022-11-25 16:09:32,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt. 58: [2022-11-25 16:09:32,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt. 58: [2022-11-25 16:09:32,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt 58: [2022-11-25 16:09:32,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt 58: [2022-11-25 16:09:32,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 58: [2022-11-25 16:09:32,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 27: [2022-11-25 16:09:32,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 27: [2022-11-25 16:09:32,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 27: [2022-11-25 16:09:32,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 27: [2022-11-25 16:09:32,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 23: [2022-11-25 16:09:32,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 23: [2022-11-25 16:09:32,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 12: [2022-11-25 16:09:32,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 27: [2022-11-25 16:09:32,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 27: [2022-11-25 16:09:32,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 23: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 50: [2022-11-25 16:09:32,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt. 12: [2022-11-25 16:09:32,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 50: [2022-11-25 16:09:32,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt 12: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 50: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 12: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 60: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt. 60: [2022-11-25 16:09:32,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt 60: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 4: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 4: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 4: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 4: [2022-11-25 16:09:32,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 4: [2022-11-25 16:09:32,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 12: [2022-11-25 16:09:32,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 4: [2022-11-25 16:09:32,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 4: [2022-11-25 16:09:32,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 12: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 12: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 12: [2022-11-25 16:09:32,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 26: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 12: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 26: [2022-11-25 16:09:32,707] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 26: [2022-11-25 16:09:32,707] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 16:09:32,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt. 54: [2022-11-25 16:09:32,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt. 54: [2022-11-25 16:09:32,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt 54: [2022-11-25 16:09:32,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt 54: [2022-11-25 16:09:32,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 16:09:32,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 62: [2022-11-25 16:09:32,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt. 62: [2022-11-25 16:09:32,708] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt 62: [2022-11-25 16:09:32,708] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 27: [2022-11-25 16:09:32,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 6: [2022-11-25 16:09:32,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 6: [2022-11-25 16:09:32,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 8: [2022-11-25 16:09:32,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 8: [2022-11-25 16:09:32,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 8: [2022-11-25 16:09:32,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 27: [2022-11-25 16:09:32,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 27: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 6: [2022-11-25 16:09:32,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 6: [2022-11-25 16:09:32,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 8: [2022-11-25 16:09:32,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 8: [2022-11-25 16:09:32,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 8: [2022-11-25 16:09:32,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 8: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 30: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 8: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 8: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 30: [2022-11-25 16:09:32,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 6: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 6: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 1: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 1: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 30: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 7: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 7: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 1: [2022-11-25 16:09:32,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 1: [2022-11-25 16:09:32,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 1: [2022-11-25 16:09:32,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 7: [2022-11-25 16:09:32,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 7: [2022-11-25 16:09:32,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 7: [2022-11-25 16:09:32,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 7: [2022-11-25 16:09:32,711] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 16:09:32,710] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 1: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 16:09:32,710] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 43: [2022-11-25 16:09:32,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt. 43: [2022-11-25 16:09:32,711] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt. 43: [2022-11-25 16:09:32,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt 43: [2022-11-25 16:09:32,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt 43: [2022-11-25 16:09:32,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 43: [2022-11-25 16:09:32,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 11: [2022-11-25 16:09:32,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 11: [2022-11-25 16:09:32,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 11: [2022-11-25 16:09:32,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 11: [2022-11-25 16:09:32,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 11: [2022-11-25 16:09:32,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 38: [2022-11-25 16:09:32,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt. 11: [2022-11-25 16:09:32,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 11: [2022-11-25 16:09:32,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 38: [2022-11-25 16:09:32,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt 11: [2022-11-25 16:09:32,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 38: [2022-11-25 16:09:32,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 11: [2022-11-25 16:09:32,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 55: [2022-11-25 16:09:32,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt. 55: [2022-11-25 16:09:32,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt. 55: [2022-11-25 16:09:32,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt. 55: [2022-11-25 16:09:32,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt. 55: [2022-11-25 16:09:32,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt 55: [2022-11-25 16:09:32,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt 55: [2022-11-25 16:09:32,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt 55: [2022-11-25 16:09:32,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 55: [2022-11-25 16:09:32,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt 55: [2022-11-25 16:09:32,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 61: [2022-11-25 16:09:32,713] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt. 55: [2022-11-25 16:09:32,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 55: [2022-11-25 16:09:32,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 61: [2022-11-25 16:09:32,713] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt 61: [2022-11-25 16:09:32,713] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 38: [2022-11-25 16:09:32,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt. 38: [2022-11-25 16:09:32,714] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt 38: [2022-11-25 16:09:32,714] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 61: [2022-11-25 16:09:32,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt. 52: [2022-11-25 16:09:32,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt. 52: [2022-11-25 16:09:32,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt 61: [2022-11-25 16:09:32,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt 52: [2022-11-25 16:09:32,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 61: [2022-11-25 16:09:32,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 52: [2022-11-25 16:09:32,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt. 52: [2022-11-25 16:09:32,715] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt. 52: [2022-11-25 16:09:32,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt 52: [2022-11-25 16:09:32,715] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt 52: [2022-11-25 16:09:32,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 52: [2022-11-25 16:09:32,715] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 29: [2022-11-25 16:09:32,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 29: [2022-11-25 16:09:32,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 29: [2022-11-25 16:09:32,718] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 29: [2022-11-25 16:09:32,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 29: [2022-11-25 16:09:32,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 29: [2022-11-25 16:09:32,718] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 29: [2022-11-25 16:09:32,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 29: [2022-11-25 16:09:32,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 29: [2022-11-25 16:09:32,718] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 6: [2022-11-25 16:09:32,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 6: [2022-11-25 16:09:32,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 6: [2022-11-25 16:09:32,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 18: [2022-11-25 16:09:32,720] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 18: [2022-11-25 16:09:32,720] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 18: [2022-11-25 16:09:32,720] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 26: [2022-11-25 16:09:32,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 26: [2022-11-25 16:09:32,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 26: [2022-11-25 16:09:32,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 36: [2022-11-25 16:09:32,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt. 36: [2022-11-25 16:09:32,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt. 36: [2022-11-25 16:09:32,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt 36: [2022-11-25 16:09:32,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt 36: [2022-11-25 16:09:32,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 36: [2022-11-25 16:09:32,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 22: [2022-11-25 16:09:32,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 22: [2022-11-25 16:09:32,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 22: [2022-11-25 16:09:32,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 22: [2022-11-25 16:09:32,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 22: [2022-11-25 16:09:32,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 22: [2022-11-25 16:09:32,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 22: [2022-11-25 16:09:32,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 22: [2022-11-25 16:09:32,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 22: [2022-11-25 16:09:32,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 17: [2022-11-25 16:09:32,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 17: [2022-11-25 16:09:32,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 17: [2022-11-25 16:09:32,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 17: [2022-11-25 16:09:32,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 48: [2022-11-25 16:09:32,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt. 48: [2022-11-25 16:09:32,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt. 48: [2022-11-25 16:09:32,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt. 38: [2022-11-25 16:09:32,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt. 48: [2022-11-25 16:09:32,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt 48: [2022-11-25 16:09:32,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt 48: [2022-11-25 16:09:32,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt 17: [2022-11-25 16:09:32,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 17: [2022-11-25 16:09:32,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 17: [2022-11-25 16:09:32,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 48: [2022-11-25 16:09:32,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 38: [2022-11-25 16:09:32,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt 17: [2022-11-25 16:09:32,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 48: [2022-11-25 16:09:32,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 63: [2022-11-25 16:09:32,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt. 63: [2022-11-25 16:09:32,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt. 63: [2022-11-25 16:09:32,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt. 38: [2022-11-25 16:09:32,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 17: [2022-11-25 16:09:32,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 48: [2022-11-25 16:09:32,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 63: [2022-11-25 16:09:32,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt 63: [2022-11-25 16:09:32,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt 63: [2022-11-25 16:09:32,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt 63: [2022-11-25 16:09:32,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 63: [2022-11-25 16:09:32,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 63: [2022-11-25 16:09:32,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 13: [2022-11-25 16:09:32,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 13: [2022-11-25 16:09:32,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 13: [2022-11-25 16:09:32,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 13: [2022-11-25 16:09:32,726] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 13: [2022-11-25 16:09:32,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 13: [2022-11-25 16:09:32,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 13: [2022-11-25 16:09:32,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 1: [2022-11-25 16:09:32,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 13: [2022-11-25 16:09:32,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 13: [2022-11-25 16:09:32,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 13: [2022-11-25 16:09:32,726] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 13: [2022-11-25 16:09:32,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 16:09:32,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 13: [2022-11-25 16:09:32,726] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 16:09:32,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 25: [2022-11-25 16:09:32,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 25: [2022-11-25 16:09:32,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 25: [2022-11-25 16:09:32,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 59: [2022-11-25 16:09:32,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt. 59: [2022-11-25 16:09:32,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt 59: [2022-11-25 16:09:32,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt. 59: [2022-11-25 16:09:32,731] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt. 59: [2022-11-25 16:09:32,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 59: [2022-11-25 16:09:32,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt 59: [2022-11-25 16:09:32,731] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt 59: [2022-11-25 16:09:32,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 59: [2022-11-25 16:09:32,731] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 26: [2022-11-25 16:09:32,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 26: [2022-11-25 16:09:32,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 26: [2022-11-25 16:09:32,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 26: [2022-11-25 16:09:32,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 26: [2022-11-25 16:09:32,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 26: [2022-11-25 16:09:32,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 39: [2022-11-25 16:09:32,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt. 39: [2022-11-25 16:09:32,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt. 39: [2022-11-25 16:09:32,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt. 39: [2022-11-25 16:09:32,734] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt. 39: [2022-11-25 16:09:32,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt 39: [2022-11-25 16:09:32,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt 39: [2022-11-25 16:09:32,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 39: [2022-11-25 16:09:32,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt 39: [2022-11-25 16:09:32,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt 39: [2022-11-25 16:09:32,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 39: [2022-11-25 16:09:32,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 39: [2022-11-25 16:09:32,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 16:09:32,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 4: [2022-11-25 16:09:32,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 4: [2022-11-25 16:09:32,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 53: [2022-11-25 16:09:32,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt. 53: [2022-11-25 16:09:32,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt 53: [2022-11-25 16:09:32,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 49: [2022-11-25 16:09:32,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt. 49: [2022-11-25 16:09:32,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt. 49: [2022-11-25 16:09:32,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt. 49: [2022-11-25 16:09:32,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt. 49: [2022-11-25 16:09:32,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt 49: [2022-11-25 16:09:32,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt 49: [2022-11-25 16:09:32,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt 49: [2022-11-25 16:09:32,743] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt 49: [2022-11-25 16:09:32,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 49: [2022-11-25 16:09:32,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 49: [2022-11-25 16:09:32,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 49: [2022-11-25 16:09:32,743] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 62: [2022-11-25 16:09:32,743] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt. 62: [2022-11-25 16:09:32,744] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt 62: [2022-11-25 16:09:32,744] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 31: [2022-11-25 16:09:32,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 31: [2022-11-25 16:09:32,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 31: [2022-11-25 16:09:32,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: [2022-11-25 16:09:32,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-25 16:09:32,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 13: [2022-11-25 16:09:32,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 13: [2022-11-25 16:09:32,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 13: [2022-11-25 16:09:32,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 60: [2022-11-25 16:09:32,748] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt. 60: [2022-11-25 16:09:32,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt 60: [2022-11-25 16:09:32,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 16: [2022-11-25 16:09:32,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 16: [2022-11-25 16:09:32,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 16: [2022-11-25 16:09:32,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 3: [2022-11-25 16:09:32,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 3: [2022-11-25 16:09:32,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 3: [2022-11-25 16:09:32,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 3: [2022-11-25 16:09:32,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 3: [2022-11-25 16:09:32,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 3: [2022-11-25 16:09:32,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 3: [2022-11-25 16:09:32,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 3: [2022-11-25 16:09:32,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 3: [2022-11-25 16:09:32,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 2: [2022-11-25 16:09:32,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 2: [2022-11-25 16:09:32,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 2: [2022-11-25 16:09:32,754] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 50: [2022-11-25 16:09:32,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt. 50: [2022-11-25 16:09:32,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt 50: [2022-11-25 16:09:32,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 42: [2022-11-25 16:09:32,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt. 42: [2022-11-25 16:09:32,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt. 42: [2022-11-25 16:09:32,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt. 42: [2022-11-25 16:09:32,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt. 42: [2022-11-25 16:09:32,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt 42: [2022-11-25 16:09:32,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt 42: [2022-11-25 16:09:32,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt 42: [2022-11-25 16:09:32,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt 42: [2022-11-25 16:09:32,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 42: [2022-11-25 16:09:32,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 42: [2022-11-25 16:09:32,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 42: [2022-11-25 16:09:32,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 56: [2022-11-25 16:09:32,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt. 56: [2022-11-25 16:09:32,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt. 56: [2022-11-25 16:09:32,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt. 56: [2022-11-25 16:09:32,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt 56: [2022-11-25 16:09:32,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt 56: [2022-11-25 16:09:32,760] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt. 56: [2022-11-25 16:09:32,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 56: [2022-11-25 16:09:32,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 56: [2022-11-25 16:09:32,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt 56: [2022-11-25 16:09:32,761] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt 56: [2022-11-25 16:09:32,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 56: [2022-11-25 16:09:32,761] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 55: [2022-11-25 16:09:32,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt. 55: [2022-11-25 16:09:32,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt 55: [2022-11-25 16:09:32,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 46: [2022-11-25 16:09:32,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt. 46: [2022-11-25 16:09:32,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt. 46: [2022-11-25 16:09:32,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt. 46: [2022-11-25 16:09:32,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt 46: [2022-11-25 16:09:32,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt 46: [2022-11-25 16:09:32,771] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt 46: [2022-11-25 16:09:32,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 46: [2022-11-25 16:09:32,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 46: [2022-11-25 16:09:32,771] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 46: [2022-11-25 16:09:32,771] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt. 46: [2022-11-25 16:09:32,772] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt 46: [2022-11-25 16:09:32,772] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 27: [2022-11-25 16:09:32,775] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 27: [2022-11-25 16:09:32,775] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 27: [2022-11-25 16:09:32,775] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 20: [2022-11-25 16:09:32,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 20: [2022-11-25 16:09:32,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 20: [2022-11-25 16:09:32,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 3: [2022-11-25 16:09:32,783] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 3: [2022-11-25 16:09:32,783] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 3: [2022-11-25 16:09:32,783] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 15: [2022-11-25 16:09:32,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 15: [2022-11-25 16:09:32,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 15: [2022-11-25 16:09:32,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 42: [2022-11-25 16:09:32,786] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt. 15: [2022-11-25 16:09:32,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 15: [2022-11-25 16:09:32,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 42: [2022-11-25 16:09:32,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt 15: [2022-11-25 16:09:32,786] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 42: [2022-11-25 16:09:32,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 15: [2022-11-25 16:09:32,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 15: [2022-11-25 16:09:32,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 15: [2022-11-25 16:09:32,786] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 3: [2022-11-25 16:09:32,788] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 3: [2022-11-25 16:09:32,788] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 3: [2022-11-25 16:09:32,788] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 15: [2022-11-25 16:09:32,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 15: [2022-11-25 16:09:32,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 15: [2022-11-25 16:09:32,791] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 15: [2022-11-25 16:09:32,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 15: [2022-11-25 16:09:32,791] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 15: [2022-11-25 16:09:32,791] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 49: [2022-11-25 16:09:32,794] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt. 49: [2022-11-25 16:09:32,794] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt 49: [2022-11-25 16:09:32,794] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 40: [2022-11-25 16:09:32,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt. 40: [2022-11-25 16:09:32,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt. 40: [2022-11-25 16:09:32,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt. 40: [2022-11-25 16:09:32,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt. 40: [2022-11-25 16:09:32,800] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt. 40: [2022-11-25 16:09:32,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt 40: [2022-11-25 16:09:32,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt 40: [2022-11-25 16:09:32,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt 40: [2022-11-25 16:09:32,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt 40: [2022-11-25 16:09:32,801] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt 40: [2022-11-25 16:09:32,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 40: [2022-11-25 16:09:32,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 40: [2022-11-25 16:09:32,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 40: [2022-11-25 16:09:32,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 40: [2022-11-25 16:09:32,801] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 46: [2022-11-25 16:09:32,802] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt. 46: [2022-11-25 16:09:32,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt 46: [2022-11-25 16:09:32,803] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 14: [2022-11-25 16:09:32,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 14: [2022-11-25 16:09:32,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 14: [2022-11-25 16:09:32,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 14: [2022-11-25 16:09:32,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 14: [2022-11-25 16:09:32,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 14: [2022-11-25 16:09:32,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 14: [2022-11-25 16:09:32,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 14: [2022-11-25 16:09:32,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 14: [2022-11-25 16:09:32,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 14: [2022-11-25 16:09:32,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 14: [2022-11-25 16:09:32,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 14: [2022-11-25 16:09:32,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 14: [2022-11-25 16:09:32,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 14: [2022-11-25 16:09:32,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 14: [2022-11-25 16:09:32,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 27: [2022-11-25 16:09:32,824] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 27: [2022-11-25 16:09:32,824] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 27: [2022-11-25 16:09:32,824] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 56: [2022-11-25 16:09:32,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt. 56: [2022-11-25 16:09:32,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt 56: [2022-11-25 16:09:32,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 43: [2022-11-25 16:09:32,836] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt. 43: [2022-11-25 16:09:32,836] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt 43: [2022-11-25 16:09:32,836] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: [2022-11-25 16:09:32,838] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 0: [2022-11-25 16:09:32,838] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 0: [2022-11-25 16:09:32,838] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 38: [2022-11-25 16:09:32,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt. 38: [2022-11-25 16:09:32,840] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt 38: [2022-11-25 16:09:32,840] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 10: [2022-11-25 16:09:32,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 10: [2022-11-25 16:09:32,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 10: [2022-11-25 16:09:32,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 10: [2022-11-25 16:09:32,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 10: [2022-11-25 16:09:32,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 10: [2022-11-25 16:09:32,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 10: [2022-11-25 16:09:32,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 10: [2022-11-25 16:09:32,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 10: [2022-11-25 16:09:32,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 10: [2022-11-25 16:09:32,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 10: [2022-11-25 16:09:32,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 10: [2022-11-25 16:09:32,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 10: [2022-11-25 16:09:32,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 10: [2022-11-25 16:09:32,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 10: [2022-11-25 16:09:32,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 57: [2022-11-25 16:09:32,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt. 57: [2022-11-25 16:09:32,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt 57: [2022-11-25 16:09:32,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 26: [2022-11-25 16:09:32,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 26: [2022-11-25 16:09:32,872] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 26: [2022-11-25 16:09:32,872] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 24: [2022-11-25 16:09:32,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 24: [2022-11-25 16:09:32,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 24: [2022-11-25 16:09:32,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 24: [2022-11-25 16:09:32,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 24: [2022-11-25 16:09:32,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 24: [2022-11-25 16:09:32,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 24: [2022-11-25 16:09:32,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 24: [2022-11-25 16:09:32,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 24: [2022-11-25 16:09:32,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 24: [2022-11-25 16:09:32,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 24: [2022-11-25 16:09:32,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 24: [2022-11-25 16:09:32,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 24: [2022-11-25 16:09:32,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 24: [2022-11-25 16:09:32,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 24: [2022-11-25 16:09:32,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 19: [2022-11-25 16:09:32,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 19: [2022-11-25 16:09:32,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 19: [2022-11-25 16:09:32,889] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 59: [2022-11-25 16:09:32,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt. 59: [2022-11-25 16:09:32,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt 59: [2022-11-25 16:09:32,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 35: [2022-11-25 16:09:32,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt. 35: [2022-11-25 16:09:32,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt 35: [2022-11-25 16:09:32,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 18: [2022-11-25 16:09:32,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 18: [2022-11-25 16:09:32,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 18: [2022-11-25 16:09:32,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 39: [2022-11-25 16:09:32,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt. 39: [2022-11-25 16:09:32,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt 39: [2022-11-25 16:09:32,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 6: [2022-11-25 16:09:32,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 6: [2022-11-25 16:09:32,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 6: [2022-11-25 16:09:32,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 52: [2022-11-25 16:09:32,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt. 52: [2022-11-25 16:09:32,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt 52: [2022-11-25 16:09:32,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 63: [2022-11-25 16:09:32,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt. 63: [2022-11-25 16:09:32,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt 63: [2022-11-25 16:09:32,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 48: [2022-11-25 16:09:32,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt. 48: [2022-11-25 16:09:32,930] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt 48: [2022-11-25 16:09:32,930] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 53: [2022-11-25 16:09:32,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt. 53: [2022-11-25 16:09:32,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt 53: [2022-11-25 16:09:32,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 30: [2022-11-25 16:09:32,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 30: [2022-11-25 16:09:32,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 30: [2022-11-25 16:09:32,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 33: [2022-11-25 16:09:32,941] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt. 33: [2022-11-25 16:09:32,941] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt 33: [2022-11-25 16:09:32,941] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 11: [2022-11-25 16:09:32,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 11: [2022-11-25 16:09:32,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 11: [2022-11-25 16:09:32,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 32: [2022-11-25 16:09:32,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt. 32: [2022-11-25 16:09:32,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt 32: [2022-11-25 16:09:32,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 5: [2022-11-25 16:09:32,956] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 5: [2022-11-25 16:09:32,956] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 5: [2022-11-25 16:09:32,956] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 16:09:32,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 9: [2022-11-25 16:09:32,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 9: [2022-11-25 16:09:32,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 8: [2022-11-25 16:09:32,959] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 8: [2022-11-25 16:09:32,959] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 8: [2022-11-25 16:09:32,959] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 45: [2022-11-25 16:09:32,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt. 45: [2022-11-25 16:09:32,961] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt 45: [2022-11-25 16:09:32,961] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 37: [2022-11-25 16:09:32,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt. 37: [2022-11-25 16:09:32,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt 37: [2022-11-25 16:09:32,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 44: [2022-11-25 16:09:32,964] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt. 44: [2022-11-25 16:09:32,964] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt 44: [2022-11-25 16:09:32,964] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 12: [2022-11-25 16:09:32,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 21: [2022-11-25 16:09:32,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 12: [2022-11-25 16:09:32,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 12: [2022-11-25 16:09:32,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 21: [2022-11-25 16:09:32,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 21: [2022-11-25 16:09:32,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 16:09:32,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt. 47: [2022-11-25 16:09:32,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt 47: [2022-11-25 16:09:32,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 16:09:32,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt. 41: [2022-11-25 16:09:32,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt 41: [2022-11-25 16:09:32,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 16:09:32,966] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt. 54: [2022-11-25 16:09:32,966] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt 54: [2022-11-25 16:09:32,966] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 7: [2022-11-25 16:09:32,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 58: [2022-11-25 16:09:32,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt. 7: [2022-11-25 16:09:32,970] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 58: [2022-11-25 16:09:32,970] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt 58: [2022-11-25 16:09:32,970] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 7: [2022-11-25 16:09:32,970] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 34: [2022-11-25 16:09:32,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt. 34: [2022-11-25 16:09:32,973] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt 34: [2022-11-25 16:09:32,973] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 36: [2022-11-25 16:09:32,975] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt. 36: [2022-11-25 16:09:32,975] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt 36: [2022-11-25 16:09:32,975] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 22: [2022-11-25 16:09:32,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 22: [2022-11-25 16:09:32,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 22: [2022-11-25 16:09:32,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 17: [2022-11-25 16:09:32,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 17: [2022-11-25 16:09:32,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 17: [2022-11-25 16:09:32,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 61: [2022-11-25 16:09:32,992] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt. 61: [2022-11-25 16:09:32,993] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt 61: [2022-11-25 16:09:32,993] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 29: [2022-11-25 16:09:32,995] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 29: [2022-11-25 16:09:32,995] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 29: [2022-11-25 16:09:32,995] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 31: [2022-11-25 16:09:33,003] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 31: [2022-11-25 16:09:33,003] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 31: [2022-11-25 16:09:33,003] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 25: [2022-11-25 16:09:33,007] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 25: [2022-11-25 16:09:33,007] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 25: [2022-11-25 16:09:33,007] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 28: [2022-11-25 16:09:33,008] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 28: [2022-11-25 16:09:33,008] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 28: [2022-11-25 16:09:33,008] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 15: [2022-11-25 16:09:33,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 15: [2022-11-25 16:09:33,011] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 15: [2022-11-25 16:09:33,012] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 10: [2022-11-25 16:09:33,012] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 10: [2022-11-25 16:09:33,013] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 10: [2022-11-25 16:09:33,013] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 3: [2022-11-25 16:09:33,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 3: [2022-11-25 16:09:33,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 3: [2022-11-25 16:09:33,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 23: [2022-11-25 16:09:33,022] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 23: [2022-11-25 16:09:33,022] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 23: [2022-11-25 16:09:33,022] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 16:09:33,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 1: [2022-11-25 16:09:33,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 14: [2022-11-25 16:09:33,027] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 14: [2022-11-25 16:09:33,027] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 1: [2022-11-25 16:09:33,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 14: [2022-11-25 16:09:33,027] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 16:09:33,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 4: [2022-11-25 16:09:33,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 4: [2022-11-25 16:09:33,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 50: [2022-11-25 16:09:33,028] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt. 50: [2022-11-25 16:09:33,028] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt 50: [2022-11-25 16:09:33,028] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 51: [2022-11-25 16:09:33,029] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt. 51: [2022-11-25 16:09:33,029] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt 51: [2022-11-25 16:09:33,030] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 60: [2022-11-25 16:09:33,031] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt. 60: [2022-11-25 16:09:33,031] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt 60: [2022-11-25 16:09:33,031] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 16: [2022-11-25 16:09:33,036] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 16: [2022-11-25 16:09:33,036] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 16: [2022-11-25 16:09:33,036] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 62: [2022-11-25 16:09:33,037] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt. 62: [2022-11-25 16:09:33,037] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt 62: [2022-11-25 16:09:33,037] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 27: [2022-11-25 16:09:33,046] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 27: [2022-11-25 16:09:33,046] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 27: [2022-11-25 16:09:33,046] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 2: [2022-11-25 16:09:33,048] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 2: [2022-11-25 16:09:33,048] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 2: [2022-11-25 16:09:33,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 13: [2022-11-25 16:09:33,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 13: [2022-11-25 16:09:33,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 13: [2022-11-25 16:09:33,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 42: [2022-11-25 16:09:33,050] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt. 42: [2022-11-25 16:09:33,050] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt 42: [2022-11-25 16:09:33,050] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 49: [2022-11-25 16:09:33,054] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt. 49: [2022-11-25 16:09:33,054] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt 49: [2022-11-25 16:09:33,054] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 55: [2022-11-25 16:09:33,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt. 55: [2022-11-25 16:09:33,058] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt 24: [2022-11-25 16:09:33,058] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 55: [2022-11-25 16:09:33,058] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 24: [2022-11-25 16:09:33,059] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 24: [2022-11-25 16:09:33,059] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 40: [2022-11-25 16:09:33,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt. 40: [2022-11-25 16:09:33,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt 40: [2022-11-25 16:09:33,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 46: [2022-11-25 16:09:33,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt. 46: [2022-11-25 16:09:33,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt 46: [2022-11-25 16:09:33,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 38: [2022-11-25 16:09:33,069] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt. 38: [2022-11-25 16:09:33,069] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt 38: [2022-11-25 16:09:33,069] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 56: [2022-11-25 16:09:33,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt. 56: [2022-11-25 16:09:33,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt 56: [2022-11-25 16:09:33,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 39: [2022-11-25 16:09:33,079] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt. 39: [2022-11-25 16:09:33,079] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt 39: [2022-11-25 16:09:33,079] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 57: [2022-11-25 16:09:33,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt. 57: [2022-11-25 16:09:33,087] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt 57: [2022-11-25 16:09:33,087] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: [2022-11-25 16:09:33,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-25 16:09:33,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-25 16:09:33,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 43: [2022-11-25 16:09:33,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt. 43: [2022-11-25 16:09:33,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt 43: [2022-11-25 16:09:33,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 26: [2022-11-25 16:09:33,091] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 26: [2022-11-25 16:09:33,091] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 26: [2022-11-25 16:09:33,091] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 20: [2022-11-25 16:09:33,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 20: [2022-11-25 16:09:33,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 20: [2022-11-25 16:09:33,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 19: [2022-11-25 16:09:33,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 19: [2022-11-25 16:09:33,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 19: [2022-11-25 16:09:33,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 18: [2022-11-25 16:09:33,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 18: [2022-11-25 16:09:33,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 18: [2022-11-25 16:09:33,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 53: [2022-11-25 16:09:33,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt. 53: [2022-11-25 16:09:33,100] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt 53: [2022-11-25 16:09:33,100] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 59: [2022-11-25 16:09:33,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt. 59: [2022-11-25 16:09:33,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt 59: [2022-11-25 16:09:33,104] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 63: [2022-11-25 16:09:33,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt. 63: [2022-11-25 16:09:33,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt 63: [2022-11-25 16:09:33,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 30: [2022-11-25 16:09:33,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 30: [2022-11-25 16:09:33,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 30: [2022-11-25 16:09:33,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 6: [2022-11-25 16:09:33,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 6: [2022-11-25 16:09:33,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 6: [2022-11-25 16:09:33,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 52: [2022-11-25 16:09:33,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt. 52: [2022-11-25 16:09:33,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt 52: [2022-11-25 16:09:33,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 34: [2022-11-25 16:09:33,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt. 34: [2022-11-25 16:09:33,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt 34: [2022-11-25 16:09:33,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 12: [2022-11-25 16:09:33,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 12: [2022-11-25 16:09:33,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 12: [2022-11-25 16:09:33,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 32: [2022-11-25 16:09:33,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt. 32: [2022-11-25 16:09:33,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt 11: [2022-11-25 16:09:33,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 32: [2022-11-25 16:09:33,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 11: [2022-11-25 16:09:33,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 11: [2022-11-25 16:09:33,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 48: [2022-11-25 16:09:33,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt. 48: [2022-11-25 16:09:33,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt 48: [2022-11-25 16:09:33,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 45: [2022-11-25 16:09:33,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt. 45: [2022-11-25 16:09:33,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt 45: [2022-11-25 16:09:33,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 17: [2022-11-25 16:09:33,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 17: [2022-11-25 16:09:33,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 17: [2022-11-25 16:09:33,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 15: [2022-11-25 16:09:33,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 15: [2022-11-25 16:09:33,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 15: [2022-11-25 16:09:33,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 60: [2022-11-25 16:09:33,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt. 60: [2022-11-25 16:09:33,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt 60: [2022-11-25 16:09:33,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 50: [2022-11-25 16:09:33,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt. 50: [2022-11-25 16:09:33,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt 50: [2022-11-25 16:09:33,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 37: [2022-11-25 16:09:33,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt. 37: [2022-11-25 16:09:33,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt 37: [2022-11-25 16:09:33,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 28: [2022-11-25 16:09:33,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 28: [2022-11-25 16:09:33,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 28: [2022-11-25 16:09:33,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 16: [2022-11-25 16:09:33,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 16: [2022-11-25 16:09:33,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 16: [2022-11-25 16:09:33,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 31: [2022-11-25 16:09:33,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 31: [2022-11-25 16:09:33,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 31: [2022-11-25 16:09:33,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 14: [2022-11-25 16:09:33,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 14: [2022-11-25 16:09:33,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 14: [2022-11-25 16:09:33,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 29: [2022-11-25 16:09:33,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 29: [2022-11-25 16:09:33,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 29: [2022-11-25 16:09:33,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 3: [2022-11-25 16:09:33,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 3: [2022-11-25 16:09:33,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 3: [2022-11-25 16:09:33,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 8: [2022-11-25 16:09:33,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 8: [2022-11-25 16:09:33,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 8: [2022-11-25 16:09:33,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 55: [2022-11-25 16:09:33,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt. 55: [2022-11-25 16:09:33,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt 47: [2022-11-25 16:09:33,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt. 55: [2022-11-25 16:09:33,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 16:09:33,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt 47: [2022-11-25 16:09:33,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 42: [2022-11-25 16:09:33,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt. 42: [2022-11-25 16:09:33,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt 61: [2022-11-25 16:09:33,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt. 42: [2022-11-25 16:09:33,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 61: [2022-11-25 16:09:33,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt 61: [2022-11-25 16:09:33,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 16:09:33,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 25: [2022-11-25 16:09:33,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 1: [2022-11-25 16:09:33,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 25: [2022-11-25 16:09:33,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 25: [2022-11-25 16:09:33,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 16:09:33,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 58: [2022-11-25 16:09:33,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt. 58: [2022-11-25 16:09:33,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt 58: [2022-11-25 16:09:33,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 10: [2022-11-25 16:09:33,140] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 10: [2022-11-25 16:09:33,140] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 10: [2022-11-25 16:09:33,140] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 16:09:33,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 33: [2022-11-25 16:09:33,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt. 4: [2022-11-25 16:09:33,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 9: [2022-11-25 16:09:33,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 33: [2022-11-25 16:09:33,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt 4: [2022-11-25 16:09:33,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 33: [2022-11-25 16:09:33,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 16:09:33,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 13: [2022-11-25 16:09:33,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 9: [2022-11-25 16:09:33,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 27: [2022-11-25 16:09:33,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 13: [2022-11-25 16:09:33,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 27: [2022-11-25 16:09:33,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 27: [2022-11-25 16:09:33,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 13: [2022-11-25 16:09:33,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 24: [2022-11-25 16:09:33,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 24: [2022-11-25 16:09:33,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 24: [2022-11-25 16:09:33,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 5: [2022-11-25 16:09:33,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 5: [2022-11-25 16:09:33,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 5: [2022-11-25 16:09:33,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 23: [2022-11-25 16:09:33,146] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 23: [2022-11-25 16:09:33,146] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 23: [2022-11-25 16:09:33,146] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 40: [2022-11-25 16:09:33,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt. 40: [2022-11-25 16:09:33,147] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt 40: [2022-11-25 16:09:33,147] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 22: [2022-11-25 16:09:33,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 22: [2022-11-25 16:09:33,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 2: [2022-11-25 16:09:33,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 22: [2022-11-25 16:09:33,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 2: [2022-11-25 16:09:33,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 2: [2022-11-25 16:09:33,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 46: [2022-11-25 16:09:33,152] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt. 46: [2022-11-25 16:09:33,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt 46: [2022-11-25 16:09:33,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 7: [2022-11-25 16:09:33,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 7: [2022-11-25 16:09:33,153] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 7: [2022-11-25 16:09:33,153] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 39: [2022-11-25 16:09:33,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt. 36: [2022-11-25 16:09:33,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt. 39: [2022-11-25 16:09:33,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt 36: [2022-11-25 16:09:33,154] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt 39: [2022-11-25 16:09:33,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 36: [2022-11-25 16:09:33,154] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 26: [2022-11-25 16:09:33,154] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 26: [2022-11-25 16:09:33,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 26: [2022-11-25 16:09:33,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 49: [2022-11-25 16:09:33,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt. 49: [2022-11-25 16:09:33,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt 49: [2022-11-25 16:09:33,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 57: [2022-11-25 16:09:33,155] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt. 57: [2022-11-25 16:09:33,155] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt 57: [2022-11-25 16:09:33,155] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 53: [2022-11-25 16:09:33,158] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt. 53: [2022-11-25 16:09:33,158] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt 53: [2022-11-25 16:09:33,158] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 62: [2022-11-25 16:09:33,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt. 62: [2022-11-25 16:09:33,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt 62: [2022-11-25 16:09:33,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 18: [2022-11-25 16:09:33,164] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 18: [2022-11-25 16:09:33,164] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 18: [2022-11-25 16:09:33,164] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: [2022-11-25 16:09:33,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 0: [2022-11-25 16:09:33,167] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 0: [2022-11-25 16:09:33,167] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 30: [2022-11-25 16:09:33,170] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 30: [2022-11-25 16:09:33,170] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 30: [2022-11-25 16:09:33,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 43: [2022-11-25 16:09:33,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt. 43: [2022-11-25 16:09:33,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt 43: [2022-11-25 16:09:33,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 20: [2022-11-25 16:09:33,171] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 20: [2022-11-25 16:09:33,171] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 20: [2022-11-25 16:09:33,171] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 21: [2022-11-25 16:09:33,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 38: [2022-11-25 16:09:33,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt. 38: [2022-11-25 16:09:33,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt 38: [2022-11-25 16:09:33,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 52: [2022-11-25 16:09:33,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt. 52: [2022-11-25 16:09:33,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt 52: [2022-11-25 16:09:33,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 44: [2022-11-25 16:09:33,173] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt. 44: [2022-11-25 16:09:33,173] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt 44: [2022-11-25 16:09:33,173] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 63: [2022-11-25 16:09:33,176] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt. 63: [2022-11-25 16:09:33,176] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt 63: [2022-11-25 16:09:33,176] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 56: [2022-11-25 16:09:33,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt. 56: [2022-11-25 16:09:33,177] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt 56: [2022-11-25 16:09:33,177] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 59: [2022-11-25 16:09:33,177] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt. 59: [2022-11-25 16:09:33,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt 59: [2022-11-25 16:09:33,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 35: [2022-11-25 16:09:33,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt. 35: [2022-11-25 16:09:33,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt 35: [2022-11-25 16:09:33,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 16:09:33,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt. 54: [2022-11-25 16:09:33,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt 54: [2022-11-25 16:09:33,182] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 6: [2022-11-25 16:09:33,183] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 6: [2022-11-25 16:09:33,183] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 6: [2022-11-25 16:09:33,183] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 34: [2022-11-25 16:09:33,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt. 34: [2022-11-25 16:09:33,184] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt 34: [2022-11-25 16:09:33,184] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 35: [2022-11-25 16:09:33,184] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt. 35: [2022-11-25 16:09:33,185] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt 35: [2022-11-25 16:09:33,185] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 17: [2022-11-25 16:09:33,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 17: [2022-11-25 16:09:33,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 17: [2022-11-25 16:09:33,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 15: [2022-11-25 16:09:33,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 15: [2022-11-25 16:09:33,186] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 15: [2022-11-25 16:09:33,186] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 14: [2022-11-25 16:09:33,186] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 14: [2022-11-25 16:09:33,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 14: [2022-11-25 16:09:33,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 8: [2022-11-25 16:09:33,187] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 8: [2022-11-25 16:09:33,187] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 8: [2022-11-25 16:09:33,187] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 33: [2022-11-25 16:09:33,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt. 33: [2022-11-25 16:09:33,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt 33: [2022-11-25 16:09:33,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 28: [2022-11-25 16:09:33,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 28: [2022-11-25 16:09:33,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 28: [2022-11-25 16:09:33,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 11: [2022-11-25 16:09:33,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 11: [2022-11-25 16:09:33,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 11: [2022-11-25 16:09:33,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 31: [2022-11-25 16:09:33,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 31: [2022-11-25 16:09:33,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 31: [2022-11-25 16:09:33,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 10: [2022-11-25 16:09:33,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 10: [2022-11-25 16:09:33,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 45: [2022-11-25 16:09:33,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt. 10: [2022-11-25 16:09:33,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 45: [2022-11-25 16:09:33,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt 22: [2022-11-25 16:09:33,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 45: [2022-11-25 16:09:33,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 22: [2022-11-25 16:09:33,190] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 22: [2022-11-25 16:09:33,190] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 23: [2022-11-25 16:09:33,190] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 23: [2022-11-25 16:09:33,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 23: [2022-11-25 16:09:33,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 51: [2022-11-25 16:09:33,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt. 16: [2022-11-25 16:09:33,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 51: [2022-11-25 16:09:33,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt 16: [2022-11-25 16:09:33,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 16: [2022-11-25 16:09:33,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 51: [2022-11-25 16:09:33,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 4: [2022-11-25 16:09:33,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 32: [2022-11-25 16:09:33,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt. 4: [2022-11-25 16:09:33,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 32: [2022-11-25 16:09:33,191] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt 4: [2022-11-25 16:09:33,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 32: [2022-11-25 16:09:33,191] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 13: [2022-11-25 16:09:33,191] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 1: [2022-11-25 16:09:33,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 13: [2022-11-25 16:09:33,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 13: [2022-11-25 16:09:33,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 1: [2022-11-25 16:09:33,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 1: [2022-11-25 16:09:33,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 27: [2022-11-25 16:09:33,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 27: [2022-11-25 16:09:33,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 27: [2022-11-25 16:09:33,192] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 60: [2022-11-25 16:09:33,192] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt. 60: [2022-11-25 16:09:33,192] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt 60: [2022-11-25 16:09:33,193] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 3: [2022-11-25 16:09:33,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 3: [2022-11-25 16:09:33,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 41: [2022-11-25 16:09:33,194] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt. 3: [2022-11-25 16:09:33,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 16:09:33,194] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt 41: [2022-11-25 16:09:33,194] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 21: [2022-11-25 16:09:33,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 21: [2022-11-25 16:09:33,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 50: [2022-11-25 16:09:33,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt. 50: [2022-11-25 16:09:33,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt 42: [2022-11-25 16:09:33,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt. 50: [2022-11-25 16:09:33,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 42: [2022-11-25 16:09:33,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt 42: [2022-11-25 16:09:33,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 46: [2022-11-25 16:09:33,195] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt. 46: [2022-11-25 16:09:33,195] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt 46: [2022-11-25 16:09:33,195] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 2: [2022-11-25 16:09:33,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 2: [2022-11-25 16:09:33,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 25: [2022-11-25 16:09:33,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 2: [2022-11-25 16:09:33,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 25: [2022-11-25 16:09:33,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 25: [2022-11-25 16:09:33,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 58: [2022-11-25 16:09:33,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt. 58: [2022-11-25 16:09:33,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt 37: [2022-11-25 16:09:33,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt. 58: [2022-11-25 16:09:33,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 37: [2022-11-25 16:09:33,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt 54: [2022-11-25 16:09:33,196] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt. 37: [2022-11-25 16:09:33,196] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 16:09:33,196] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt 54: [2022-11-25 16:09:33,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 49: [2022-11-25 16:09:33,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt. 49: [2022-11-25 16:09:33,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt 49: [2022-11-25 16:09:33,197] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 36: [2022-11-25 16:09:33,197] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt. 36: [2022-11-25 16:09:33,197] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt 36: [2022-11-25 16:09:33,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 12: [2022-11-25 16:09:33,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 12: [2022-11-25 16:09:33,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 12: [2022-11-25 16:09:33,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 24: [2022-11-25 16:09:33,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 24: [2022-11-25 16:09:33,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 24: [2022-11-25 16:09:33,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 5: [2022-11-25 16:09:33,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 62: [2022-11-25 16:09:33,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt. 62: [2022-11-25 16:09:33,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt 62: [2022-11-25 16:09:33,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 44: [2022-11-25 16:09:33,199] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt. 44: [2022-11-25 16:09:33,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt 44: [2022-11-25 16:09:33,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 5: [2022-11-25 16:09:33,199] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 5: [2022-11-25 16:09:33,199] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 16:09:33,201] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt. 61: [2022-11-25 16:09:33,203] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt. 61: [2022-11-25 16:09:33,203] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt 61: [2022-11-25 16:09:33,203] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 7: [2022-11-25 16:09:33,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 48: [2022-11-25 16:09:33,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt. 48: [2022-11-25 16:09:33,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt 48: [2022-11-25 16:09:33,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 7: [2022-11-25 16:09:33,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 7: [2022-11-25 16:09:33,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 57: [2022-11-25 16:09:33,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt. 57: [2022-11-25 16:09:33,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt 57: [2022-11-25 16:09:33,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 16:09:33,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 56: [2022-11-25 16:09:33,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt. 9: [2022-11-25 16:09:33,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 56: [2022-11-25 16:09:33,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt 9: [2022-11-25 16:09:33,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 56: [2022-11-25 16:09:33,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 55: [2022-11-25 16:09:33,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt. 39: [2022-11-25 16:09:33,208] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt. 39: [2022-11-25 16:09:33,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt 55: [2022-11-25 16:09:33,208] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt 39: [2022-11-25 16:09:33,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 55: [2022-11-25 16:09:33,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 43: [2022-11-25 16:09:33,210] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt. 43: [2022-11-25 16:09:33,210] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt 43: [2022-11-25 16:09:33,210] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 20: [2022-11-25 16:09:33,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 20: [2022-11-25 16:09:33,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 20: [2022-11-25 16:09:33,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 51: [2022-11-25 16:09:33,211] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt. 51: [2022-11-25 16:09:33,211] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt 51: [2022-11-25 16:09:33,211] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 16:09:33,201] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt 47: [2022-11-25 16:09:33,201] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 38: [2022-11-25 16:09:33,218] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt. 38: [2022-11-25 16:09:33,218] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt 38: [2022-11-25 16:09:33,218] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 21: [2022-11-25 16:09:33,219] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 0: [2022-11-25 16:09:33,222] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 0: [2022-11-25 16:09:33,223] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 0: [2022-11-25 16:09:33,223] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 40: [2022-11-25 16:09:33,224] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt. 40: [2022-11-25 16:09:33,224] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt 40: [2022-11-25 16:09:33,224] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 11: [2022-11-25 16:09:33,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 11: [2022-11-25 16:09:33,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 11: [2022-11-25 16:09:33,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 63: [2022-11-25 16:09:33,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt. 63: [2022-11-25 16:09:33,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt 63: [2022-11-25 16:09:33,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 6: [2022-11-25 16:09:33,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 6: [2022-11-25 16:09:33,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 6: [2022-11-25 16:09:33,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 8: [2022-11-25 16:09:33,226] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 8: [2022-11-25 16:09:33,226] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 8: [2022-11-25 16:09:33,226] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 29: [2022-11-25 16:09:33,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 29: [2022-11-25 16:09:33,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 29: [2022-11-25 16:09:33,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 59: [2022-11-25 16:09:33,227] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt. 59: [2022-11-25 16:09:33,227] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt 59: [2022-11-25 16:09:33,227] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 18: [2022-11-25 16:09:33,232] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 18: [2022-11-25 16:09:33,232] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 18: [2022-11-25 16:09:33,232] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 12: [2022-11-25 16:09:33,233] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 12: [2022-11-25 16:09:33,233] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 12: [2022-11-25 16:09:33,233] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 45: [2022-11-25 16:09:33,240] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt. 45: [2022-11-25 16:09:33,240] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt 45: [2022-11-25 16:09:33,240] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 32: [2022-11-25 16:09:33,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt. 32: [2022-11-25 16:09:33,242] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt 32: [2022-11-25 16:09:33,242] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 29: [2022-11-25 16:09:33,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 29: [2022-11-25 16:09:33,243] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 29: [2022-11-25 16:09:33,243] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 22: [2022-11-25 16:09:33,244] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 22: [2022-11-25 16:09:33,244] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 22: [2022-11-25 16:09:33,244] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 16:09:33,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt. 5: [2022-11-25 16:09:33,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 54: [2022-11-25 16:09:33,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt 5: [2022-11-25 16:09:33,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 54: [2022-11-25 16:09:33,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 5: [2022-11-25 16:09:33,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 38: [2022-11-25 16:09:33,245] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt. 38: [2022-11-25 16:09:33,245] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt 38: [2022-11-25 16:09:33,245] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 61: [2022-11-25 16:09:33,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt. 61: [2022-11-25 16:09:33,247] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt 61: [2022-11-25 16:09:33,247] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 6: [2022-11-25 16:09:33,248] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 6: [2022-11-25 16:09:33,248] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 6: [2022-11-25 16:09:33,248] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 35: [2022-11-25 16:09:33,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt. 35: [2022-11-25 16:09:33,249] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt 35: [2022-11-25 16:09:33,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 17: [2022-11-25 16:09:33,249] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 17: [2022-11-25 16:09:33,249] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 17: [2022-11-25 16:09:33,249] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 7: [2022-11-25 16:09:33,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 7: [2022-11-25 16:09:33,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 8: [2022-11-25 16:09:33,251] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 7: [2022-11-25 16:09:33,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 8: [2022-11-25 16:09:33,251] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 8: [2022-11-25 16:09:33,251] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 16:09:33,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt. 37: [2022-11-25 16:09:33,252] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt. 41: [2022-11-25 16:09:33,252] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt 37: [2022-11-25 16:09:33,252] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt 41: [2022-11-25 16:09:33,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 37: [2022-11-25 16:09:33,252] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 16:09:33,253] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt. 41: [2022-11-25 16:09:33,253] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt 41: [2022-11-25 16:09:33,253] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 34: [2022-11-25 16:09:33,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt. 34: [2022-11-25 16:09:33,254] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt 34: [2022-11-25 16:09:33,254] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 16:09:33,254] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 9: [2022-11-25 16:09:33,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 48: [2022-11-25 16:09:33,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt. 21: [2022-11-25 16:09:33,219] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 21: [2022-11-25 16:09:33,219] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 16:09:33,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 48: [2022-11-25 16:09:33,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt 48: [2022-11-25 16:09:33,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 21: [2022-11-25 16:09:33,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 33: [2022-11-25 16:09:33,255] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt. 21: [2022-11-25 16:09:33,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 33: [2022-11-25 16:09:33,255] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt 21: [2022-11-25 16:09:33,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 33: [2022-11-25 16:09:33,255] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 16:09:33,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt. 19: [2022-11-25 16:09:33,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 19: [2022-11-25 16:09:33,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 47: [2022-11-25 16:09:33,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt 19: [2022-11-25 16:09:33,258] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 16:09:33,258] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 44: [2022-11-25 16:09:33,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt. 44: [2022-11-25 16:09:33,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt 44: [2022-11-25 16:09:33,258] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 58: [2022-11-25 16:09:33,259] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt. 58: [2022-11-25 16:09:33,259] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt 58: [2022-11-25 16:09:33,259] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 59: [2022-11-25 16:09:33,259] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt. 59: [2022-11-25 16:09:33,259] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt 59: [2022-11-25 16:09:33,259] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 19: [2022-11-25 16:09:33,261] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 19: [2022-11-25 16:09:33,261] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 19: [2022-11-25 16:09:33,261] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 19: [2022-11-25 16:09:33,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 19: [2022-11-25 16:09:33,262] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 19: [2022-11-25 16:09:33,262] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 63: [2022-11-25 16:09:33,262] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt. 63: [2022-11-25 16:09:33,262] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt 63: [2022-11-25 16:09:33,262] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 18: [2022-11-25 16:09:33,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 18: [2022-11-25 16:09:33,269] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 18: [2022-11-25 16:09:33,269] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 48: [2022-11-25 16:09:33,269] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt. 48: [2022-11-25 16:09:33,269] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt 48: [2022-11-25 16:09:33,269] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 36: [2022-11-25 16:09:33,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt. 36: [2022-11-25 16:09:33,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt 36: [2022-11-25 16:09:33,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 52: [2022-11-25 16:09:33,278] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt. 52: [2022-11-25 16:09:33,278] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt 52: [2022-11-25 16:09:33,278] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 11: [2022-11-25 16:09:33,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 11: [2022-11-25 16:09:33,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 11: [2022-11-25 16:09:33,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 35: [2022-11-25 16:09:33,283] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt. 35: [2022-11-25 16:09:33,283] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt 35: [2022-11-25 16:09:33,283] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 16:09:33,286] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt. 41: [2022-11-25 16:09:33,286] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt 41: [2022-11-25 16:09:33,286] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 51: [2022-11-25 16:09:33,287] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt. 51: [2022-11-25 16:09:33,287] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt 51: [2022-11-25 16:09:33,287] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 34: [2022-11-25 16:09:33,289] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt. 34: [2022-11-25 16:09:33,289] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt 34: [2022-11-25 16:09:33,289] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 12: [2022-11-25 16:09:33,293] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 12: [2022-11-25 16:09:33,293] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 12: [2022-11-25 16:09:33,293] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 16:09:33,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt. 45: [2022-11-25 16:09:33,295] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt. 45: [2022-11-25 16:09:33,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt 45: [2022-11-25 16:09:33,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 58: [2022-11-25 16:09:33,296] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt. 58: [2022-11-25 16:09:33,296] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt 58: [2022-11-25 16:09:33,296] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 22: [2022-11-25 16:09:33,297] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 22: [2022-11-25 16:09:33,297] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 22: [2022-11-25 16:09:33,297] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 51: [2022-11-25 16:09:33,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt. 51: [2022-11-25 16:09:33,299] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt 51: [2022-11-25 16:09:33,299] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 52: [2022-11-25 16:09:33,299] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt. 52: [2022-11-25 16:09:33,300] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt 52: [2022-11-25 16:09:33,300] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 43: [2022-11-25 16:09:33,300] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt. 43: [2022-11-25 16:09:33,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt 43: [2022-11-25 16:09:33,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 33: [2022-11-25 16:09:33,301] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt. 33: [2022-11-25 16:09:33,301] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt 33: [2022-11-25 16:09:33,301] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 29: [2022-11-25 16:09:33,303] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 29: [2022-11-25 16:09:33,303] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 29: [2022-11-25 16:09:33,303] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 36: [2022-11-25 16:09:33,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt. 36: [2022-11-25 16:09:33,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt 36: [2022-11-25 16:09:33,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 17: [2022-11-25 16:09:33,305] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 17: [2022-11-25 16:09:33,305] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 17: [2022-11-25 16:09:33,305] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 16:09:33,295] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt 47: [2022-11-25 16:09:33,295] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 5: [2022-11-25 16:09:33,308] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 5: [2022-11-25 16:09:33,308] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 5: [2022-11-25 16:09:33,308] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 19: [2022-11-25 16:09:33,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 19: [2022-11-25 16:09:33,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 41: [2022-11-25 16:09:33,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt. 19: [2022-11-25 16:09:33,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 41: [2022-11-25 16:09:33,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt 54: [2022-11-25 16:09:33,309] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt. 41: [2022-11-25 16:09:33,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 16:09:33,309] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt 54: [2022-11-25 16:09:33,309] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 21: [2022-11-25 16:09:33,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 61: [2022-11-25 16:09:33,310] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt. 61: [2022-11-25 16:09:33,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt 61: [2022-11-25 16:09:33,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 37: [2022-11-25 16:09:33,311] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt. 37: [2022-11-25 16:09:33,311] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt 37: [2022-11-25 16:09:33,311] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 45: [2022-11-25 16:09:33,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt. 44: [2022-11-25 16:09:33,312] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt. 45: [2022-11-25 16:09:33,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt 44: [2022-11-25 16:09:33,312] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt 45: [2022-11-25 16:09:33,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 44: [2022-11-25 16:09:33,312] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 37: [2022-11-25 16:09:33,313] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt. 37: [2022-11-25 16:09:33,313] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt 37: [2022-11-25 16:09:33,313] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 21: [2022-11-25 16:09:33,310] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 21: [2022-11-25 16:09:33,310] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 21: [2022-11-25 16:09:33,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 21: [2022-11-25 16:09:33,314] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 21: [2022-11-25 16:09:33,314] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 54: [2022-11-25 16:09:33,314] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt. 54: [2022-11-25 16:09:33,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt 54: [2022-11-25 16:09:33,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 5: [2022-11-25 16:09:33,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 5: [2022-11-25 16:09:33,315] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 5: [2022-11-25 16:09:33,315] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 7: [2022-11-25 16:09:33,317] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 7: [2022-11-25 16:09:33,317] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 7: [2022-11-25 16:09:33,317] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 58: [2022-11-25 16:09:33,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt. 58: [2022-11-25 16:09:33,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt 58: [2022-11-25 16:09:33,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 16:09:33,318] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 9: [2022-11-25 16:09:33,318] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 9: [2022-11-25 16:09:33,318] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 34: [2022-11-25 16:09:33,319] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt. 34: [2022-11-25 16:09:33,319] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt 34: [2022-11-25 16:09:33,319] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 43: [2022-11-25 16:09:33,320] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt. 43: [2022-11-25 16:09:33,320] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt 43: [2022-11-25 16:09:33,320] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 16:09:33,321] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt. 51: [2022-11-25 16:09:33,322] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt. 51: [2022-11-25 16:09:33,322] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt 51: [2022-11-25 16:09:33,322] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 35: [2022-11-25 16:09:33,325] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt. 35: [2022-11-25 16:09:33,325] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt 35: [2022-11-25 16:09:33,325] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 7: [2022-11-25 16:09:33,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 36: [2022-11-25 16:09:33,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt. 7: [2022-11-25 16:09:33,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 36: [2022-11-25 16:09:33,326] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt 36: [2022-11-25 16:09:33,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 7: [2022-11-25 16:09:33,326] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 33: [2022-11-25 16:09:33,326] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt. 33: [2022-11-25 16:09:33,327] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt 33: [2022-11-25 16:09:33,327] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 47: [2022-11-25 16:09:33,321] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt 47: [2022-11-25 16:09:33,321] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 9: [2022-11-25 16:09:33,337] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 9: [2022-11-25 16:09:33,337] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 9: [2022-11-25 16:09:33,337] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 44: [2022-11-25 16:09:33,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt. 44: [2022-11-25 16:09:33,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step4000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt 44: [2022-11-25 16:09:33,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! 0: successfully saved checkpoint at iteration 4000 to checkpoints_8b7 63: time (ms) | save-checkpoint: 6997.07 63: iteration 4010/ 5494 | consumed samples: 4106240 | consumed tokens: 8409579520 | elapsed time per iteration (s): 6.37 | learning rate: 5.109E-05 | global batch size: 1024 | lm loss: 2.244633E+00 | grad norm: 0.149 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 160.705 | TFLOPs: 35.93 | 63: iteration 4020/ 5494 | consumed samples: 4116480 | consumed tokens: 8430551040 | elapsed time per iteration (s): 5.67 | learning rate: 5.070E-05 | global batch size: 1024 | lm loss: 2.240937E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.658 | TFLOPs: 40.39 | 63: iteration 4030/ 5494 | consumed samples: 4126720 | consumed tokens: 8451522560 | elapsed time per iteration (s): 5.66 | learning rate: 5.031E-05 | global batch size: 1024 | lm loss: 2.244235E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.046 | TFLOPs: 40.48 | 63: iteration 4040/ 5494 | consumed samples: 4136960 | consumed tokens: 8472494080 | elapsed time per iteration (s): 5.40 | learning rate: 4.992E-05 | global batch size: 1024 | lm loss: 2.231983E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.615 | TFLOPs: 42.39 | 63: iteration 4050/ 5494 | consumed samples: 4147200 | consumed tokens: 8493465600 | elapsed time per iteration (s): 5.74 | learning rate: 4.953E-05 | global batch size: 1024 | lm loss: 2.232835E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.531 | TFLOPs: 39.91 | 63: iteration 4060/ 5494 | consumed samples: 4157440 | consumed tokens: 8514437120 | elapsed time per iteration (s): 5.54 | learning rate: 4.915E-05 | global batch size: 1024 | lm loss: 2.226386E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.884 | TFLOPs: 41.33 | 63: iteration 4070/ 5494 | consumed samples: 4167680 | consumed tokens: 8535408640 | elapsed time per iteration (s): 5.49 | learning rate: 4.877E-05 | global batch size: 1024 | lm loss: 2.229890E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.681 | TFLOPs: 41.74 | 63: iteration 4080/ 5494 | consumed samples: 4177920 | consumed tokens: 8556380160 | elapsed time per iteration (s): 5.74 | learning rate: 4.839E-05 | global batch size: 1024 | lm loss: 2.243035E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.447 | TFLOPs: 39.89 | 63: iteration 4090/ 5494 | consumed samples: 4188160 | consumed tokens: 8577351680 | elapsed time per iteration (s): 5.81 | learning rate: 4.801E-05 | global batch size: 1024 | lm loss: 2.235854E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.181 | TFLOPs: 39.39 | 63: iteration 4100/ 5494 | consumed samples: 4198400 | consumed tokens: 8598323200 | elapsed time per iteration (s): 5.40 | learning rate: 4.763E-05 | global batch size: 1024 | lm loss: 2.249084E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.663 | TFLOPs: 42.40 | 63: iteration 4110/ 5494 | consumed samples: 4208640 | consumed tokens: 8619294720 | elapsed time per iteration (s): 5.52 | learning rate: 4.726E-05 | global batch size: 1024 | lm loss: 2.231608E+00 | grad norm: 0.153 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.351 | TFLOPs: 41.44 | 63: iteration 4120/ 5494 | consumed samples: 4218880 | consumed tokens: 8640266240 | elapsed time per iteration (s): 5.64 | learning rate: 4.689E-05 | global batch size: 1024 | lm loss: 2.235134E+00 | grad norm: 0.161 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.504 | TFLOPs: 40.58 | 63: iteration 4130/ 5494 | consumed samples: 4229120 | consumed tokens: 8661237760 | elapsed time per iteration (s): 5.74 | learning rate: 4.652E-05 | global batch size: 1024 | lm loss: 2.234351E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.378 | TFLOPs: 39.88 | 63: iteration 4140/ 5494 | consumed samples: 4239360 | consumed tokens: 8682209280 | elapsed time per iteration (s): 5.76 | learning rate: 4.615E-05 | global batch size: 1024 | lm loss: 2.230442E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.857 | TFLOPs: 39.76 | 63: iteration 4150/ 5494 | consumed samples: 4249600 | consumed tokens: 8703180800 | elapsed time per iteration (s): 5.53 | learning rate: 4.579E-05 | global batch size: 1024 | lm loss: 2.227919E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.254 | TFLOPs: 41.42 | 63: iteration 4160/ 5494 | consumed samples: 4259840 | consumed tokens: 8724152320 | elapsed time per iteration (s): 5.68 | learning rate: 4.542E-05 | global batch size: 1024 | lm loss: 2.239543E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.339 | TFLOPs: 40.32 | 63: iteration 4170/ 5494 | consumed samples: 4270080 | consumed tokens: 8745123840 | elapsed time per iteration (s): 5.63 | learning rate: 4.506E-05 | global batch size: 1024 | lm loss: 2.221701E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.748 | TFLOPs: 40.63 | 63: iteration 4180/ 5494 | consumed samples: 4280320 | consumed tokens: 8766095360 | elapsed time per iteration (s): 5.90 | learning rate: 4.470E-05 | global batch size: 1024 | lm loss: 2.215150E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.607 | TFLOPs: 38.81 | 63: iteration 4190/ 5494 | consumed samples: 4290560 | consumed tokens: 8787066880 | elapsed time per iteration (s): 5.52 | learning rate: 4.435E-05 | global batch size: 1024 | lm loss: 2.241295E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.471 | TFLOPs: 41.47 | 63: iteration 4200/ 5494 | consumed samples: 4300800 | consumed tokens: 8808038400 | elapsed time per iteration (s): 5.54 | learning rate: 4.399E-05 | global batch size: 1024 | lm loss: 2.218307E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.886 | TFLOPs: 41.33 | 63: iteration 4210/ 5494 | consumed samples: 4311040 | consumed tokens: 8829009920 | elapsed time per iteration (s): 5.43 | learning rate: 4.364E-05 | global batch size: 1024 | lm loss: 2.209278E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 188.736 | TFLOPs: 42.20 | 63: iteration 4220/ 5494 | consumed samples: 4321280 | consumed tokens: 8849981440 | elapsed time per iteration (s): 5.62 | learning rate: 4.329E-05 | global batch size: 1024 | lm loss: 2.226402E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.123 | TFLOPs: 40.72 | 63: iteration 4230/ 5494 | consumed samples: 4331520 | consumed tokens: 8870952960 | elapsed time per iteration (s): 5.85 | learning rate: 4.294E-05 | global batch size: 1024 | lm loss: 2.214868E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.935 | TFLOPs: 39.11 | 63: iteration 4240/ 5494 | consumed samples: 4341760 | consumed tokens: 8891924480 | elapsed time per iteration (s): 6.03 | learning rate: 4.260E-05 | global batch size: 1024 | lm loss: 2.213839E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 169.792 | TFLOPs: 37.96 | 63: iteration 4250/ 5494 | consumed samples: 4352000 | consumed tokens: 8912896000 | elapsed time per iteration (s): 9.25 | learning rate: 4.225E-05 | global batch size: 1024 | lm loss: 2.236757E+00 | grad norm: 0.155 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 110.744 | TFLOPs: 24.76 | 63: iteration 4260/ 5494 | consumed samples: 4362240 | consumed tokens: 8933867520 | elapsed time per iteration (s): 6.40 | learning rate: 4.191E-05 | global batch size: 1024 | lm loss: 2.238665E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 159.933 | TFLOPs: 35.76 | 63: iteration 4270/ 5494 | consumed samples: 4372480 | consumed tokens: 8954839040 | elapsed time per iteration (s): 5.78 | learning rate: 4.157E-05 | global batch size: 1024 | lm loss: 2.219238E+00 | grad norm: 0.148 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.294 | TFLOPs: 39.64 | 63: iteration 4280/ 5494 | consumed samples: 4382720 | consumed tokens: 8975810560 | elapsed time per iteration (s): 5.50 | learning rate: 4.124E-05 | global batch size: 1024 | lm loss: 2.229868E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.059 | TFLOPs: 41.60 | 63: iteration 4290/ 5494 | consumed samples: 4392960 | consumed tokens: 8996782080 | elapsed time per iteration (s): 5.74 | learning rate: 4.090E-05 | global batch size: 1024 | lm loss: 2.215840E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.289 | TFLOPs: 39.86 | 63: iteration 4300/ 5494 | consumed samples: 4403200 | consumed tokens: 9017753600 | elapsed time per iteration (s): 5.81 | learning rate: 4.057E-05 | global batch size: 1024 | lm loss: 2.219283E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.262 | TFLOPs: 39.41 | 63: iteration 4310/ 5494 | consumed samples: 4413440 | consumed tokens: 9038725120 | elapsed time per iteration (s): 5.56 | learning rate: 4.024E-05 | global batch size: 1024 | lm loss: 2.222147E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.040 | TFLOPs: 41.15 | 63: iteration 4320/ 5494 | consumed samples: 4423680 | consumed tokens: 9059696640 | elapsed time per iteration (s): 5.50 | learning rate: 3.991E-05 | global batch size: 1024 | lm loss: 2.224061E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.184 | TFLOPs: 41.62 | 63: iteration 4330/ 5494 | consumed samples: 4433920 | consumed tokens: 9080668160 | elapsed time per iteration (s): 5.78 | learning rate: 3.959E-05 | global batch size: 1024 | lm loss: 2.218119E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.183 | TFLOPs: 39.61 | 63: iteration 4340/ 5494 | consumed samples: 4444160 | consumed tokens: 9101639680 | elapsed time per iteration (s): 5.68 | learning rate: 3.927E-05 | global batch size: 1024 | lm loss: 2.208767E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.244 | TFLOPs: 40.30 | 63: iteration 4350/ 5494 | consumed samples: 4454400 | consumed tokens: 9122611200 | elapsed time per iteration (s): 5.66 | learning rate: 3.895E-05 | global batch size: 1024 | lm loss: 2.225300E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.961 | TFLOPs: 40.46 | 63: iteration 4360/ 5494 | consumed samples: 4464640 | consumed tokens: 9143582720 | elapsed time per iteration (s): 5.90 | learning rate: 3.863E-05 | global batch size: 1024 | lm loss: 2.217171E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.584 | TFLOPs: 38.81 | 63: iteration 4370/ 5494 | consumed samples: 4474880 | consumed tokens: 9164554240 | elapsed time per iteration (s): 7.12 | learning rate: 3.831E-05 | global batch size: 1024 | lm loss: 2.218223E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 143.775 | TFLOPs: 32.14 | 63: iteration 4380/ 5494 | consumed samples: 4485120 | consumed tokens: 9185525760 | elapsed time per iteration (s): 10.14 | learning rate: 3.800E-05 | global batch size: 1024 | lm loss: 2.214470E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 100.937 | TFLOPs: 22.57 | 63: iteration 4390/ 5494 | consumed samples: 4495360 | consumed tokens: 9206497280 | elapsed time per iteration (s): 5.92 | learning rate: 3.769E-05 | global batch size: 1024 | lm loss: 2.232302E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.111 | TFLOPs: 38.70 | 63: iteration 4400/ 5494 | consumed samples: 4505600 | consumed tokens: 9227468800 | elapsed time per iteration (s): 5.56 | learning rate: 3.738E-05 | global batch size: 1024 | lm loss: 2.217585E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.158 | TFLOPs: 41.17 | 63: iteration 4410/ 5494 | consumed samples: 4515840 | consumed tokens: 9248440320 | elapsed time per iteration (s): 5.54 | learning rate: 3.708E-05 | global batch size: 1024 | lm loss: 2.214284E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.957 | TFLOPs: 41.35 | 63: iteration 4420/ 5494 | consumed samples: 4526080 | consumed tokens: 9269411840 | elapsed time per iteration (s): 5.65 | learning rate: 3.677E-05 | global batch size: 1024 | lm loss: 2.211942E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.359 | TFLOPs: 40.55 | 63: iteration 4430/ 5494 | consumed samples: 4536320 | consumed tokens: 9290383360 | elapsed time per iteration (s): 5.74 | learning rate: 3.647E-05 | global batch size: 1024 | lm loss: 2.224935E+00 | grad norm: 0.145 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.515 | TFLOPs: 39.91 | 63: iteration 4440/ 5494 | consumed samples: 4546560 | consumed tokens: 9311354880 | elapsed time per iteration (s): 5.82 | learning rate: 3.617E-05 | global batch size: 1024 | lm loss: 2.217514E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 176.068 | TFLOPs: 39.36 | 63: iteration 4450/ 5494 | consumed samples: 4556800 | consumed tokens: 9332326400 | elapsed time per iteration (s): 5.75 | learning rate: 3.588E-05 | global batch size: 1024 | lm loss: 2.221024E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.226 | TFLOPs: 39.85 | 63: iteration 4460/ 5494 | consumed samples: 4567040 | consumed tokens: 9353297920 | elapsed time per iteration (s): 5.59 | learning rate: 3.558E-05 | global batch size: 1024 | lm loss: 2.219239E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.262 | TFLOPs: 40.97 | 63: iteration 4470/ 5494 | consumed samples: 4577280 | consumed tokens: 9374269440 | elapsed time per iteration (s): 5.54 | learning rate: 3.529E-05 | global batch size: 1024 | lm loss: 2.214919E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.781 | TFLOPs: 41.31 | 63: iteration 4480/ 5494 | consumed samples: 4587520 | consumed tokens: 9395240960 | elapsed time per iteration (s): 5.53 | learning rate: 3.500E-05 | global batch size: 1024 | lm loss: 2.207034E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.013 | TFLOPs: 41.36 | 63: iteration 4490/ 5494 | consumed samples: 4597760 | consumed tokens: 9416212480 | elapsed time per iteration (s): 5.95 | learning rate: 3.472E-05 | global batch size: 1024 | lm loss: 2.217616E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 172.238 | TFLOPs: 38.51 | 63: iteration 4500/ 5494 | consumed samples: 4608000 | consumed tokens: 9437184000 | elapsed time per iteration (s): 5.77 | learning rate: 3.443E-05 | global batch size: 1024 | lm loss: 2.216915E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.402 | TFLOPs: 39.66 | 63: iteration 4510/ 5494 | consumed samples: 4618240 | consumed tokens: 9458155520 | elapsed time per iteration (s): 5.61 | learning rate: 3.415E-05 | global batch size: 1024 | lm loss: 2.216415E+00 | grad norm: 0.146 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.602 | TFLOPs: 40.82 | 63: iteration 4520/ 5494 | consumed samples: 4628480 | consumed tokens: 9479127040 | elapsed time per iteration (s): 5.43 | learning rate: 3.387E-05 | global batch size: 1024 | lm loss: 2.206853E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 188.669 | TFLOPs: 42.18 | 63: iteration 4530/ 5494 | consumed samples: 4638720 | consumed tokens: 9500098560 | elapsed time per iteration (s): 5.54 | learning rate: 3.360E-05 | global batch size: 1024 | lm loss: 2.228224E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.713 | TFLOPs: 41.30 | 63: iteration 4540/ 5494 | consumed samples: 4648960 | consumed tokens: 9521070080 | elapsed time per iteration (s): 5.91 | learning rate: 3.332E-05 | global batch size: 1024 | lm loss: 2.195012E+00 | grad norm: 0.154 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.356 | TFLOPs: 38.76 | 63: iteration 4550/ 5494 | consumed samples: 4659200 | consumed tokens: 9542041600 | elapsed time per iteration (s): 5.61 | learning rate: 3.305E-05 | global batch size: 1024 | lm loss: 2.204505E+00 | grad norm: 0.144 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.412 | TFLOPs: 40.78 | 63: iteration 4560/ 5494 | consumed samples: 4669440 | consumed tokens: 9563013120 | elapsed time per iteration (s): 5.66 | learning rate: 3.278E-05 | global batch size: 1024 | lm loss: 2.198723E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.907 | TFLOPs: 40.44 | 63: iteration 4570/ 5494 | consumed samples: 4679680 | consumed tokens: 9583984640 | elapsed time per iteration (s): 5.40 | learning rate: 3.252E-05 | global batch size: 1024 | lm loss: 2.200946E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.668 | TFLOPs: 42.40 | 63: iteration 4580/ 5494 | consumed samples: 4689920 | consumed tokens: 9604956160 | elapsed time per iteration (s): 5.51 | learning rate: 3.226E-05 | global batch size: 1024 | lm loss: 2.216247E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.864 | TFLOPs: 41.55 | 63: iteration 4590/ 5494 | consumed samples: 4700160 | consumed tokens: 9625927680 | elapsed time per iteration (s): 5.52 | learning rate: 3.200E-05 | global batch size: 1024 | lm loss: 2.206020E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.670 | TFLOPs: 41.51 | 63: iteration 4600/ 5494 | consumed samples: 4710400 | consumed tokens: 9646899200 | elapsed time per iteration (s): 5.60 | learning rate: 3.174E-05 | global batch size: 1024 | lm loss: 2.222094E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.989 | TFLOPs: 40.91 | 63: iteration 4610/ 5494 | consumed samples: 4720640 | consumed tokens: 9667870720 | elapsed time per iteration (s): 5.70 | learning rate: 3.148E-05 | global batch size: 1024 | lm loss: 2.209045E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.645 | TFLOPs: 40.16 | 63: iteration 4620/ 5494 | consumed samples: 4730880 | consumed tokens: 9688842240 | elapsed time per iteration (s): 5.51 | learning rate: 3.123E-05 | global batch size: 1024 | lm loss: 2.188488E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.771 | TFLOPs: 41.53 | 63: iteration 4630/ 5494 | consumed samples: 4741120 | consumed tokens: 9709813760 | elapsed time per iteration (s): 5.72 | learning rate: 3.098E-05 | global batch size: 1024 | lm loss: 2.198794E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.140 | TFLOPs: 40.05 | 63: iteration 4640/ 5494 | consumed samples: 4751360 | consumed tokens: 9730785280 | elapsed time per iteration (s): 5.61 | learning rate: 3.073E-05 | global batch size: 1024 | lm loss: 2.216370E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.671 | TFLOPs: 40.84 | 63: iteration 4650/ 5494 | consumed samples: 4761600 | consumed tokens: 9751756800 | elapsed time per iteration (s): 5.52 | learning rate: 3.049E-05 | global batch size: 1024 | lm loss: 2.197796E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.526 | TFLOPs: 41.48 | 63: iteration 4660/ 5494 | consumed samples: 4771840 | consumed tokens: 9772728320 | elapsed time per iteration (s): 5.97 | learning rate: 3.024E-05 | global batch size: 1024 | lm loss: 2.210373E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.490 | TFLOPs: 38.34 | 63: iteration 4670/ 5494 | consumed samples: 4782080 | consumed tokens: 9793699840 | elapsed time per iteration (s): 5.65 | learning rate: 3.000E-05 | global batch size: 1024 | lm loss: 2.215950E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.228 | TFLOPs: 40.52 | 63: iteration 4680/ 5494 | consumed samples: 4792320 | consumed tokens: 9814671360 | elapsed time per iteration (s): 6.02 | learning rate: 2.977E-05 | global batch size: 1024 | lm loss: 2.209864E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 170.119 | TFLOPs: 38.03 | 63: iteration 4690/ 5494 | consumed samples: 4802560 | consumed tokens: 9835642880 | elapsed time per iteration (s): 5.57 | learning rate: 2.953E-05 | global batch size: 1024 | lm loss: 2.210680E+00 | grad norm: 0.157 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.988 | TFLOPs: 41.13 | 63: iteration 4700/ 5494 | consumed samples: 4812800 | consumed tokens: 9856614400 | elapsed time per iteration (s): 5.76 | learning rate: 2.930E-05 | global batch size: 1024 | lm loss: 2.203060E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.889 | TFLOPs: 39.77 | 63: iteration 4710/ 5494 | consumed samples: 4823040 | consumed tokens: 9877585920 | elapsed time per iteration (s): 5.55 | learning rate: 2.907E-05 | global batch size: 1024 | lm loss: 2.192744E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.575 | TFLOPs: 41.26 | 63: iteration 4720/ 5494 | consumed samples: 4833280 | consumed tokens: 9898557440 | elapsed time per iteration (s): 5.90 | learning rate: 2.885E-05 | global batch size: 1024 | lm loss: 2.202418E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.522 | TFLOPs: 38.79 | 63: iteration 4730/ 5494 | consumed samples: 4843520 | consumed tokens: 9919528960 | elapsed time per iteration (s): 5.41 | learning rate: 2.862E-05 | global batch size: 1024 | lm loss: 2.201526E+00 | grad norm: 0.140 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.197 | TFLOPs: 42.30 | 63: iteration 4740/ 5494 | consumed samples: 4853760 | consumed tokens: 9940500480 | elapsed time per iteration (s): 5.50 | learning rate: 2.840E-05 | global batch size: 1024 | lm loss: 2.202274E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.278 | TFLOPs: 41.65 | 63: iteration 4750/ 5494 | consumed samples: 4864000 | consumed tokens: 9961472000 | elapsed time per iteration (s): 5.64 | learning rate: 2.819E-05 | global batch size: 1024 | lm loss: 2.219603E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.685 | TFLOPs: 40.62 | 63: iteration 4760/ 5494 | consumed samples: 4874240 | consumed tokens: 9982443520 | elapsed time per iteration (s): 5.42 | learning rate: 2.797E-05 | global batch size: 1024 | lm loss: 2.213585E+00 | grad norm: 0.123 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 188.967 | TFLOPs: 42.25 | 63: iteration 4770/ 5494 | consumed samples: 4884480 | consumed tokens: 10003415040 | elapsed time per iteration (s): 5.52 | learning rate: 2.776E-05 | global batch size: 1024 | lm loss: 2.193128E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.464 | TFLOPs: 41.46 | 63: iteration 4780/ 5494 | consumed samples: 4894720 | consumed tokens: 10024386560 | elapsed time per iteration (s): 5.74 | learning rate: 2.755E-05 | global batch size: 1024 | lm loss: 2.220637E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.468 | TFLOPs: 39.90 | 63: iteration 4790/ 5494 | consumed samples: 4904960 | consumed tokens: 10045358080 | elapsed time per iteration (s): 5.76 | learning rate: 2.734E-05 | global batch size: 1024 | lm loss: 2.194825E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.838 | TFLOPs: 39.76 | 63: iteration 4800/ 5494 | consumed samples: 4915200 | consumed tokens: 10066329600 | elapsed time per iteration (s): 5.53 | learning rate: 2.714E-05 | global batch size: 1024 | lm loss: 2.203054E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.052 | TFLOPs: 41.37 | 63: iteration 4810/ 5494 | consumed samples: 4925440 | consumed tokens: 10087301120 | elapsed time per iteration (s): 5.83 | learning rate: 2.694E-05 | global batch size: 1024 | lm loss: 2.202653E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 175.610 | TFLOPs: 39.26 | 63: iteration 4820/ 5494 | consumed samples: 4935680 | consumed tokens: 10108272640 | elapsed time per iteration (s): 5.70 | learning rate: 2.674E-05 | global batch size: 1024 | lm loss: 2.198432E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.757 | TFLOPs: 40.19 | 63: iteration 4830/ 5494 | consumed samples: 4945920 | consumed tokens: 10129244160 | elapsed time per iteration (s): 5.75 | learning rate: 2.654E-05 | global batch size: 1024 | lm loss: 2.194366E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.159 | TFLOPs: 39.83 | 63: iteration 4840/ 5494 | consumed samples: 4956160 | consumed tokens: 10150215680 | elapsed time per iteration (s): 5.41 | learning rate: 2.635E-05 | global batch size: 1024 | lm loss: 2.190760E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.107 | TFLOPs: 42.28 | 63: iteration 4850/ 5494 | consumed samples: 4966400 | consumed tokens: 10171187200 | elapsed time per iteration (s): 5.56 | learning rate: 2.616E-05 | global batch size: 1024 | lm loss: 2.192096E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.022 | TFLOPs: 41.14 | 63: iteration 4860/ 5494 | consumed samples: 4976640 | consumed tokens: 10192158720 | elapsed time per iteration (s): 5.52 | learning rate: 2.597E-05 | global batch size: 1024 | lm loss: 2.208235E+00 | grad norm: 0.147 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.385 | TFLOPs: 41.45 | 63: iteration 4870/ 5494 | consumed samples: 4986880 | consumed tokens: 10213130240 | elapsed time per iteration (s): 5.77 | learning rate: 2.578E-05 | global batch size: 1024 | lm loss: 2.191154E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.528 | TFLOPs: 39.69 | 63: iteration 4880/ 5494 | consumed samples: 4997120 | consumed tokens: 10234101760 | elapsed time per iteration (s): 5.40 | learning rate: 2.560E-05 | global batch size: 1024 | lm loss: 2.202100E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.551 | TFLOPs: 42.38 | 63: iteration 4890/ 5494 | consumed samples: 5007360 | consumed tokens: 10255073280 | elapsed time per iteration (s): 5.66 | learning rate: 2.542E-05 | global batch size: 1024 | lm loss: 2.199622E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.874 | TFLOPs: 40.44 | 63: iteration 4900/ 5494 | consumed samples: 5017600 | consumed tokens: 10276044800 | elapsed time per iteration (s): 5.40 | learning rate: 2.525E-05 | global batch size: 1024 | lm loss: 2.182106E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.677 | TFLOPs: 42.41 | 63: iteration 4910/ 5494 | consumed samples: 5027840 | consumed tokens: 10297016320 | elapsed time per iteration (s): 5.64 | learning rate: 2.507E-05 | global batch size: 1024 | lm loss: 2.208101E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.522 | TFLOPs: 40.58 | 63: iteration 4920/ 5494 | consumed samples: 5038080 | consumed tokens: 10317987840 | elapsed time per iteration (s): 5.51 | learning rate: 2.490E-05 | global batch size: 1024 | lm loss: 2.181703E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.963 | TFLOPs: 41.58 | 63: iteration 4930/ 5494 | consumed samples: 5048320 | consumed tokens: 10338959360 | elapsed time per iteration (s): 5.74 | learning rate: 2.474E-05 | global batch size: 1024 | lm loss: 2.181737E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.245 | TFLOPs: 39.85 | 63: iteration 4940/ 5494 | consumed samples: 5058560 | consumed tokens: 10359930880 | elapsed time per iteration (s): 5.51 | learning rate: 2.457E-05 | global batch size: 1024 | lm loss: 2.198078E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.875 | TFLOPs: 41.56 | 63: iteration 4950/ 5494 | consumed samples: 5068800 | consumed tokens: 10380902400 | elapsed time per iteration (s): 5.62 | learning rate: 2.441E-05 | global batch size: 1024 | lm loss: 2.190603E+00 | grad norm: 0.124 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.054 | TFLOPs: 40.70 | 63: iteration 4960/ 5494 | consumed samples: 5079040 | consumed tokens: 10401873920 | elapsed time per iteration (s): 5.53 | learning rate: 2.425E-05 | global batch size: 1024 | lm loss: 2.184713E+00 | grad norm: 0.125 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.115 | TFLOPs: 41.39 | 63: iteration 4970/ 5494 | consumed samples: 5089280 | consumed tokens: 10422845440 | elapsed time per iteration (s): 5.61 | learning rate: 2.409E-05 | global batch size: 1024 | lm loss: 2.206613E+00 | grad norm: 0.127 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.664 | TFLOPs: 40.84 | 63: iteration 4980/ 5494 | consumed samples: 5099520 | consumed tokens: 10443816960 | elapsed time per iteration (s): 5.74 | learning rate: 2.394E-05 | global batch size: 1024 | lm loss: 2.204055E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.352 | TFLOPs: 39.87 | 63: iteration 4990/ 5494 | consumed samples: 5109760 | consumed tokens: 10464788480 | elapsed time per iteration (s): 5.58 | learning rate: 2.379E-05 | global batch size: 1024 | lm loss: 2.179170E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.391 | TFLOPs: 41.00 | 63: iteration 5000/ 5494 | consumed samples: 5120000 | consumed tokens: 10485760000 | elapsed time per iteration (s): 5.64 | learning rate: 2.364E-05 | global batch size: 1024 | lm loss: 2.211388E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.571 | TFLOPs: 40.59 | 63: ------------------------------------------------------------------------------------------ 63: valid loss at iteration 5000 | lm loss value: 2.128354E+00 | lm loss PPL: 8.401028E+00 | 63: ------------------------------------------------------------------------------------------ 0: saving checkpoint at iteration 5000 to checkpoints_8b7 0: [2022-11-25 17:45:19,182] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step5000 is begin to save! 0: [2022-11-25 17:45:19,220] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_01-model_00-model_states.pt... 0: [2022-11-25 17:45:19,220] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_01-model_01-model_states.pt... 32: [2022-11-25 17:45:19,220] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_24-model_00-model_states.pt... 32: [2022-11-25 17:45:19,220] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_24-model_01-model_states.pt... 32: [2022-11-25 17:45:19,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_24-model_00-model_states.pt. 32: [2022-11-25 17:45:19,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_24-model_01-model_states.pt. 32: [2022-11-25 17:45:19,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_25-model_01-model_states.pt... 32: [2022-11-25 17:45:19,588] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_25-model_00-model_states.pt... 0: [2022-11-25 17:45:19,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_01-model_01-model_states.pt. 0: [2022-11-25 17:45:19,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_01-model_00-model_states.pt. 0: [2022-11-25 17:45:19,589] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_03-model_01-model_states.pt... 0: [2022-11-25 17:45:19,589] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_03-model_00-model_states.pt... 0: [2022-11-25 17:45:19,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_03-model_01-model_states.pt. 0: [2022-11-25 17:45:19,828] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_04-model_01-model_states.pt... 32: [2022-11-25 17:45:19,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_25-model_00-model_states.pt. 32: [2022-11-25 17:45:19,832] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_26-model_00-model_states.pt... 0: [2022-11-25 17:45:19,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_03-model_00-model_states.pt. 0: [2022-11-25 17:45:19,833] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_04-model_00-model_states.pt... 32: [2022-11-25 17:45:19,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_25-model_01-model_states.pt. 32: [2022-11-25 17:45:19,843] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_26-model_01-model_states.pt... 0: [2022-11-25 17:45:20,075] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_04-model_01-model_states.pt. 0: [2022-11-25 17:45:20,076] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_05-model_01-model_states.pt... 0: [2022-11-25 17:45:20,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_04-model_00-model_states.pt. 0: [2022-11-25 17:45:20,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_05-model_00-model_states.pt... 32: [2022-11-25 17:45:20,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_26-model_01-model_states.pt. 32: [2022-11-25 17:45:20,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_26-model_00-model_states.pt. 32: [2022-11-25 17:45:20,104] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_27-model_01-model_states.pt... 32: [2022-11-25 17:45:20,104] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_27-model_00-model_states.pt... 0: [2022-11-25 17:45:20,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_05-model_01-model_states.pt. 0: [2022-11-25 17:45:20,345] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_06-model_01-model_states.pt... 32: [2022-11-25 17:45:20,345] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_27-model_01-model_states.pt. 32: [2022-11-25 17:45:20,346] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_28-model_01-model_states.pt... 0: [2022-11-25 17:45:20,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_05-model_00-model_states.pt. 0: [2022-11-25 17:45:20,348] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_06-model_00-model_states.pt... 32: [2022-11-25 17:45:20,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_27-model_00-model_states.pt. 32: [2022-11-25 17:45:20,354] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_28-model_00-model_states.pt... 32: [2022-11-25 17:45:20,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_28-model_01-model_states.pt. 32: [2022-11-25 17:45:20,598] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_29-model_01-model_states.pt... 0: [2022-11-25 17:45:20,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_06-model_01-model_states.pt. 0: [2022-11-25 17:45:20,607] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_07-model_01-model_states.pt... 32: [2022-11-25 17:45:20,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_28-model_00-model_states.pt. 32: [2022-11-25 17:45:20,610] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_29-model_00-model_states.pt... 0: [2022-11-25 17:45:20,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_06-model_00-model_states.pt. 0: [2022-11-25 17:45:20,614] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_07-model_00-model_states.pt... 32: [2022-11-25 17:45:20,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_29-model_01-model_states.pt. 32: [2022-11-25 17:45:20,854] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_30-model_01-model_states.pt... 32: [2022-11-25 17:45:20,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_29-model_00-model_states.pt. 32: [2022-11-25 17:45:20,860] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_30-model_00-model_states.pt... 0: [2022-11-25 17:45:20,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_07-model_01-model_states.pt. 0: [2022-11-25 17:45:20,862] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_08-model_01-model_states.pt... 0: [2022-11-25 17:45:20,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_07-model_00-model_states.pt. 0: [2022-11-25 17:45:20,870] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_08-model_00-model_states.pt... 0: [2022-11-25 17:45:21,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_08-model_01-model_states.pt. 0: [2022-11-25 17:45:21,123] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_09-model_01-model_states.pt... 0: [2022-11-25 17:45:21,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_08-model_00-model_states.pt. 0: [2022-11-25 17:45:21,126] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_09-model_00-model_states.pt... 32: [2022-11-25 17:45:21,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_30-model_01-model_states.pt. 32: [2022-11-25 17:45:21,126] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_31-model_01-model_states.pt... 32: [2022-11-25 17:45:21,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_30-model_00-model_states.pt. 32: [2022-11-25 17:45:21,145] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_31-model_00-model_states.pt... 32: [2022-11-25 17:45:21,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_31-model_01-model_states.pt. 32: [2022-11-25 17:45:21,383] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_32-model_01-model_states.pt... 0: [2022-11-25 17:45:21,388] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_09-model_01-model_states.pt. 0: [2022-11-25 17:45:21,389] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_10-model_01-model_states.pt... 32: [2022-11-25 17:45:21,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_31-model_00-model_states.pt. 32: [2022-11-25 17:45:21,393] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_32-model_00-model_states.pt... 0: [2022-11-25 17:45:21,396] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_09-model_00-model_states.pt. 0: [2022-11-25 17:45:21,396] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_10-model_00-model_states.pt... 32: [2022-11-25 17:45:21,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_32-model_01-model_states.pt. 32: [2022-11-25 17:45:21,618] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_33-model_01-model_states.pt... 32: [2022-11-25 17:45:21,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_32-model_00-model_states.pt. 32: [2022-11-25 17:45:21,625] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_33-model_00-model_states.pt... 0: [2022-11-25 17:45:21,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_10-model_01-model_states.pt. 0: [2022-11-25 17:45:21,654] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_10-model_00-model_states.pt. 0: [2022-11-25 17:45:21,654] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_11-model_01-model_states.pt... 0: [2022-11-25 17:45:21,654] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_11-model_00-model_states.pt... 32: [2022-11-25 17:45:21,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_33-model_01-model_states.pt. 32: [2022-11-25 17:45:21,881] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_33-model_00-model_states.pt. 32: [2022-11-25 17:45:21,881] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_34-model_01-model_states.pt... 32: [2022-11-25 17:45:21,881] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_34-model_00-model_states.pt... 0: [2022-11-25 17:45:21,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_11-model_01-model_states.pt. 0: [2022-11-25 17:45:21,903] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_12-model_01-model_states.pt... 0: [2022-11-25 17:45:21,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_11-model_00-model_states.pt. 0: [2022-11-25 17:45:21,913] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_12-model_00-model_states.pt... 32: [2022-11-25 17:45:22,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_34-model_01-model_states.pt. 32: [2022-11-25 17:45:22,129] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_35-model_01-model_states.pt... 0: [2022-11-25 17:45:22,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_12-model_01-model_states.pt. 0: [2022-11-25 17:45:22,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_13-model_01-model_states.pt... 32: [2022-11-25 17:45:22,153] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_34-model_00-model_states.pt. 32: [2022-11-25 17:45:22,153] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_35-model_00-model_states.pt... 0: [2022-11-25 17:45:22,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_12-model_00-model_states.pt. 0: [2022-11-25 17:45:22,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_13-model_00-model_states.pt... 0: [2022-11-25 17:45:22,383] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_13-model_01-model_states.pt. 0: [2022-11-25 17:45:22,384] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_14-model_01-model_states.pt... 32: [2022-11-25 17:45:22,394] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_35-model_00-model_states.pt. 32: [2022-11-25 17:45:22,394] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_36-model_00-model_states.pt... 0: [2022-11-25 17:45:22,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_13-model_00-model_states.pt. 0: [2022-11-25 17:45:22,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_14-model_00-model_states.pt... 32: [2022-11-25 17:45:22,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_35-model_01-model_states.pt. 32: [2022-11-25 17:45:22,405] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_36-model_01-model_states.pt... 32: [2022-11-25 17:45:22,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_36-model_01-model_states.pt. 32: [2022-11-25 17:45:22,628] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_37-model_01-model_states.pt... 32: [2022-11-25 17:45:22,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_36-model_00-model_states.pt. 32: [2022-11-25 17:45:22,629] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_37-model_00-model_states.pt... 0: [2022-11-25 17:45:22,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_14-model_01-model_states.pt. 0: [2022-11-25 17:45:22,642] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_15-model_01-model_states.pt... 0: [2022-11-25 17:45:22,660] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_14-model_00-model_states.pt. 0: [2022-11-25 17:45:22,661] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_15-model_00-model_states.pt... 32: [2022-11-25 17:45:22,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_37-model_01-model_states.pt. 32: [2022-11-25 17:45:22,886] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_37-model_00-model_states.pt. 32: [2022-11-25 17:45:22,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_38-model_01-model_states.pt... 32: [2022-11-25 17:45:22,886] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_38-model_00-model_states.pt... 0: [2022-11-25 17:45:22,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_15-model_01-model_states.pt. 0: [2022-11-25 17:45:22,892] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_16-model_01-model_states.pt... 0: [2022-11-25 17:45:22,895] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_15-model_00-model_states.pt. 0: [2022-11-25 17:45:22,895] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_16-model_00-model_states.pt... 32: [2022-11-25 17:45:23,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_38-model_01-model_states.pt. 32: [2022-11-25 17:45:23,136] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_39-model_01-model_states.pt... 32: [2022-11-25 17:45:23,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_38-model_00-model_states.pt. 32: [2022-11-25 17:45:23,145] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_39-model_00-model_states.pt... 0: [2022-11-25 17:45:23,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_16-model_01-model_states.pt. 0: [2022-11-25 17:45:23,151] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_17-model_01-model_states.pt... 0: [2022-11-25 17:45:23,167] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_16-model_00-model_states.pt. 0: [2022-11-25 17:45:23,168] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_17-model_00-model_states.pt... 32: [2022-11-25 17:45:23,369] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_39-model_01-model_states.pt. 32: [2022-11-25 17:45:23,369] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_40-model_01-model_states.pt... 32: [2022-11-25 17:45:23,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_39-model_00-model_states.pt. 32: [2022-11-25 17:45:23,381] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_40-model_00-model_states.pt... 0: [2022-11-25 17:45:23,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_17-model_00-model_states.pt. 0: [2022-11-25 17:45:23,413] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_18-model_00-model_states.pt... 0: [2022-11-25 17:45:23,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_17-model_01-model_states.pt. 0: [2022-11-25 17:45:23,416] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_18-model_01-model_states.pt... 32: [2022-11-25 17:45:23,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_40-model_01-model_states.pt. 32: [2022-11-25 17:45:23,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_41-model_01-model_states.pt... 32: [2022-11-25 17:45:23,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_40-model_00-model_states.pt. 32: [2022-11-25 17:45:23,612] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_41-model_00-model_states.pt... 0: [2022-11-25 17:45:23,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_18-model_00-model_states.pt. 0: [2022-11-25 17:45:23,657] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_18-model_01-model_states.pt. 0: [2022-11-25 17:45:23,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_19-model_00-model_states.pt... 0: [2022-11-25 17:45:23,657] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_19-model_01-model_states.pt... 32: [2022-11-25 17:45:23,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_41-model_01-model_states.pt. 32: [2022-11-25 17:45:23,861] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_42-model_01-model_states.pt... 32: [2022-11-25 17:45:23,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_41-model_00-model_states.pt. 32: [2022-11-25 17:45:23,873] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_42-model_00-model_states.pt... 0: [2022-11-25 17:45:23,891] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_19-model_01-model_states.pt. 0: [2022-11-25 17:45:23,891] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_20-model_01-model_states.pt... 0: [2022-11-25 17:45:23,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_19-model_00-model_states.pt. 0: [2022-11-25 17:45:23,900] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_20-model_00-model_states.pt... 32: [2022-11-25 17:45:24,114] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_42-model_01-model_states.pt. 32: [2022-11-25 17:45:24,114] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_43-model_01-model_states.pt... 32: [2022-11-25 17:45:24,127] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_42-model_00-model_states.pt. 32: [2022-11-25 17:45:24,128] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_43-model_00-model_states.pt... 0: [2022-11-25 17:45:24,147] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_20-model_00-model_states.pt. 0: [2022-11-25 17:45:24,147] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_21-model_00-model_states.pt... 0: [2022-11-25 17:45:24,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_20-model_01-model_states.pt. 0: [2022-11-25 17:45:24,150] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_21-model_01-model_states.pt... 32: [2022-11-25 17:45:24,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_43-model_01-model_states.pt. 32: [2022-11-25 17:45:24,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_43-model_00-model_states.pt. 32: [2022-11-25 17:45:24,371] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_44-model_01-model_states.pt... 32: [2022-11-25 17:45:24,371] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_44-model_00-model_states.pt... 0: [2022-11-25 17:45:24,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_21-model_01-model_states.pt. 0: [2022-11-25 17:45:24,398] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_22-model_01-model_states.pt... 0: [2022-11-25 17:45:24,407] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_21-model_00-model_states.pt. 0: [2022-11-25 17:45:24,407] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_22-model_00-model_states.pt... 32: [2022-11-25 17:45:24,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_44-model_01-model_states.pt. 32: [2022-11-25 17:45:24,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_46-model_01-model_states.pt... 32: [2022-11-25 17:45:24,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_44-model_00-model_states.pt. 32: [2022-11-25 17:45:24,602] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_46-model_00-model_states.pt... 32: [2022-11-25 17:45:24,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_46-model_01-model_states.pt. 32: [2022-11-25 17:45:24,624] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/mp_rank_03_model_states.pt... 32: [2022-11-25 17:45:24,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_46-model_00-model_states.pt. 32: [2022-11-25 17:45:24,626] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/mp_rank_02_model_states.pt... 32: [2022-11-25 17:45:24,629] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/mp_rank_02_model_states.pt. 32: [2022-11-25 17:45:24,630] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/mp_rank_03_model_states.pt. 0: [2022-11-25 17:45:24,644] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_22-model_01-model_states.pt. 0: [2022-11-25 17:45:24,644] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_23-model_01-model_states.pt... 0: [2022-11-25 17:45:24,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_22-model_00-model_states.pt. 0: [2022-11-25 17:45:24,651] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/layer_23-model_00-model_states.pt... 0: [2022-11-25 17:45:24,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_23-model_01-model_states.pt. 0: [2022-11-25 17:45:24,899] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: checkpoints_8b7/global_step5000/mp_rank_01_model_states.pt 0: [2022-11-25 17:45:24,899] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/mp_rank_01_model_states.pt... 0: [2022-11-25 17:45:24,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/layer_23-model_00-model_states.pt. 0: [2022-11-25 17:45:24,903] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_8b7/global_step5000/mp_rank_00_model_states.pt 0: [2022-11-25 17:45:24,903] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/mp_rank_00_model_states.pt... 0: [2022-11-25 17:45:24,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/mp_rank_00_model_states.pt. 0: [2022-11-25 17:45:24,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/mp_rank_01_model_states.pt. 62: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt... 62: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt... 62: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt... 62: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt... 46: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt... 46: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt... 46: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt... 46: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt... 44: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt... 44: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt... 44: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt... 48: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt... 48: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt... 48: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt... 48: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt... 42: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt... 42: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt... 42: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt... 42: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt... 32: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt... 57: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt... 57: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt... 57: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt... 57: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt... 63: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt... 63: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt... 63: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt... 63: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt... 39: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt... 39: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt... 39: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt... 39: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt... 53: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt... 53: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt... 53: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt... 53: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt... 55: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt... 55: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt... 55: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt... 51: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt... 51: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt... 51: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt... 51: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt... 33: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt... 33: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt... 33: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt... 33: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt... 61: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt... 61: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt... 59: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt... 59: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt... 59: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt... 59: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt... 35: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt... 35: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt... 35: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt... 35: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt... 37: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt... 37: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt... 37: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt... 37: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt... 58: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt... 58: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt... 58: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt... 58: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt... 40: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt... 40: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt... 40: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt... 40: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt... 52: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt... 52: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt... 52: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt... 52: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt... 4: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 56: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt... 56: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt... 56: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt... 54: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt... 54: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt... 54: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt... 54: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt... 62: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt... 62: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt... 62: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt... 36: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt... 36: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt... 36: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt... 38: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt... 38: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt... 0: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 49: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt... 49: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt... 49: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt... 47: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt... 47: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt... 47: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt... 47: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt... 41: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt... 41: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt... 45: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt... 45: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt... 45: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt... 9: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 43: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt... 43: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt... 43: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt... 43: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt... 34: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt... 34: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt... 34: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt... 34: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt... 44: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt... 44: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt... 48: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt... 48: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt... 48: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt... 48: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt... 50: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt... 50: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt... 50: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt... 50: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt... 42: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt... 42: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt... 32: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt... 57: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt... 63: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt... 63: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt... 39: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt... 53: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt... 53: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt... 55: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt... 55: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt... 55: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt... 51: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt... 51: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt... 33: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt... 33: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt... 1: [2022-11-25 17:45:25,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 1: [2022-11-25 17:45:25,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 1: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 61: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt... 61: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt... 59: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt... 13: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 13: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 13: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 35: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt... 5: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 29: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 29: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 29: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 29: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 29: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 37: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt... 58: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt... 40: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt... 40: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt... 60: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt... 60: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt... 60: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt... 60: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt... 52: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt... 30: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 30: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 4: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 4: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 56: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt... 54: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt... 62: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt... 36: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt... 36: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt... 36: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt... 36: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt... 28: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 38: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt... 38: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt... 0: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 0: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 2: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 24: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 24: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 49: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt... 49: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt... 47: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt... 47: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt... 41: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt... 41: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt... 41: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt... 41: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt... 41: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt... 45: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt... 45: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt... 9: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 43: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt... 43: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt... 27: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 27: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 25: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 3: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 7: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 7: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 7: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 7: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 17: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 17: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 17: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 23: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 31: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 19: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 19: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 34: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt... 34: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt... 46: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt... 46: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt... 44: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt... 6: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 16: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 16: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 16: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 18: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 10: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 20: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 20: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 8: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 8: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 50: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt... 50: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt... 42: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt... 32: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt... 12: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 12: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 12: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 26: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 26: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 26: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 26: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 14: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 15: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 15: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 22: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 21: [2022-11-25 17:45:25,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 21: [2022-11-25 17:45:25,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 21: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 57: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt... 57: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt... 63: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt... 63: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt... 39: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt... 39: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt... 53: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt... 53: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt... 55: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt... 51: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt... 33: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt... 33: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt... 1: [2022-11-25 17:45:25,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 61: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt... 61: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt... 59: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt... 59: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt... 13: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 13: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 35: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt... 5: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 5: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 29: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 29: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 37: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt... 37: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt... 37: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt... 58: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt... 58: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt... 58: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt... 40: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt... 60: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt... 60: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt... 60: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt... 60: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt... 52: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt... 52: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt... 52: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt... 30: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 4: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 56: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt... 56: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt... 56: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt... 54: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt... 54: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt... 54: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt... 36: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt... 28: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 28: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 38: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt... 38: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt... 0: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 0: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 2: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 2: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 24: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 49: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt... 49: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt... 47: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt... 47: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt... 41: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt... 45: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt... 45: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt... 9: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 43: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt... 43: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt... 27: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 27: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 25: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 25: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 25: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 3: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 3: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 3: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 3: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 7: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 17: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 17: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 23: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 11: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 11: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 11: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 11: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 31: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 19: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 34: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt... 46: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt... 44: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt... 6: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 16: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 18: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 10: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 10: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 10: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 10: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 20: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 20: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 8: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 8: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 8: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 50: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt... 42: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt... 32: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt... 12: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 26: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 26: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 14: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 14: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 14: [2022-11-25 17:45:25,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 15: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 15: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 15: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 22: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 21: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 57: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt... 39: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt... 55: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt... 51: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt... 1: [2022-11-25 17:45:25,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 61: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt... 59: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt... 13: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 35: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt... 35: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt... 5: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 29: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 40: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt... 30: [2022-11-25 17:45:25,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 30: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 4: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 56: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt... 28: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 38: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt... 0: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 2: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 24: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 24: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 24: [2022-11-25 17:45:25,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 49: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt... 45: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt... 9: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 27: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 25: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 3: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 3: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 7: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 17: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 23: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 11: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 31: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 19: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 34: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt... 46: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt... 44: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt... 6: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 6: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 16: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 18: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 10: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 20: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 20: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 20: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 8: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 50: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt... 32: [2022-11-25 17:45:25,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt... 12: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 26: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 14: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 14: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 15: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 15: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 22: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 22: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 22: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 21: [2022-11-25 17:45:25,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 21: [2022-11-25 17:45:25,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 21: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 1: [2022-11-25 17:45:25,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 1: [2022-11-25 17:45:25,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 1: [2022-11-25 17:45:25,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 61: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt... 13: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 5: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 5: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 30: [2022-11-25 17:45:25,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 4: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 28: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 28: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 38: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt... 0: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 2: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 2: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 24: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 9: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 9: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 27: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 25: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 3: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 7: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 17: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 23: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 11: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 11: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 31: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 19: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 6: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 6: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 16: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 18: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 10: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 20: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 8: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 32: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt... 12: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 12: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 26: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 14: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 14: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 15: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 22: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 21: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 13: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 5: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 5: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 30: [2022-11-25 17:45:25,140] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 30: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 4: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 28: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 0: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 2: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 24: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 9: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 27: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 25: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 7: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 17: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 23: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 31: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 31: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 19: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 6: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 16: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 18: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 10: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 8: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 32: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt... 12: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 22: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 4: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 28: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 2: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 9: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 27: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 25: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 23: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 31: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 19: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 6: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 16: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 18: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 32: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt... 22: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 23: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 31: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 19: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 18: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 23: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 18: [2022-11-25 17:45:25,141] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 32: [2022-11-25 17:45:25,389] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt. 32: [2022-11-25 17:45:25,389] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt 32: [2022-11-25 17:45:25,389] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 32: [2022-11-25 17:45:25,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt. 32: [2022-11-25 17:45:25,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt 32: [2022-11-25 17:45:25,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: [2022-11-25 17:45:25,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 0: [2022-11-25 17:45:25,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 0: [2022-11-25 17:45:25,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 15: [2022-11-25 17:45:25,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 15: [2022-11-25 17:45:25,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 15: [2022-11-25 17:45:25,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: [2022-11-25 17:45:25,403] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 13: [2022-11-25 17:45:25,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 13: [2022-11-25 17:45:25,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 13: [2022-11-25 17:45:25,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 16: [2022-11-25 17:45:25,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 16: [2022-11-25 17:45:25,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 16: [2022-11-25 17:45:25,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 8: [2022-11-25 17:45:25,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 8: [2022-11-25 17:45:25,409] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 8: [2022-11-25 17:45:25,409] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: [2022-11-25 17:45:25,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-25 17:45:25,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-25 17:45:25,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 17:45:25,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt. 42: [2022-11-25 17:45:25,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt 42: [2022-11-25 17:45:25,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 32: [2022-11-25 17:45:25,411] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt. 32: [2022-11-25 17:45:25,411] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt 32: [2022-11-25 17:45:25,411] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 50: [2022-11-25 17:45:25,412] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt. 50: [2022-11-25 17:45:25,412] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt 50: [2022-11-25 17:45:25,412] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 60: [2022-11-25 17:45:25,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt. 60: [2022-11-25 17:45:25,413] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt 60: [2022-11-25 17:45:25,413] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 17:45:25,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt. 53: [2022-11-25 17:45:25,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt. 53: [2022-11-25 17:45:25,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt 53: [2022-11-25 17:45:25,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 20: [2022-11-25 17:45:25,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 20: [2022-11-25 17:45:25,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 20: [2022-11-25 17:45:25,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 19: [2022-11-25 17:45:25,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 19: [2022-11-25 17:45:25,417] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 19: [2022-11-25 17:45:25,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 19: [2022-11-25 17:45:25,417] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 19: [2022-11-25 17:45:25,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 19: [2022-11-25 17:45:25,417] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 17:45:25,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 13: [2022-11-25 17:45:25,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 13: [2022-11-25 17:45:25,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 45: [2022-11-25 17:45:25,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt. 45: [2022-11-25 17:45:25,418] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt 45: [2022-11-25 17:45:25,418] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 28: [2022-11-25 17:45:25,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 17: [2022-11-25 17:45:25,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 23: [2022-11-25 17:45:25,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 28: [2022-11-25 17:45:25,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 17: [2022-11-25 17:45:25,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 23: [2022-11-25 17:45:25,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 28: [2022-11-25 17:45:25,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 17: [2022-11-25 17:45:25,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 23: [2022-11-25 17:45:25,419] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 8: [2022-11-25 17:45:25,419] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 8: [2022-11-25 17:45:25,419] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 8: [2022-11-25 17:45:25,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 26: [2022-11-25 17:45:25,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 26: [2022-11-25 17:45:25,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 26: [2022-11-25 17:45:25,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 43: [2022-11-25 17:45:25,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt. 43: [2022-11-25 17:45:25,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt 26: [2022-11-25 17:45:25,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 51: [2022-11-25 17:45:25,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt. 43: [2022-11-25 17:45:25,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 26: [2022-11-25 17:45:25,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 26: [2022-11-25 17:45:25,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 51: [2022-11-25 17:45:25,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt 51: [2022-11-25 17:45:25,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 17:45:25,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt 56: [2022-11-25 17:45:25,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 51: [2022-11-25 17:45:25,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt. 51: [2022-11-25 17:45:25,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt 51: [2022-11-25 17:45:25,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 57: [2022-11-25 17:45:25,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt. 57: [2022-11-25 17:45:25,424] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt 57: [2022-11-25 17:45:25,424] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 60: [2022-11-25 17:45:25,423] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt. 60: [2022-11-25 17:45:25,423] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt 60: [2022-11-25 17:45:25,423] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 44: [2022-11-25 17:45:25,424] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt. 44: [2022-11-25 17:45:25,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt 44: [2022-11-25 17:45:25,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 16: [2022-11-25 17:45:25,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 16: [2022-11-25 17:45:25,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 47: [2022-11-25 17:45:25,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt. 16: [2022-11-25 17:45:25,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 47: [2022-11-25 17:45:25,425] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt 47: [2022-11-25 17:45:25,425] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 17: [2022-11-25 17:45:25,425] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 17: [2022-11-25 17:45:25,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 17: [2022-11-25 17:45:25,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 15: [2022-11-25 17:45:25,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 56: [2022-11-25 17:45:25,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt. 15: [2022-11-25 17:45:25,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 56: [2022-11-25 17:45:25,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt 15: [2022-11-25 17:45:25,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 17:45:25,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 53: [2022-11-25 17:45:25,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt. 53: [2022-11-25 17:45:25,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt 53: [2022-11-25 17:45:25,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 60: [2022-11-25 17:45:25,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt. 60: [2022-11-25 17:45:25,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt 60: [2022-11-25 17:45:25,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 36: [2022-11-25 17:45:25,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt. 36: [2022-11-25 17:45:25,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt 36: [2022-11-25 17:45:25,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 31: [2022-11-25 17:45:25,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 31: [2022-11-25 17:45:25,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 31: [2022-11-25 17:45:25,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 23: [2022-11-25 17:45:25,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 23: [2022-11-25 17:45:25,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 23: [2022-11-25 17:45:25,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: [2022-11-25 17:45:25,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-25 17:45:25,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 16: [2022-11-25 17:45:25,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 31: [2022-11-25 17:45:25,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 16: [2022-11-25 17:45:25,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 43: [2022-11-25 17:45:25,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt. 31: [2022-11-25 17:45:25,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 16: [2022-11-25 17:45:25,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 43: [2022-11-25 17:45:25,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt 31: [2022-11-25 17:45:25,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 43: [2022-11-25 17:45:25,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 57: [2022-11-25 17:45:25,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt. 57: [2022-11-25 17:45:25,429] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt 57: [2022-11-25 17:45:25,429] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 28: [2022-11-25 17:45:25,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 28: [2022-11-25 17:45:25,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 28: [2022-11-25 17:45:25,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 53: [2022-11-25 17:45:25,430] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt. 53: [2022-11-25 17:45:25,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt 53: [2022-11-25 17:45:25,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 57: [2022-11-25 17:45:25,431] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt. 57: [2022-11-25 17:45:25,431] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt 57: [2022-11-25 17:45:25,431] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 51: [2022-11-25 17:45:25,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt. 51: [2022-11-25 17:45:25,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt 15: [2022-11-25 17:45:25,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 51: [2022-11-25 17:45:25,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 17:45:25,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 15: [2022-11-25 17:45:25,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 15: [2022-11-25 17:45:25,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 17:45:25,432] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 13: [2022-11-25 17:45:25,432] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 62: [2022-11-25 17:45:25,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt. 62: [2022-11-25 17:45:25,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt. 62: [2022-11-25 17:45:25,432] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt. 45: [2022-11-25 17:45:25,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt. 62: [2022-11-25 17:45:25,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt 62: [2022-11-25 17:45:25,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt 62: [2022-11-25 17:45:25,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt 45: [2022-11-25 17:45:25,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt 62: [2022-11-25 17:45:25,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 45: [2022-11-25 17:45:25,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 62: [2022-11-25 17:45:25,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 62: [2022-11-25 17:45:25,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 62: [2022-11-25 17:45:25,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt. 62: [2022-11-25 17:45:25,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt 62: [2022-11-25 17:45:25,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 36: [2022-11-25 17:45:25,433] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt. 36: [2022-11-25 17:45:25,433] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt 36: [2022-11-25 17:45:25,433] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 23: [2022-11-25 17:45:25,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 23: [2022-11-25 17:45:25,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 23: [2022-11-25 17:45:25,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 17:45:25,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt. 56: [2022-11-25 17:45:25,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt 56: [2022-11-25 17:45:25,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 16: [2022-11-25 17:45:25,436] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 16: [2022-11-25 17:45:25,436] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 16: [2022-11-25 17:45:25,436] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 8: [2022-11-25 17:45:25,437] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 8: [2022-11-25 17:45:25,437] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 8: [2022-11-25 17:45:25,437] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 17:45:25,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 13: [2022-11-25 17:45:25,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 20: [2022-11-25 17:45:25,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 50: [2022-11-25 17:45:25,438] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt. 13: [2022-11-25 17:45:25,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 50: [2022-11-25 17:45:25,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt 50: [2022-11-25 17:45:25,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 23: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 23: [2022-11-25 17:45:25,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 23: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 1: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 1: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 20: [2022-11-25 17:45:25,438] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 1: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 1: [2022-11-25 17:45:25,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 1: [2022-11-25 17:45:25,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 20: [2022-11-25 17:45:25,438] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 1: [2022-11-25 17:45:25,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 1: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 1: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 20: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 1: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 20: [2022-11-25 17:45:25,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 1: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 27: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 20: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 8: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 1: [2022-11-25 17:45:25,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 27: [2022-11-25 17:45:25,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 1: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 27: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 8: [2022-11-25 17:45:25,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 8: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 17: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 17: [2022-11-25 17:45:25,439] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 17: [2022-11-25 17:45:25,439] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 45: [2022-11-25 17:45:25,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt. 45: [2022-11-25 17:45:25,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt 45: [2022-11-25 17:45:25,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: [2022-11-25 17:45:25,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-25 17:45:25,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-25 17:45:25,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 32: [2022-11-25 17:45:25,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt. 32: [2022-11-25 17:45:25,441] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt 32: [2022-11-25 17:45:25,441] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 17:45:25,441] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt. 42: [2022-11-25 17:45:25,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt 42: [2022-11-25 17:45:25,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 60: [2022-11-25 17:45:25,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt. 60: [2022-11-25 17:45:25,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt 60: [2022-11-25 17:45:25,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 31: [2022-11-25 17:45:25,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 31: [2022-11-25 17:45:25,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 31: [2022-11-25 17:45:25,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 15: [2022-11-25 17:45:25,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 15: [2022-11-25 17:45:25,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 15: [2022-11-25 17:45:25,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 44: [2022-11-25 17:45:25,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt. 44: [2022-11-25 17:45:25,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt 44: [2022-11-25 17:45:25,442] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 19: [2022-11-25 17:45:25,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 47: [2022-11-25 17:45:25,442] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt. 47: [2022-11-25 17:45:25,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt 47: [2022-11-25 17:45:25,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 47: [2022-11-25 17:45:25,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt. 47: [2022-11-25 17:45:25,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt 47: [2022-11-25 17:45:25,443] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt. 47: [2022-11-25 17:45:25,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 47: [2022-11-25 17:45:25,443] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt 47: [2022-11-25 17:45:25,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 17: [2022-11-25 17:45:25,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 17: [2022-11-25 17:45:25,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 17: [2022-11-25 17:45:25,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 22: [2022-11-25 17:45:25,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 22: [2022-11-25 17:45:25,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 22: [2022-11-25 17:45:25,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 22: [2022-11-25 17:45:25,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 22: [2022-11-25 17:45:25,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 22: [2022-11-25 17:45:25,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 56: [2022-11-25 17:45:25,444] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt. 56: [2022-11-25 17:45:25,444] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt 22: [2022-11-25 17:45:25,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 22: [2022-11-25 17:45:25,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 22: [2022-11-25 17:45:25,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 17:45:25,444] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 44: [2022-11-25 17:45:25,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt. 44: [2022-11-25 17:45:25,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt 44: [2022-11-25 17:45:25,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 17:45:25,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 21: [2022-11-25 17:45:25,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 21: [2022-11-25 17:45:25,446] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 27: [2022-11-25 17:45:25,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 27: [2022-11-25 17:45:25,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 43: [2022-11-25 17:45:25,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt. 27: [2022-11-25 17:45:25,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 43: [2022-11-25 17:45:25,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt 43: [2022-11-25 17:45:25,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 17:45:25,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 21: [2022-11-25 17:45:25,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 21: [2022-11-25 17:45:25,446] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 21: [2022-11-25 17:45:25,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 17:45:25,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 17:45:25,446] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 2: [2022-11-25 17:45:25,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 2: [2022-11-25 17:45:25,448] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 2: [2022-11-25 17:45:25,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 2: [2022-11-25 17:45:25,448] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 2: [2022-11-25 17:45:25,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 2: [2022-11-25 17:45:25,448] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 52: [2022-11-25 17:45:25,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt. 52: [2022-11-25 17:45:25,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt. 52: [2022-11-25 17:45:25,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt 52: [2022-11-25 17:45:25,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt 52: [2022-11-25 17:45:25,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 52: [2022-11-25 17:45:25,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 17:45:25,449] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt. 42: [2022-11-25 17:45:25,449] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt 42: [2022-11-25 17:45:25,449] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 49: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt. 49: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt. 49: [2022-11-25 17:45:25,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt 49: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 49: [2022-11-25 17:45:25,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt 61: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt. 61: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt. 61: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt. 49: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 51: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt. 61: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt. 61: [2022-11-25 17:45:25,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt 54: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt. 61: [2022-11-25 17:45:25,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt 58: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt. 54: [2022-11-25 17:45:25,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt 54: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt. 54: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt. 51: [2022-11-25 17:45:25,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt 61: [2022-11-25 17:45:25,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt 58: [2022-11-25 17:45:25,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt 54: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 51: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 61: [2022-11-25 17:45:25,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt 58: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 54: [2022-11-25 17:45:25,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt 54: [2022-11-25 17:45:25,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt 53: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt. 61: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 61: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 54: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 54: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 53: [2022-11-25 17:45:25,450] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt 61: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 61: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 53: [2022-11-25 17:45:25,450] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 49: [2022-11-25 17:45:25,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt. 58: [2022-11-25 17:45:25,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt. 58: [2022-11-25 17:45:25,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt 49: [2022-11-25 17:45:25,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt 49: [2022-11-25 17:45:25,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 58: [2022-11-25 17:45:25,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 49: [2022-11-25 17:45:25,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt. 49: [2022-11-25 17:45:25,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt 49: [2022-11-25 17:45:25,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 48: [2022-11-25 17:45:25,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt. 48: [2022-11-25 17:45:25,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt. 48: [2022-11-25 17:45:25,451] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt. 48: [2022-11-25 17:45:25,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt 48: [2022-11-25 17:45:25,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt 48: [2022-11-25 17:45:25,451] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt 48: [2022-11-25 17:45:25,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 48: [2022-11-25 17:45:25,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 48: [2022-11-25 17:45:25,451] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 43: [2022-11-25 17:45:25,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt. 33: [2022-11-25 17:45:25,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt. 33: [2022-11-25 17:45:25,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt. 43: [2022-11-25 17:45:25,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt 33: [2022-11-25 17:45:25,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt 33: [2022-11-25 17:45:25,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt 43: [2022-11-25 17:45:25,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 33: [2022-11-25 17:45:25,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 33: [2022-11-25 17:45:25,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 19: [2022-11-25 17:45:25,442] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 19: [2022-11-25 17:45:25,443] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 27: [2022-11-25 17:45:25,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 27: [2022-11-25 17:45:25,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 27: [2022-11-25 17:45:25,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 30: [2022-11-25 17:45:25,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 30: [2022-11-25 17:45:25,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 20: [2022-11-25 17:45:25,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 30: [2022-11-25 17:45:25,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 20: [2022-11-25 17:45:25,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 20: [2022-11-25 17:45:25,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 30: [2022-11-25 17:45:25,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 30: [2022-11-25 17:45:25,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 30: [2022-11-25 17:45:25,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 52: [2022-11-25 17:45:25,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt. 38: [2022-11-25 17:45:25,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt. 38: [2022-11-25 17:45:25,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt. 58: [2022-11-25 17:45:25,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt. 52: [2022-11-25 17:45:25,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt 38: [2022-11-25 17:45:25,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt 38: [2022-11-25 17:45:25,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt 58: [2022-11-25 17:45:25,453] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt 52: [2022-11-25 17:45:25,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 38: [2022-11-25 17:45:25,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 38: [2022-11-25 17:45:25,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 44: [2022-11-25 17:45:25,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt. 42: [2022-11-25 17:45:25,453] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt. 58: [2022-11-25 17:45:25,453] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 44: [2022-11-25 17:45:25,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt 42: [2022-11-25 17:45:25,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt 44: [2022-11-25 17:45:25,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 17:45:25,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 2: [2022-11-25 17:45:25,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 2: [2022-11-25 17:45:25,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 2: [2022-11-25 17:45:25,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 38: [2022-11-25 17:45:25,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt. 38: [2022-11-25 17:45:25,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt 38: [2022-11-25 17:45:25,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 2: [2022-11-25 17:45:25,455] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 2: [2022-11-25 17:45:25,455] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 2: [2022-11-25 17:45:25,455] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 30: [2022-11-25 17:45:25,458] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 30: [2022-11-25 17:45:25,458] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 30: [2022-11-25 17:45:25,458] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 41: [2022-11-25 17:45:25,460] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt. 41: [2022-11-25 17:45:25,460] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt 41: [2022-11-25 17:45:25,460] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 34: [2022-11-25 17:45:25,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt. 34: [2022-11-25 17:45:25,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt. 34: [2022-11-25 17:45:25,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt. 34: [2022-11-25 17:45:25,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt 34: [2022-11-25 17:45:25,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt 34: [2022-11-25 17:45:25,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt 34: [2022-11-25 17:45:25,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 34: [2022-11-25 17:45:25,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 34: [2022-11-25 17:45:25,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 46: [2022-11-25 17:45:25,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt. 46: [2022-11-25 17:45:25,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt 46: [2022-11-25 17:45:25,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 19: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 59: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt. 59: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt. 59: [2022-11-25 17:45:25,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt 59: [2022-11-25 17:45:25,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt 59: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 59: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 59: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt. 59: [2022-11-25 17:45:25,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt 59: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 55: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt. 55: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt. 55: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt. 55: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt. 46: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt. 6: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 55: [2022-11-25 17:45:25,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt 55: [2022-11-25 17:45:25,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt 46: [2022-11-25 17:45:25,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt 6: [2022-11-25 17:45:25,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 55: [2022-11-25 17:45:25,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt 46: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 6: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 55: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 55: [2022-11-25 17:45:25,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt 55: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 55: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 55: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 31: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 28: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 28: [2022-11-25 17:45:25,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 28: [2022-11-25 17:45:25,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 31: [2022-11-25 17:45:25,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 31: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 41: [2022-11-25 17:45:25,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt. 45: [2022-11-25 17:45:25,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt. 45: [2022-11-25 17:45:25,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt 45: [2022-11-25 17:45:25,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 41: [2022-11-25 17:45:25,465] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt 41: [2022-11-25 17:45:25,465] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 26: [2022-11-25 17:45:25,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 26: [2022-11-25 17:45:25,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 26: [2022-11-25 17:45:25,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 26: [2022-11-25 17:45:25,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 26: [2022-11-25 17:45:25,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 26: [2022-11-25 17:45:25,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 33: [2022-11-25 17:45:25,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt. 33: [2022-11-25 17:45:25,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt 33: [2022-11-25 17:45:25,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 41: [2022-11-25 17:45:25,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt. 41: [2022-11-25 17:45:25,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt 41: [2022-11-25 17:45:25,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 50: [2022-11-25 17:45:25,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt. 50: [2022-11-25 17:45:25,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt 50: [2022-11-25 17:45:25,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 24: [2022-11-25 17:45:25,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 24: [2022-11-25 17:45:25,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 24: [2022-11-25 17:45:25,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 24: [2022-11-25 17:45:25,466] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 24: [2022-11-25 17:45:25,466] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 24: [2022-11-25 17:45:25,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 46: [2022-11-25 17:45:25,468] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt. 24: [2022-11-25 17:45:25,466] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 24: [2022-11-25 17:45:25,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 24: [2022-11-25 17:45:25,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 46: [2022-11-25 17:45:25,468] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt 46: [2022-11-25 17:45:25,468] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 36: [2022-11-25 17:45:25,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt. 19: [2022-11-25 17:45:25,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 19: [2022-11-25 17:45:25,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 36: [2022-11-25 17:45:25,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt 36: [2022-11-25 17:45:25,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 12: [2022-11-25 17:45:25,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 12: [2022-11-25 17:45:25,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 12: [2022-11-25 17:45:25,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 12: [2022-11-25 17:45:25,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 12: [2022-11-25 17:45:25,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 12: [2022-11-25 17:45:25,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 12: [2022-11-25 17:45:25,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 12: [2022-11-25 17:45:25,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 12: [2022-11-25 17:45:25,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 37: [2022-11-25 17:45:25,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt. 25: [2022-11-25 17:45:25,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 25: [2022-11-25 17:45:25,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 25: [2022-11-25 17:45:25,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 37: [2022-11-25 17:45:25,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt. 37: [2022-11-25 17:45:25,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt. 37: [2022-11-25 17:45:25,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt 37: [2022-11-25 17:45:25,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt 37: [2022-11-25 17:45:25,469] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt 37: [2022-11-25 17:45:25,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 37: [2022-11-25 17:45:25,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 37: [2022-11-25 17:45:25,469] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 29: [2022-11-25 17:45:25,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 29: [2022-11-25 17:45:25,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 29: [2022-11-25 17:45:25,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 29: [2022-11-25 17:45:25,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 29: [2022-11-25 17:45:25,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 29: [2022-11-25 17:45:25,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 29: [2022-11-25 17:45:25,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 29: [2022-11-25 17:45:25,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 29: [2022-11-25 17:45:25,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 15: [2022-11-25 17:45:25,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 30: [2022-11-25 17:45:25,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 15: [2022-11-25 17:45:25,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 15: [2022-11-25 17:45:25,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 30: [2022-11-25 17:45:25,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 30: [2022-11-25 17:45:25,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 35: [2022-11-25 17:45:25,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt. 35: [2022-11-25 17:45:25,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt. 35: [2022-11-25 17:45:25,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt. 35: [2022-11-25 17:45:25,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt 35: [2022-11-25 17:45:25,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt 35: [2022-11-25 17:45:25,473] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt 35: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 35: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 7: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 7: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 35: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 7: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 7: [2022-11-25 17:45:25,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 19: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 63: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt. 63: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt. 63: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt. 7: [2022-11-25 17:45:25,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 19: [2022-11-25 17:45:25,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 63: [2022-11-25 17:45:25,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt 63: [2022-11-25 17:45:25,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt 63: [2022-11-25 17:45:25,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt 7: [2022-11-25 17:45:25,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 19: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 7: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 7: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 17:45:25,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 7: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 10: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 10: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 10: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 10: [2022-11-25 17:45:25,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 10: [2022-11-25 17:45:25,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 10: [2022-11-25 17:45:25,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 10: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 10: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 10: [2022-11-25 17:45:25,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 28: [2022-11-25 17:45:25,475] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 28: [2022-11-25 17:45:25,475] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 28: [2022-11-25 17:45:25,475] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 39: [2022-11-25 17:45:25,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt. 39: [2022-11-25 17:45:25,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt. 39: [2022-11-25 17:45:25,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt. 39: [2022-11-25 17:45:25,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt. 39: [2022-11-25 17:45:25,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt 39: [2022-11-25 17:45:25,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt 39: [2022-11-25 17:45:25,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 39: [2022-11-25 17:45:25,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt 39: [2022-11-25 17:45:25,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt 39: [2022-11-25 17:45:25,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 39: [2022-11-25 17:45:25,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 39: [2022-11-25 17:45:25,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 18: [2022-11-25 17:45:25,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 18: [2022-11-25 17:45:25,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 18: [2022-11-25 17:45:25,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 18: [2022-11-25 17:45:25,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 18: [2022-11-25 17:45:25,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 18: [2022-11-25 17:45:25,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 18: [2022-11-25 17:45:25,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 18: [2022-11-25 17:45:25,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 18: [2022-11-25 17:45:25,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 18: [2022-11-25 17:45:25,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 18: [2022-11-25 17:45:25,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 18: [2022-11-25 17:45:25,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 25: [2022-11-25 17:45:25,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 25: [2022-11-25 17:45:25,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 25: [2022-11-25 17:45:25,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 50: [2022-11-25 17:45:25,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt. 50: [2022-11-25 17:45:25,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt 50: [2022-11-25 17:45:25,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 25: [2022-11-25 17:45:25,482] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 25: [2022-11-25 17:45:25,482] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 25: [2022-11-25 17:45:25,482] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 6: [2022-11-25 17:45:25,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 6: [2022-11-25 17:45:25,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 6: [2022-11-25 17:45:25,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 6: [2022-11-25 17:45:25,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 6: [2022-11-25 17:45:25,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 6: [2022-11-25 17:45:25,486] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 11: [2022-11-25 17:45:25,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 11: [2022-11-25 17:45:25,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 11: [2022-11-25 17:45:25,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 11: [2022-11-25 17:45:25,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 11: [2022-11-25 17:45:25,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 11: [2022-11-25 17:45:25,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 11: [2022-11-25 17:45:25,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 11: [2022-11-25 17:45:25,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 11: [2022-11-25 17:45:25,488] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 62: [2022-11-25 17:45:25,488] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt. 62: [2022-11-25 17:45:25,488] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt 62: [2022-11-25 17:45:25,489] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 24: [2022-11-25 17:45:25,489] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 24: [2022-11-25 17:45:25,489] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 24: [2022-11-25 17:45:25,490] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: [2022-11-25 17:45:25,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-25 17:45:25,496] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-25 17:45:25,496] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 16: [2022-11-25 17:45:25,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 16: [2022-11-25 17:45:25,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 16: [2022-11-25 17:45:25,501] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 14: [2022-11-25 17:45:25,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 14: [2022-11-25 17:45:25,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 14: [2022-11-25 17:45:25,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 14: [2022-11-25 17:45:25,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 14: [2022-11-25 17:45:25,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 14: [2022-11-25 17:45:25,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 14: [2022-11-25 17:45:25,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 14: [2022-11-25 17:45:25,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 14: [2022-11-25 17:45:25,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 14: [2022-11-25 17:45:25,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 14: [2022-11-25 17:45:25,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 14: [2022-11-25 17:45:25,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 23: [2022-11-25 17:45:25,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 23: [2022-11-25 17:45:25,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 23: [2022-11-25 17:45:25,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 40: [2022-11-25 17:45:25,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt. 40: [2022-11-25 17:45:25,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt. 40: [2022-11-25 17:45:25,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt. 40: [2022-11-25 17:45:25,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt 40: [2022-11-25 17:45:25,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt 40: [2022-11-25 17:45:25,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt 40: [2022-11-25 17:45:25,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 40: [2022-11-25 17:45:25,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 40: [2022-11-25 17:45:25,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 40: [2022-11-25 17:45:25,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt. 40: [2022-11-25 17:45:25,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt 40: [2022-11-25 17:45:25,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 55: [2022-11-25 17:45:25,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt. 55: [2022-11-25 17:45:25,505] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt 55: [2022-11-25 17:45:25,505] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 4: [2022-11-25 17:45:25,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 4: [2022-11-25 17:45:25,505] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 4: [2022-11-25 17:45:25,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 4: [2022-11-25 17:45:25,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 4: [2022-11-25 17:45:25,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 4: [2022-11-25 17:45:25,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 20: [2022-11-25 17:45:25,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 20: [2022-11-25 17:45:25,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 20: [2022-11-25 17:45:25,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 30: [2022-11-25 17:45:25,507] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 30: [2022-11-25 17:45:25,507] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 30: [2022-11-25 17:45:25,507] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 2: [2022-11-25 17:45:25,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 2: [2022-11-25 17:45:25,513] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 2: [2022-11-25 17:45:25,513] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 60: [2022-11-25 17:45:25,514] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt. 60: [2022-11-25 17:45:25,514] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt 60: [2022-11-25 17:45:25,514] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 4: [2022-11-25 17:45:25,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 4: [2022-11-25 17:45:25,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 4: [2022-11-25 17:45:25,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 4: [2022-11-25 17:45:25,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 4: [2022-11-25 17:45:25,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 4: [2022-11-25 17:45:25,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 17:45:25,519] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt. 39: [2022-11-25 17:45:25,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt. 39: [2022-11-25 17:45:25,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt 8: [2022-11-25 17:45:25,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 39: [2022-11-25 17:45:25,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 8: [2022-11-25 17:45:25,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 8: [2022-11-25 17:45:25,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 9: [2022-11-25 17:45:25,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 9: [2022-11-25 17:45:25,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 9: [2022-11-25 17:45:25,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 9: [2022-11-25 17:45:25,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 9: [2022-11-25 17:45:25,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 9: [2022-11-25 17:45:25,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 9: [2022-11-25 17:45:25,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 9: [2022-11-25 17:45:25,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 9: [2022-11-25 17:45:25,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 9: [2022-11-25 17:45:25,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 9: [2022-11-25 17:45:25,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 9: [2022-11-25 17:45:25,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 9: [2022-11-25 17:45:25,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 9: [2022-11-25 17:45:25,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 9: [2022-11-25 17:45:25,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 57: [2022-11-25 17:45:25,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt. 57: [2022-11-25 17:45:25,527] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt 57: [2022-11-25 17:45:25,527] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 3: [2022-11-25 17:45:25,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 3: [2022-11-25 17:45:25,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 3: [2022-11-25 17:45:25,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 3: [2022-11-25 17:45:25,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 3: [2022-11-25 17:45:25,530] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 3: [2022-11-25 17:45:25,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 3: [2022-11-25 17:45:25,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 3: [2022-11-25 17:45:25,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 3: [2022-11-25 17:45:25,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 3: [2022-11-25 17:45:25,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 3: [2022-11-25 17:45:25,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 3: [2022-11-25 17:45:25,530] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 3: [2022-11-25 17:45:25,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 3: [2022-11-25 17:45:25,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 3: [2022-11-25 17:45:25,530] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 53: [2022-11-25 17:45:25,533] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt. 53: [2022-11-25 17:45:25,533] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt 53: [2022-11-25 17:45:25,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 17:45:25,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt 56: [2022-11-25 17:45:25,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 5: [2022-11-25 17:45:25,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 5: [2022-11-25 17:45:25,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 5: [2022-11-25 17:45:25,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 5: [2022-11-25 17:45:25,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 5: [2022-11-25 17:45:25,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 5: [2022-11-25 17:45:25,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 5: [2022-11-25 17:45:25,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 5: [2022-11-25 17:45:25,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 5: [2022-11-25 17:45:25,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 5: [2022-11-25 17:45:25,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 5: [2022-11-25 17:45:25,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 5: [2022-11-25 17:45:25,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 5: [2022-11-25 17:45:25,536] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 5: [2022-11-25 17:45:25,536] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 5: [2022-11-25 17:45:25,536] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 1: [2022-11-25 17:45:25,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 1: [2022-11-25 17:45:25,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 1: [2022-11-25 17:45:25,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 46: [2022-11-25 17:45:25,546] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt. 46: [2022-11-25 17:45:25,546] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt 46: [2022-11-25 17:45:25,547] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 26: [2022-11-25 17:45:25,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 26: [2022-11-25 17:45:25,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 26: [2022-11-25 17:45:25,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 18: [2022-11-25 17:45:25,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 18: [2022-11-25 17:45:25,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 18: [2022-11-25 17:45:25,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 37: [2022-11-25 17:45:25,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt. 37: [2022-11-25 17:45:25,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt 37: [2022-11-25 17:45:25,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 17:45:25,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt. 42: [2022-11-25 17:45:25,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt 42: [2022-11-25 17:45:25,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 17:45:25,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt. 63: [2022-11-25 17:45:25,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt 63: [2022-11-25 17:45:25,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 43: [2022-11-25 17:45:25,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt. 43: [2022-11-25 17:45:25,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt 43: [2022-11-25 17:45:25,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 34: [2022-11-25 17:45:25,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt. 34: [2022-11-25 17:45:25,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt 34: [2022-11-25 17:45:25,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 52: [2022-11-25 17:45:25,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt. 52: [2022-11-25 17:45:25,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt 52: [2022-11-25 17:45:25,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 58: [2022-11-25 17:45:25,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt. 58: [2022-11-25 17:45:25,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt 58: [2022-11-25 17:45:25,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 27: [2022-11-25 17:45:25,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 27: [2022-11-25 17:45:25,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 27: [2022-11-25 17:45:25,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 40: [2022-11-25 17:45:25,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt. 40: [2022-11-25 17:45:25,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt 40: [2022-11-25 17:45:25,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 24: [2022-11-25 17:45:25,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 24: [2022-11-25 17:45:25,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 24: [2022-11-25 17:45:25,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 33: [2022-11-25 17:45:25,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt. 33: [2022-11-25 17:45:25,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt 33: [2022-11-25 17:45:25,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 12: [2022-11-25 17:45:25,631] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 12: [2022-11-25 17:45:25,631] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 12: [2022-11-25 17:45:25,631] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 10: [2022-11-25 17:45:25,635] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 10: [2022-11-25 17:45:25,635] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 10: [2022-11-25 17:45:25,635] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 17: [2022-11-25 17:45:25,637] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 17: [2022-11-25 17:45:25,637] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 17: [2022-11-25 17:45:25,637] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 17:45:25,652] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 21: [2022-11-25 17:45:25,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 21: [2022-11-25 17:45:25,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 4: [2022-11-25 17:45:25,656] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 4: [2022-11-25 17:45:25,656] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 4: [2022-11-25 17:45:25,656] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 14: [2022-11-25 17:45:25,669] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 14: [2022-11-25 17:45:25,669] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 14: [2022-11-25 17:45:25,669] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 57: [2022-11-25 17:45:25,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt. 57: [2022-11-25 17:45:25,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt 57: [2022-11-25 17:45:25,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 48: [2022-11-25 17:45:25,703] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt. 48: [2022-11-25 17:45:25,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt 48: [2022-11-25 17:45:25,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 47: [2022-11-25 17:45:25,705] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt. 47: [2022-11-25 17:45:25,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt 49: [2022-11-25 17:45:25,706] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt. 47: [2022-11-25 17:45:25,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 49: [2022-11-25 17:45:25,706] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt 49: [2022-11-25 17:45:25,706] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 50: [2022-11-25 17:45:25,712] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt. 50: [2022-11-25 17:45:25,712] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt 50: [2022-11-25 17:45:25,712] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 11: [2022-11-25 17:45:25,716] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 11: [2022-11-25 17:45:25,716] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 11: [2022-11-25 17:45:25,716] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 32: [2022-11-25 17:45:25,719] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt. 32: [2022-11-25 17:45:25,719] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt 32: [2022-11-25 17:45:25,719] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 45: [2022-11-25 17:45:25,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt. 45: [2022-11-25 17:45:25,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt 45: [2022-11-25 17:45:25,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 38: [2022-11-25 17:45:25,724] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt. 38: [2022-11-25 17:45:25,724] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt 38: [2022-11-25 17:45:25,724] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 61: [2022-11-25 17:45:25,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt. 61: [2022-11-25 17:45:25,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt 61: [2022-11-25 17:45:25,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 51: [2022-11-25 17:45:25,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt. 51: [2022-11-25 17:45:25,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt 51: [2022-11-25 17:45:25,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 36: [2022-11-25 17:45:25,740] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt. 36: [2022-11-25 17:45:25,740] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt 36: [2022-11-25 17:45:25,740] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 31: [2022-11-25 17:45:25,746] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 31: [2022-11-25 17:45:25,746] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 31: [2022-11-25 17:45:25,746] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 35: [2022-11-25 17:45:25,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt. 35: [2022-11-25 17:45:25,748] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt 35: [2022-11-25 17:45:25,748] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 28: [2022-11-25 17:45:25,749] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 28: [2022-11-25 17:45:25,749] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 28: [2022-11-25 17:45:25,749] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 17:45:25,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 13: [2022-11-25 17:45:25,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 13: [2022-11-25 17:45:25,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 25: [2022-11-25 17:45:25,750] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 25: [2022-11-25 17:45:25,750] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 25: [2022-11-25 17:45:25,750] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 44: [2022-11-25 17:45:25,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt. 44: [2022-11-25 17:45:25,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt 41: [2022-11-25 17:45:25,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt. 44: [2022-11-25 17:45:25,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 41: [2022-11-25 17:45:25,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt 41: [2022-11-25 17:45:25,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 19: [2022-11-25 17:45:25,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 19: [2022-11-25 17:45:25,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 19: [2022-11-25 17:45:25,751] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 22: [2022-11-25 17:45:25,751] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 22: [2022-11-25 17:45:25,751] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 22: [2022-11-25 17:45:25,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 54: [2022-11-25 17:45:25,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt. 54: [2022-11-25 17:45:25,755] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt 54: [2022-11-25 17:45:25,755] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 59: [2022-11-25 17:45:25,756] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt. 59: [2022-11-25 17:45:25,756] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt 59: [2022-11-25 17:45:25,756] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 29: [2022-11-25 17:45:25,757] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 29: [2022-11-25 17:45:25,757] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 29: [2022-11-25 17:45:25,757] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 6: [2022-11-25 17:45:25,758] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 6: [2022-11-25 17:45:25,758] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 6: [2022-11-25 17:45:25,758] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 2: [2022-11-25 17:45:25,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 2: [2022-11-25 17:45:25,762] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 2: [2022-11-25 17:45:25,762] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 23: [2022-11-25 17:45:25,763] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 23: [2022-11-25 17:45:25,763] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 23: [2022-11-25 17:45:25,763] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 5: [2022-11-25 17:45:25,765] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 5: [2022-11-25 17:45:25,765] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 5: [2022-11-25 17:45:25,765] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 15: [2022-11-25 17:45:25,770] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 15: [2022-11-25 17:45:25,770] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 15: [2022-11-25 17:45:25,770] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 3: [2022-11-25 17:45:25,773] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 3: [2022-11-25 17:45:25,773] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 3: [2022-11-25 17:45:25,773] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 7: [2022-11-25 17:45:25,776] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 7: [2022-11-25 17:45:25,776] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 7: [2022-11-25 17:45:25,776] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 62: [2022-11-25 17:45:25,778] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt. 62: [2022-11-25 17:45:25,779] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt 62: [2022-11-25 17:45:25,779] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: [2022-11-25 17:45:25,782] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 0: [2022-11-25 17:45:25,782] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 0: [2022-11-25 17:45:25,782] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 30: [2022-11-25 17:45:25,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 30: [2022-11-25 17:45:25,796] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 30: [2022-11-25 17:45:25,796] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 26: [2022-11-25 17:45:25,796] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 26: [2022-11-25 17:45:25,797] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 26: [2022-11-25 17:45:25,797] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 60: [2022-11-25 17:45:25,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt. 60: [2022-11-25 17:45:25,799] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt 60: [2022-11-25 17:45:25,799] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 39: [2022-11-25 17:45:25,801] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt. 39: [2022-11-25 17:45:25,802] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt 39: [2022-11-25 17:45:25,802] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 18: [2022-11-25 17:45:25,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 18: [2022-11-25 17:45:25,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 18: [2022-11-25 17:45:25,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 20: [2022-11-25 17:45:25,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 20: [2022-11-25 17:45:25,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 20: [2022-11-25 17:45:25,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 9: [2022-11-25 17:45:25,804] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 9: [2022-11-25 17:45:25,804] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 9: [2022-11-25 17:45:25,804] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 53: [2022-11-25 17:45:25,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt. 53: [2022-11-25 17:45:25,807] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt 53: [2022-11-25 17:45:25,807] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 8: [2022-11-25 17:45:25,807] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 8: [2022-11-25 17:45:25,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 16: [2022-11-25 17:45:25,808] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 8: [2022-11-25 17:45:25,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 16: [2022-11-25 17:45:25,808] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 16: [2022-11-25 17:45:25,808] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 1: [2022-11-25 17:45:25,810] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 1: [2022-11-25 17:45:25,810] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 1: [2022-11-25 17:45:25,810] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 55: [2022-11-25 17:45:25,811] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt. 55: [2022-11-25 17:45:25,811] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt 55: [2022-11-25 17:45:25,811] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 17:45:25,812] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt. 42: [2022-11-25 17:45:25,812] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt 42: [2022-11-25 17:45:25,812] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 37: [2022-11-25 17:45:25,816] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt. 37: [2022-11-25 17:45:25,816] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt 37: [2022-11-25 17:45:25,816] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 17:45:25,818] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt. 40: [2022-11-25 17:45:25,822] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt. 40: [2022-11-25 17:45:25,822] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt 40: [2022-11-25 17:45:25,822] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 4: [2022-11-25 17:45:25,825] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 4: [2022-11-25 17:45:25,825] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 4: [2022-11-25 17:45:25,825] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 43: [2022-11-25 17:45:25,827] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt. 43: [2022-11-25 17:45:25,827] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt 43: [2022-11-25 17:45:25,827] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 58: [2022-11-25 17:45:25,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt. 57: [2022-11-25 17:45:25,828] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt. 57: [2022-11-25 17:45:25,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt 58: [2022-11-25 17:45:25,828] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt 57: [2022-11-25 17:45:25,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 58: [2022-11-25 17:45:25,828] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 61: [2022-11-25 17:45:25,831] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt. 61: [2022-11-25 17:45:25,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt 61: [2022-11-25 17:45:25,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 24: [2022-11-25 17:45:25,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 47: [2022-11-25 17:45:25,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt. 24: [2022-11-25 17:45:25,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 47: [2022-11-25 17:45:25,832] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt 24: [2022-11-25 17:45:25,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 47: [2022-11-25 17:45:25,832] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 17:45:25,818] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt 56: [2022-11-25 17:45:25,818] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 14: [2022-11-25 17:45:25,832] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 14: [2022-11-25 17:45:25,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 14: [2022-11-25 17:45:25,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 46: [2022-11-25 17:45:25,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt. 46: [2022-11-25 17:45:25,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt 50: [2022-11-25 17:45:25,833] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt. 50: [2022-11-25 17:45:25,833] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt 46: [2022-11-25 17:45:25,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 50: [2022-11-25 17:45:25,833] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 33: [2022-11-25 17:45:25,834] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt. 33: [2022-11-25 17:45:25,834] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt 33: [2022-11-25 17:45:25,834] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 10: [2022-11-25 17:45:25,835] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 10: [2022-11-25 17:45:25,835] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 10: [2022-11-25 17:45:25,835] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 52: [2022-11-25 17:45:25,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt. 52: [2022-11-25 17:45:25,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt 52: [2022-11-25 17:45:25,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 17: [2022-11-25 17:45:25,839] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 17: [2022-11-25 17:45:25,839] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 17: [2022-11-25 17:45:25,839] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 27: [2022-11-25 17:45:25,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 27: [2022-11-25 17:45:25,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 27: [2022-11-25 17:45:25,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 12: [2022-11-25 17:45:25,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 12: [2022-11-25 17:45:25,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 12: [2022-11-25 17:45:25,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 48: [2022-11-25 17:45:25,841] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt. 48: [2022-11-25 17:45:25,841] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt 48: [2022-11-25 17:45:25,841] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 17:45:25,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 13: [2022-11-25 17:45:25,842] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 13: [2022-11-25 17:45:25,842] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 28: [2022-11-25 17:45:25,842] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 28: [2022-11-25 17:45:25,843] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 28: [2022-11-25 17:45:25,843] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 49: [2022-11-25 17:45:25,843] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt. 49: [2022-11-25 17:45:25,844] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt 49: [2022-11-25 17:45:25,844] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 32: [2022-11-25 17:45:25,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt. 32: [2022-11-25 17:45:25,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt 32: [2022-11-25 17:45:25,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 2: [2022-11-25 17:45:25,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 51: [2022-11-25 17:45:25,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt. 51: [2022-11-25 17:45:25,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt 2: [2022-11-25 17:45:25,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 51: [2022-11-25 17:45:25,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 2: [2022-11-25 17:45:25,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 17:45:25,845] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 15: [2022-11-25 17:45:25,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 15: [2022-11-25 17:45:25,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 15: [2022-11-25 17:45:25,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: [2022-11-25 17:45:25,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 0: [2022-11-25 17:45:25,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 0: [2022-11-25 17:45:25,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 35: [2022-11-25 17:45:25,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt. 35: [2022-11-25 17:45:25,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt 35: [2022-11-25 17:45:25,846] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 31: [2022-11-25 17:45:25,846] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 31: [2022-11-25 17:45:25,846] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 31: [2022-11-25 17:45:25,847] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 19: [2022-11-25 17:45:25,847] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 19: [2022-11-25 17:45:25,848] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 19: [2022-11-25 17:45:25,848] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 25: [2022-11-25 17:45:25,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 25: [2022-11-25 17:45:25,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 25: [2022-11-25 17:45:25,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 38: [2022-11-25 17:45:25,849] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt. 38: [2022-11-25 17:45:25,849] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt 38: [2022-11-25 17:45:25,849] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 9: [2022-11-25 17:45:25,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 44: [2022-11-25 17:45:25,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt. 9: [2022-11-25 17:45:25,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 44: [2022-11-25 17:45:25,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt 9: [2022-11-25 17:45:25,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 44: [2022-11-25 17:45:25,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 22: [2022-11-25 17:45:25,850] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 22: [2022-11-25 17:45:25,850] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 22: [2022-11-25 17:45:25,850] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 45: [2022-11-25 17:45:25,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt. 23: [2022-11-25 17:45:25,852] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 45: [2022-11-25 17:45:25,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt 23: [2022-11-25 17:45:25,852] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 45: [2022-11-25 17:45:25,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 23: [2022-11-25 17:45:25,852] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 20: [2022-11-25 17:45:25,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 20: [2022-11-25 17:45:25,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 20: [2022-11-25 17:45:25,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 6: [2022-11-25 17:45:25,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 6: [2022-11-25 17:45:25,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 6: [2022-11-25 17:45:25,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 8: [2022-11-25 17:45:25,854] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 8: [2022-11-25 17:45:25,854] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 8: [2022-11-25 17:45:25,854] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 17:45:25,855] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt. 63: [2022-11-25 17:45:25,855] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt 63: [2022-11-25 17:45:25,855] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 17:45:25,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt. 3: [2022-11-25 17:45:25,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 3: [2022-11-25 17:45:25,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 3: [2022-11-25 17:45:25,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 26: [2022-11-25 17:45:25,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 60: [2022-11-25 17:45:25,857] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt. 60: [2022-11-25 17:45:25,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt 26: [2022-11-25 17:45:25,857] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 60: [2022-11-25 17:45:25,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 26: [2022-11-25 17:45:25,857] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 39: [2022-11-25 17:45:25,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt. 39: [2022-11-25 17:45:25,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt 39: [2022-11-25 17:45:25,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 17:45:25,845] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 21: [2022-11-25 17:45:25,845] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 30: [2022-11-25 17:45:25,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 30: [2022-11-25 17:45:25,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 30: [2022-11-25 17:45:25,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 54: [2022-11-25 17:45:25,858] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt. 54: [2022-11-25 17:45:25,858] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt 54: [2022-11-25 17:45:25,858] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 17:45:25,856] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt 56: [2022-11-25 17:45:25,856] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 62: [2022-11-25 17:45:25,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt. 62: [2022-11-25 17:45:25,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt 62: [2022-11-25 17:45:25,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 41: [2022-11-25 17:45:25,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt. 53: [2022-11-25 17:45:25,860] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt. 41: [2022-11-25 17:45:25,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt 53: [2022-11-25 17:45:25,860] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt 41: [2022-11-25 17:45:25,860] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 53: [2022-11-25 17:45:25,861] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 46: [2022-11-25 17:45:25,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt. 46: [2022-11-25 17:45:25,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt 46: [2022-11-25 17:45:25,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 16: [2022-11-25 17:45:25,862] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 16: [2022-11-25 17:45:25,862] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 16: [2022-11-25 17:45:25,862] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 43: [2022-11-25 17:45:25,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt. 43: [2022-11-25 17:45:25,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt 43: [2022-11-25 17:45:25,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 34: [2022-11-25 17:45:25,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt. 34: [2022-11-25 17:45:25,864] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt 34: [2022-11-25 17:45:25,864] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 17:45:25,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt. 42: [2022-11-25 17:45:25,865] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt 42: [2022-11-25 17:45:25,865] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 11: [2022-11-25 17:45:25,865] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 11: [2022-11-25 17:45:25,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 11: [2022-11-25 17:45:25,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 17:45:25,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt. 63: [2022-11-25 17:45:25,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt 63: [2022-11-25 17:45:25,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 1: [2022-11-25 17:45:25,866] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 1: [2022-11-25 17:45:25,866] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 1: [2022-11-25 17:45:25,866] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 5: [2022-11-25 17:45:25,867] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 5: [2022-11-25 17:45:25,867] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 5: [2022-11-25 17:45:25,867] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 55: [2022-11-25 17:45:25,868] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt. 55: [2022-11-25 17:45:25,868] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt 55: [2022-11-25 17:45:25,868] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 18: [2022-11-25 17:45:25,869] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 18: [2022-11-25 17:45:25,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 18: [2022-11-25 17:45:25,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 37: [2022-11-25 17:45:25,870] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt. 37: [2022-11-25 17:45:25,870] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt 37: [2022-11-25 17:45:25,870] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 36: [2022-11-25 17:45:25,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt. 36: [2022-11-25 17:45:25,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt 36: [2022-11-25 17:45:25,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 40: [2022-11-25 17:45:25,873] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt. 40: [2022-11-25 17:45:25,873] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt 40: [2022-11-25 17:45:25,873] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 34: [2022-11-25 17:45:25,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt. 34: [2022-11-25 17:45:25,874] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt 34: [2022-11-25 17:45:25,874] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 4: [2022-11-25 17:45:25,876] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 4: [2022-11-25 17:45:25,876] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 4: [2022-11-25 17:45:25,876] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 47: [2022-11-25 17:45:25,878] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt. 47: [2022-11-25 17:45:25,878] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt 47: [2022-11-25 17:45:25,878] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 50: [2022-11-25 17:45:25,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt. 50: [2022-11-25 17:45:25,879] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt 50: [2022-11-25 17:45:25,879] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 24: [2022-11-25 17:45:25,879] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 24: [2022-11-25 17:45:25,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 24: [2022-11-25 17:45:25,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 32: [2022-11-25 17:45:25,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt. 32: [2022-11-25 17:45:25,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt 32: [2022-11-25 17:45:25,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 29: [2022-11-25 17:45:25,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 29: [2022-11-25 17:45:25,880] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 29: [2022-11-25 17:45:25,880] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 49: [2022-11-25 17:45:25,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt. 49: [2022-11-25 17:45:25,885] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt 49: [2022-11-25 17:45:25,885] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 57: [2022-11-25 17:45:25,885] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt. 57: [2022-11-25 17:45:25,886] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt 57: [2022-11-25 17:45:25,886] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 27: [2022-11-25 17:45:25,887] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 27: [2022-11-25 17:45:25,887] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 27: [2022-11-25 17:45:25,887] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 52: [2022-11-25 17:45:25,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt. 52: [2022-11-25 17:45:25,888] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt 52: [2022-11-25 17:45:25,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 17: [2022-11-25 17:45:25,889] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 17: [2022-11-25 17:45:25,889] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 17: [2022-11-25 17:45:25,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 31: [2022-11-25 17:45:25,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 31: [2022-11-25 17:45:25,890] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 45: [2022-11-25 17:45:25,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt. 31: [2022-11-25 17:45:25,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 45: [2022-11-25 17:45:25,890] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt 45: [2022-11-25 17:45:25,890] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 33: [2022-11-25 17:45:25,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt. 33: [2022-11-25 17:45:25,891] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt 33: [2022-11-25 17:45:25,891] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 28: [2022-11-25 17:45:25,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 28: [2022-11-25 17:45:25,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 7: [2022-11-25 17:45:25,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 28: [2022-11-25 17:45:25,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 7: [2022-11-25 17:45:25,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 51: [2022-11-25 17:45:25,893] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt. 7: [2022-11-25 17:45:25,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 51: [2022-11-25 17:45:25,893] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt 51: [2022-11-25 17:45:25,893] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 14: [2022-11-25 17:45:25,894] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 14: [2022-11-25 17:45:25,894] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 14: [2022-11-25 17:45:25,894] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 44: [2022-11-25 17:45:25,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt. 44: [2022-11-25 17:45:25,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt 44: [2022-11-25 17:45:25,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 61: [2022-11-25 17:45:25,896] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt. 61: [2022-11-25 17:45:25,896] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt 61: [2022-11-25 17:45:25,896] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 48: [2022-11-25 17:45:25,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt. 48: [2022-11-25 17:45:25,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt 48: [2022-11-25 17:45:25,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 58: [2022-11-25 17:45:25,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt. 58: [2022-11-25 17:45:25,897] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt 58: [2022-11-25 17:45:25,897] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 17:45:25,898] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 22: [2022-11-25 17:45:25,899] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 22: [2022-11-25 17:45:25,899] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 21: [2022-11-25 17:45:25,898] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 22: [2022-11-25 17:45:25,899] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 17:45:25,898] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 29: [2022-11-25 17:45:25,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 29: [2022-11-25 17:45:25,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 29: [2022-11-25 17:45:25,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: [2022-11-25 17:45:25,900] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 0: [2022-11-25 17:45:25,900] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 0: [2022-11-25 17:45:25,900] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 2: [2022-11-25 17:45:25,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 2: [2022-11-25 17:45:25,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 2: [2022-11-25 17:45:25,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 6: [2022-11-25 17:45:25,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 6: [2022-11-25 17:45:25,901] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 6: [2022-11-25 17:45:25,901] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 59: [2022-11-25 17:45:25,902] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt. 59: [2022-11-25 17:45:25,902] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt 59: [2022-11-25 17:45:25,902] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 15: [2022-11-25 17:45:25,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 15: [2022-11-25 17:45:25,903] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 15: [2022-11-25 17:45:25,903] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 20: [2022-11-25 17:45:25,904] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 20: [2022-11-25 17:45:25,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 20: [2022-11-25 17:45:25,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 11: [2022-11-25 17:45:25,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 41: [2022-11-25 17:45:25,905] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt. 11: [2022-11-25 17:45:25,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 11: [2022-11-25 17:45:25,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 41: [2022-11-25 17:45:25,905] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt 41: [2022-11-25 17:45:25,905] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 56: [2022-11-25 17:45:25,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt. 56: [2022-11-25 17:45:25,906] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt 56: [2022-11-25 17:45:25,906] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 10: [2022-11-25 17:45:25,906] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 10: [2022-11-25 17:45:25,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 10: [2022-11-25 17:45:25,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 35: [2022-11-25 17:45:25,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt. 35: [2022-11-25 17:45:25,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt 35: [2022-11-25 17:45:25,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 59: [2022-11-25 17:45:25,907] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt. 59: [2022-11-25 17:45:25,907] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt 59: [2022-11-25 17:45:25,907] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 3: [2022-11-25 17:45:25,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 25: [2022-11-25 17:45:25,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 3: [2022-11-25 17:45:25,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 3: [2022-11-25 17:45:25,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 25: [2022-11-25 17:45:25,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 25: [2022-11-25 17:45:25,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 5: [2022-11-25 17:45:25,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 5: [2022-11-25 17:45:25,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 62: [2022-11-25 17:45:25,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt. 5: [2022-11-25 17:45:25,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 62: [2022-11-25 17:45:25,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt 62: [2022-11-25 17:45:25,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 19: [2022-11-25 17:45:25,908] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 19: [2022-11-25 17:45:25,908] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 19: [2022-11-25 17:45:25,908] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 55: [2022-11-25 17:45:25,909] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt. 55: [2022-11-25 17:45:25,909] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt 55: [2022-11-25 17:45:25,909] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 53: [2022-11-25 17:45:25,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt. 53: [2022-11-25 17:45:25,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt 53: [2022-11-25 17:45:25,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 23: [2022-11-25 17:45:25,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 23: [2022-11-25 17:45:25,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 23: [2022-11-25 17:45:25,911] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 60: [2022-11-25 17:45:25,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt. 60: [2022-11-25 17:45:25,911] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt 60: [2022-11-25 17:45:25,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 42: [2022-11-25 17:45:25,911] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt. 42: [2022-11-25 17:45:25,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt 42: [2022-11-25 17:45:25,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 13: [2022-11-25 17:45:25,912] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 13: [2022-11-25 17:45:25,912] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 13: [2022-11-25 17:45:25,912] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 43: [2022-11-25 17:45:25,913] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt. 43: [2022-11-25 17:45:25,913] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt 43: [2022-11-25 17:45:25,913] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 1: [2022-11-25 17:45:25,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 1: [2022-11-25 17:45:25,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 1: [2022-11-25 17:45:25,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 16: [2022-11-25 17:45:25,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 16: [2022-11-25 17:45:25,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 30: [2022-11-25 17:45:25,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 16: [2022-11-25 17:45:25,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 30: [2022-11-25 17:45:25,914] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 30: [2022-11-25 17:45:25,914] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 12: [2022-11-25 17:45:25,915] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 12: [2022-11-25 17:45:25,915] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 12: [2022-11-25 17:45:25,915] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 9: [2022-11-25 17:45:25,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 9: [2022-11-25 17:45:25,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 9: [2022-11-25 17:45:25,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 18: [2022-11-25 17:45:25,916] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 18: [2022-11-25 17:45:25,916] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 18: [2022-11-25 17:45:25,916] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 7: [2022-11-25 17:45:25,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 7: [2022-11-25 17:45:25,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 7: [2022-11-25 17:45:25,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 26: [2022-11-25 17:45:25,917] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 26: [2022-11-25 17:45:25,917] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 26: [2022-11-25 17:45:25,917] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 17:45:25,918] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt. 63: [2022-11-25 17:45:25,919] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt 63: [2022-11-25 17:45:25,919] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 40: [2022-11-25 17:45:25,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt. 57: [2022-11-25 17:45:25,921] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt. 40: [2022-11-25 17:45:25,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt 40: [2022-11-25 17:45:25,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 57: [2022-11-25 17:45:25,921] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt 57: [2022-11-25 17:45:25,921] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 39: [2022-11-25 17:45:25,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt. 39: [2022-11-25 17:45:25,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt 39: [2022-11-25 17:45:25,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 54: [2022-11-25 17:45:25,922] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt. 54: [2022-11-25 17:45:25,922] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt 54: [2022-11-25 17:45:25,922] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 8: [2022-11-25 17:45:25,925] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 8: [2022-11-25 17:45:25,925] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 8: [2022-11-25 17:45:25,925] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 24: [2022-11-25 17:45:25,927] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 24: [2022-11-25 17:45:25,927] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 24: [2022-11-25 17:45:25,927] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 4: [2022-11-25 17:45:25,928] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 4: [2022-11-25 17:45:25,928] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 4: [2022-11-25 17:45:25,928] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 38: [2022-11-25 17:45:25,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt. 38: [2022-11-25 17:45:25,929] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt 38: [2022-11-25 17:45:25,929] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 52: [2022-11-25 17:45:25,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt. 52: [2022-11-25 17:45:25,931] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt 52: [2022-11-25 17:45:25,931] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 17: [2022-11-25 17:45:25,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 17: [2022-11-25 17:45:25,931] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 17: [2022-11-25 17:45:25,931] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 47: [2022-11-25 17:45:25,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt. 47: [2022-11-25 17:45:25,931] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt 50: [2022-11-25 17:45:25,931] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt. 47: [2022-11-25 17:45:25,931] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 50: [2022-11-25 17:45:25,931] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt 50: [2022-11-25 17:45:25,931] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 32: [2022-11-25 17:45:25,933] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt. 32: [2022-11-25 17:45:25,933] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt 32: [2022-11-25 17:45:25,933] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 49: [2022-11-25 17:45:25,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt. 14: [2022-11-25 17:45:25,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 49: [2022-11-25 17:45:25,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt 14: [2022-11-25 17:45:25,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 49: [2022-11-25 17:45:25,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 14: [2022-11-25 17:45:25,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 36: [2022-11-25 17:45:25,934] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt. 36: [2022-11-25 17:45:25,934] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt 36: [2022-11-25 17:45:25,934] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 17:45:25,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 33: [2022-11-25 17:45:25,936] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt. 33: [2022-11-25 17:45:25,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt 33: [2022-11-25 17:45:25,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 27: [2022-11-25 17:45:25,938] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 27: [2022-11-25 17:45:25,938] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 27: [2022-11-25 17:45:25,938] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 17:45:25,936] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 21: [2022-11-25 17:45:25,936] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 34: [2022-11-25 17:45:25,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt. 34: [2022-11-25 17:45:25,940] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt 34: [2022-11-25 17:45:25,940] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 45: [2022-11-25 17:45:25,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt. 61: [2022-11-25 17:45:25,940] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt. 45: [2022-11-25 17:45:25,940] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt 61: [2022-11-25 17:45:25,940] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt 45: [2022-11-25 17:45:25,940] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 61: [2022-11-25 17:45:25,940] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 58: [2022-11-25 17:45:25,941] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt. 58: [2022-11-25 17:45:25,941] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt 58: [2022-11-25 17:45:25,941] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 28: [2022-11-25 17:45:25,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 28: [2022-11-25 17:45:25,943] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 28: [2022-11-25 17:45:25,943] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 51: [2022-11-25 17:45:25,943] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt. 51: [2022-11-25 17:45:25,944] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt 51: [2022-11-25 17:45:25,944] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 37: [2022-11-25 17:45:25,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt. 37: [2022-11-25 17:45:25,945] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt 37: [2022-11-25 17:45:25,945] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 10: [2022-11-25 17:45:25,946] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 10: [2022-11-25 17:45:25,946] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 10: [2022-11-25 17:45:25,946] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 12: [2022-11-25 17:45:25,947] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 12: [2022-11-25 17:45:25,947] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 12: [2022-11-25 17:45:25,947] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 35: [2022-11-25 17:45:25,951] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt. 35: [2022-11-25 17:45:25,951] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt 35: [2022-11-25 17:45:25,951] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 29: [2022-11-25 17:45:25,952] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 29: [2022-11-25 17:45:25,952] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 29: [2022-11-25 17:45:25,952] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 44: [2022-11-25 17:45:25,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt. 44: [2022-11-25 17:45:25,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt 44: [2022-11-25 17:45:25,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 48: [2022-11-25 17:45:25,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt. 48: [2022-11-25 17:45:25,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt 48: [2022-11-25 17:45:25,953] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 22: [2022-11-25 17:45:25,953] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 31: [2022-11-25 17:45:25,954] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 22: [2022-11-25 17:45:25,953] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 22: [2022-11-25 17:45:25,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 31: [2022-11-25 17:45:25,954] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 31: [2022-11-25 17:45:25,954] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 38: [2022-11-25 17:45:25,958] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt. 38: [2022-11-25 17:45:25,958] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt 38: [2022-11-25 17:45:25,958] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 11: [2022-11-25 17:45:25,960] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 11: [2022-11-25 17:45:25,960] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 11: [2022-11-25 17:45:25,960] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 34: [2022-11-25 17:45:25,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt. 34: [2022-11-25 17:45:25,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt 34: [2022-11-25 17:45:25,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 46: [2022-11-25 17:45:25,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt. 46: [2022-11-25 17:45:25,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt 46: [2022-11-25 17:45:25,962] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 41: [2022-11-25 17:45:25,962] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt. 41: [2022-11-25 17:45:25,962] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt 41: [2022-11-25 17:45:25,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 63: [2022-11-25 17:45:25,963] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt. 63: [2022-11-25 17:45:25,963] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt 63: [2022-11-25 17:45:25,963] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 6: [2022-11-25 17:45:25,965] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 6: [2022-11-25 17:45:25,965] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 6: [2022-11-25 17:45:25,965] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 37: [2022-11-25 17:45:25,967] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt. 37: [2022-11-25 17:45:25,967] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt 37: [2022-11-25 17:45:25,967] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 10: [2022-11-25 17:45:25,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 10: [2022-11-25 17:45:25,968] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 10: [2022-11-25 17:45:25,968] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 17:45:25,968] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 29: [2022-11-25 17:45:25,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 29: [2022-11-25 17:45:25,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 29: [2022-11-25 17:45:25,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 6: [2022-11-25 17:45:25,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 6: [2022-11-25 17:45:25,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 6: [2022-11-25 17:45:25,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 25: [2022-11-25 17:45:25,969] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 25: [2022-11-25 17:45:25,970] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 25: [2022-11-25 17:45:25,970] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 33: [2022-11-25 17:45:25,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt. 33: [2022-11-25 17:45:25,970] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt 33: [2022-11-25 17:45:25,970] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 48: [2022-11-25 17:45:25,970] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt. 48: [2022-11-25 17:45:25,971] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt 48: [2022-11-25 17:45:25,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 21: [2022-11-25 17:45:25,969] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 21: [2022-11-25 17:45:25,969] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 36: [2022-11-25 17:45:25,971] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt. 36: [2022-11-25 17:45:25,971] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt 36: [2022-11-25 17:45:25,971] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 25: [2022-11-25 17:45:25,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 25: [2022-11-25 17:45:25,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 25: [2022-11-25 17:45:25,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 27: [2022-11-25 17:45:25,972] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 27: [2022-11-25 17:45:25,972] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 27: [2022-11-25 17:45:25,972] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 11: [2022-11-25 17:45:25,973] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 11: [2022-11-25 17:45:25,973] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 11: [2022-11-25 17:45:25,973] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 22: [2022-11-25 17:45:25,973] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 22: [2022-11-25 17:45:25,973] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 22: [2022-11-25 17:45:25,973] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 58: [2022-11-25 17:45:25,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt. 58: [2022-11-25 17:45:25,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt 7: [2022-11-25 17:45:25,974] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 58: [2022-11-25 17:45:25,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 7: [2022-11-25 17:45:25,974] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 7: [2022-11-25 17:45:25,974] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 38: [2022-11-25 17:45:25,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt. 38: [2022-11-25 17:45:25,976] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt 38: [2022-11-25 17:45:25,976] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 46: [2022-11-25 17:45:25,976] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt. 46: [2022-11-25 17:45:25,976] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt 46: [2022-11-25 17:45:25,976] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 12: [2022-11-25 17:45:25,977] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 12: [2022-11-25 17:45:25,977] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 12: [2022-11-25 17:45:25,977] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 7: [2022-11-25 17:45:25,978] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 7: [2022-11-25 17:45:25,978] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 7: [2022-11-25 17:45:25,978] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 35: [2022-11-25 17:45:25,978] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt. 35: [2022-11-25 17:45:25,978] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt 35: [2022-11-25 17:45:25,978] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 36: [2022-11-25 17:45:25,979] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt. 36: [2022-11-25 17:45:25,979] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt 36: [2022-11-25 17:45:25,979] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 54: [2022-11-25 17:45:25,980] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt. 54: [2022-11-25 17:45:25,980] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt 54: [2022-11-25 17:45:25,980] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 41: [2022-11-25 17:45:25,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt. 41: [2022-11-25 17:45:25,981] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt 59: [2022-11-25 17:45:25,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt. 41: [2022-11-25 17:45:25,981] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 59: [2022-11-25 17:45:25,981] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt 59: [2022-11-25 17:45:25,981] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 52: [2022-11-25 17:45:25,981] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt. 52: [2022-11-25 17:45:25,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt 52: [2022-11-25 17:45:25,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 59: [2022-11-25 17:45:25,982] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt. 59: [2022-11-25 17:45:25,982] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt 59: [2022-11-25 17:45:25,982] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 54: [2022-11-25 17:45:25,998] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt. 54: [2022-11-25 17:45:25,998] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5000/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt 54: [2022-11-25 17:45:25,998] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! 0: successfully saved checkpoint at iteration 5000 to checkpoints_8b7 63: time (ms) | save-checkpoint: 6905.83 63: iteration 5010/ 5494 | consumed samples: 5130240 | consumed tokens: 10506731520 | elapsed time per iteration (s): 6.59 | learning rate: 2.350E-05 | global batch size: 1024 | lm loss: 2.193453E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 155.391 | TFLOPs: 34.74 | 63: iteration 5020/ 5494 | consumed samples: 5140480 | consumed tokens: 10527703040 | elapsed time per iteration (s): 5.66 | learning rate: 2.335E-05 | global batch size: 1024 | lm loss: 2.181195E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.792 | TFLOPs: 40.42 | 63: iteration 5030/ 5494 | consumed samples: 5150720 | consumed tokens: 10548674560 | elapsed time per iteration (s): 5.98 | learning rate: 2.321E-05 | global batch size: 1024 | lm loss: 2.176740E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 171.255 | TFLOPs: 38.29 | 63: iteration 5040/ 5494 | consumed samples: 5160960 | consumed tokens: 10569646080 | elapsed time per iteration (s): 5.62 | learning rate: 2.308E-05 | global batch size: 1024 | lm loss: 2.201088E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.224 | TFLOPs: 40.74 | 63: iteration 5050/ 5494 | consumed samples: 5171200 | consumed tokens: 10590617600 | elapsed time per iteration (s): 5.61 | learning rate: 2.294E-05 | global batch size: 1024 | lm loss: 2.193160E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.578 | TFLOPs: 40.82 | 63: iteration 5060/ 5494 | consumed samples: 5181440 | consumed tokens: 10611589120 | elapsed time per iteration (s): 5.67 | learning rate: 2.281E-05 | global batch size: 1024 | lm loss: 2.174002E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 180.599 | TFLOPs: 40.38 | 63: iteration 5070/ 5494 | consumed samples: 5191680 | consumed tokens: 10632560640 | elapsed time per iteration (s): 5.38 | learning rate: 2.269E-05 | global batch size: 1024 | lm loss: 2.184715E+00 | grad norm: 0.138 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.202 | TFLOPs: 42.52 | 63: iteration 5080/ 5494 | consumed samples: 5201920 | consumed tokens: 10653532160 | elapsed time per iteration (s): 5.62 | learning rate: 2.256E-05 | global batch size: 1024 | lm loss: 2.191732E+00 | grad norm: 0.152 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.160 | TFLOPs: 40.73 | 63: iteration 5090/ 5494 | consumed samples: 5212160 | consumed tokens: 10674503680 | elapsed time per iteration (s): 5.91 | learning rate: 2.244E-05 | global batch size: 1024 | lm loss: 2.183935E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.288 | TFLOPs: 38.74 | 63: iteration 5100/ 5494 | consumed samples: 5222400 | consumed tokens: 10695475200 | elapsed time per iteration (s): 5.41 | learning rate: 2.232E-05 | global batch size: 1024 | lm loss: 2.186187E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.180 | TFLOPs: 42.29 | 63: iteration 5110/ 5494 | consumed samples: 5232640 | consumed tokens: 10716446720 | elapsed time per iteration (s): 5.42 | learning rate: 2.221E-05 | global batch size: 1024 | lm loss: 2.191594E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.064 | TFLOPs: 42.27 | 63: iteration 5120/ 5494 | consumed samples: 5242880 | consumed tokens: 10737418240 | elapsed time per iteration (s): 5.59 | learning rate: 2.209E-05 | global batch size: 1024 | lm loss: 2.186345E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.028 | TFLOPs: 40.92 | 63: iteration 5130/ 5494 | consumed samples: 5253120 | consumed tokens: 10758389760 | elapsed time per iteration (s): 5.77 | learning rate: 2.198E-05 | global batch size: 1024 | lm loss: 2.202552E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 177.581 | TFLOPs: 39.70 | 63: iteration 5140/ 5494 | consumed samples: 5263360 | consumed tokens: 10779361280 | elapsed time per iteration (s): 5.64 | learning rate: 2.188E-05 | global batch size: 1024 | lm loss: 2.190931E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.477 | TFLOPs: 40.57 | 63: iteration 5150/ 5494 | consumed samples: 5273600 | consumed tokens: 10800332800 | elapsed time per iteration (s): 5.71 | learning rate: 2.177E-05 | global batch size: 1024 | lm loss: 2.175324E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.248 | TFLOPs: 40.07 | 63: iteration 5160/ 5494 | consumed samples: 5283840 | consumed tokens: 10821304320 | elapsed time per iteration (s): 5.63 | learning rate: 2.167E-05 | global batch size: 1024 | lm loss: 2.187761E+00 | grad norm: 0.142 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.968 | TFLOPs: 40.68 | 63: iteration 5170/ 5494 | consumed samples: 5294080 | consumed tokens: 10842275840 | elapsed time per iteration (s): 5.74 | learning rate: 2.157E-05 | global batch size: 1024 | lm loss: 2.191279E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.384 | TFLOPs: 39.88 | 63: iteration 5180/ 5494 | consumed samples: 5304320 | consumed tokens: 10863247360 | elapsed time per iteration (s): 5.42 | learning rate: 2.148E-05 | global batch size: 1024 | lm loss: 2.180054E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 188.964 | TFLOPs: 42.25 | 63: iteration 5190/ 5494 | consumed samples: 5314560 | consumed tokens: 10884218880 | elapsed time per iteration (s): 5.40 | learning rate: 2.138E-05 | global batch size: 1024 | lm loss: 2.186482E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.599 | TFLOPs: 42.39 | 63: iteration 5200/ 5494 | consumed samples: 5324800 | consumed tokens: 10905190400 | elapsed time per iteration (s): 5.41 | learning rate: 2.130E-05 | global batch size: 1024 | lm loss: 2.193873E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.438 | TFLOPs: 42.35 | 63: iteration 5210/ 5494 | consumed samples: 5335040 | consumed tokens: 10926161920 | elapsed time per iteration (s): 5.62 | learning rate: 2.121E-05 | global batch size: 1024 | lm loss: 2.176751E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.169 | TFLOPs: 40.73 | 63: iteration 5220/ 5494 | consumed samples: 5345280 | consumed tokens: 10947133440 | elapsed time per iteration (s): 5.49 | learning rate: 2.113E-05 | global batch size: 1024 | lm loss: 2.194238E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.440 | TFLOPs: 41.68 | 63: iteration 5230/ 5494 | consumed samples: 5355520 | consumed tokens: 10968104960 | elapsed time per iteration (s): 5.49 | learning rate: 2.105E-05 | global batch size: 1024 | lm loss: 2.176428E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.630 | TFLOPs: 41.72 | 63: iteration 5240/ 5494 | consumed samples: 5365760 | consumed tokens: 10989076480 | elapsed time per iteration (s): 5.50 | learning rate: 2.097E-05 | global batch size: 1024 | lm loss: 2.173827E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 186.319 | TFLOPs: 41.65 | 63: iteration 5250/ 5494 | consumed samples: 5376000 | consumed tokens: 11010048000 | elapsed time per iteration (s): 5.51 | learning rate: 2.089E-05 | global batch size: 1024 | lm loss: 2.178470E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.837 | TFLOPs: 41.55 | 63: iteration 5260/ 5494 | consumed samples: 5386240 | consumed tokens: 11031019520 | elapsed time per iteration (s): 5.47 | learning rate: 2.082E-05 | global batch size: 1024 | lm loss: 2.194887E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 187.056 | TFLOPs: 41.82 | 63: iteration 5270/ 5494 | consumed samples: 5396480 | consumed tokens: 11051991040 | elapsed time per iteration (s): 5.53 | learning rate: 2.075E-05 | global batch size: 1024 | lm loss: 2.183040E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.069 | TFLOPs: 41.38 | 63: iteration 5280/ 5494 | consumed samples: 5406720 | consumed tokens: 11072962560 | elapsed time per iteration (s): 5.42 | learning rate: 2.069E-05 | global batch size: 1024 | lm loss: 2.187610E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 188.762 | TFLOPs: 42.20 | 63: iteration 5290/ 5494 | consumed samples: 5416960 | consumed tokens: 11093934080 | elapsed time per iteration (s): 5.41 | learning rate: 2.062E-05 | global batch size: 1024 | lm loss: 2.186448E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.329 | TFLOPs: 42.33 | 63: iteration 5300/ 5494 | consumed samples: 5427200 | consumed tokens: 11114905600 | elapsed time per iteration (s): 5.69 | learning rate: 2.057E-05 | global batch size: 1024 | lm loss: 2.174904E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 179.815 | TFLOPs: 40.20 | 63: iteration 5310/ 5494 | consumed samples: 5437440 | consumed tokens: 11135877120 | elapsed time per iteration (s): 5.38 | learning rate: 2.051E-05 | global batch size: 1024 | lm loss: 2.164177E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 190.232 | TFLOPs: 42.53 | 63: iteration 5320/ 5494 | consumed samples: 5447680 | consumed tokens: 11156848640 | elapsed time per iteration (s): 5.91 | learning rate: 2.045E-05 | global batch size: 1024 | lm loss: 2.187306E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 173.209 | TFLOPs: 38.72 | 63: iteration 5330/ 5494 | consumed samples: 5457920 | consumed tokens: 11177820160 | elapsed time per iteration (s): 5.60 | learning rate: 2.040E-05 | global batch size: 1024 | lm loss: 2.190031E+00 | grad norm: 0.137 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.930 | TFLOPs: 40.90 | 63: iteration 5340/ 5494 | consumed samples: 5468160 | consumed tokens: 11198791680 | elapsed time per iteration (s): 5.42 | learning rate: 2.036E-05 | global batch size: 1024 | lm loss: 2.187207E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.080 | TFLOPs: 42.27 | 63: iteration 5350/ 5494 | consumed samples: 5478400 | consumed tokens: 11219763200 | elapsed time per iteration (s): 5.62 | learning rate: 2.031E-05 | global batch size: 1024 | lm loss: 2.182147E+00 | grad norm: 0.139 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 182.130 | TFLOPs: 40.72 | 63: iteration 5360/ 5494 | consumed samples: 5488640 | consumed tokens: 11240734720 | elapsed time per iteration (s): 5.74 | learning rate: 2.027E-05 | global batch size: 1024 | lm loss: 2.200607E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 178.310 | TFLOPs: 39.86 | 63: iteration 5370/ 5494 | consumed samples: 5498880 | consumed tokens: 11261706240 | elapsed time per iteration (s): 5.53 | learning rate: 2.023E-05 | global batch size: 1024 | lm loss: 2.178468E+00 | grad norm: 0.143 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.107 | TFLOPs: 41.38 | 63: iteration 5380/ 5494 | consumed samples: 5509120 | consumed tokens: 11282677760 | elapsed time per iteration (s): 5.57 | learning rate: 2.020E-05 | global batch size: 1024 | lm loss: 2.184606E+00 | grad norm: 0.134 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 183.783 | TFLOPs: 41.09 | 63: iteration 5390/ 5494 | consumed samples: 5519360 | consumed tokens: 11303649280 | elapsed time per iteration (s): 5.52 | learning rate: 2.016E-05 | global batch size: 1024 | lm loss: 2.166841E+00 | grad norm: 0.136 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 185.392 | TFLOPs: 41.45 | 63: iteration 5400/ 5494 | consumed samples: 5529600 | consumed tokens: 11324620800 | elapsed time per iteration (s): 5.65 | learning rate: 2.013E-05 | global batch size: 1024 | lm loss: 2.173445E+00 | grad norm: 0.130 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.123 | TFLOPs: 40.49 | 63: iteration 5410/ 5494 | consumed samples: 5539840 | consumed tokens: 11345592320 | elapsed time per iteration (s): 5.54 | learning rate: 2.011E-05 | global batch size: 1024 | lm loss: 2.202586E+00 | grad norm: 0.131 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 184.855 | TFLOPs: 41.33 | 63: iteration 5420/ 5494 | consumed samples: 5550080 | consumed tokens: 11366563840 | elapsed time per iteration (s): 5.65 | learning rate: 2.008E-05 | global batch size: 1024 | lm loss: 2.173610E+00 | grad norm: 0.126 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.331 | TFLOPs: 40.54 | 63: iteration 5430/ 5494 | consumed samples: 5560320 | consumed tokens: 11387535360 | elapsed time per iteration (s): 5.88 | learning rate: 2.006E-05 | global batch size: 1024 | lm loss: 2.163896E+00 | grad norm: 0.141 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 174.234 | TFLOPs: 38.95 | 63: iteration 5440/ 5494 | consumed samples: 5570560 | consumed tokens: 11408506880 | elapsed time per iteration (s): 5.41 | learning rate: 2.004E-05 | global batch size: 1024 | lm loss: 2.171603E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.430 | TFLOPs: 42.35 | 63: iteration 5450/ 5494 | consumed samples: 5580800 | consumed tokens: 11429478400 | elapsed time per iteration (s): 5.47 | learning rate: 2.003E-05 | global batch size: 1024 | lm loss: 2.176141E+00 | grad norm: 0.132 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 187.147 | TFLOPs: 41.84 | 63: iteration 5460/ 5494 | consumed samples: 5591040 | consumed tokens: 11450449920 | elapsed time per iteration (s): 5.39 | learning rate: 2.002E-05 | global batch size: 1024 | lm loss: 2.182641E+00 | grad norm: 0.128 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 189.809 | TFLOPs: 42.44 | 63: iteration 5470/ 5494 | consumed samples: 5601280 | consumed tokens: 11471421440 | elapsed time per iteration (s): 5.42 | learning rate: 2.001E-05 | global batch size: 1024 | lm loss: 2.193220E+00 | grad norm: 0.133 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 188.830 | TFLOPs: 42.22 | 63: iteration 5480/ 5494 | consumed samples: 5611520 | consumed tokens: 11492392960 | elapsed time per iteration (s): 5.66 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.176518E+00 | grad norm: 0.129 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 181.003 | TFLOPs: 40.47 | 63: iteration 5490/ 5494 | consumed samples: 5621760 | consumed tokens: 11513364480 | elapsed time per iteration (s): 5.42 | learning rate: 2.000E-05 | global batch size: 1024 | lm loss: 2.191150E+00 | grad norm: 0.135 | num zeros: 0.0 | number of skipped iterations: 0 | number of nan iterations: 0 | samples per second: 188.864 | TFLOPs: 42.22 | 0: [after training is done] datetime: 2022-11-25 18:31:21 0: saving checkpoint at iteration 5494 to checkpoints_8b7 63: ------------------------------------------------------------------------------------------------------------ 63: valid loss at the end of training for val data | lm loss value: 2.142454E+00 | lm loss PPL: 8.520322E+00 | 63: ------------------------------------------------------------------------------------------------------------ 0: [2022-11-25 18:31:23,200] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step5494 is begin to save! 0: [2022-11-25 18:31:23,239] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_01-model_01-model_states.pt... 0: [2022-11-25 18:31:23,239] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_01-model_00-model_states.pt... 32: [2022-11-25 18:31:23,241] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_24-model_01-model_states.pt... 32: [2022-11-25 18:31:23,241] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_24-model_00-model_states.pt... 0: [2022-11-25 18:31:23,524] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_01-model_00-model_states.pt. 0: [2022-11-25 18:31:23,525] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_03-model_00-model_states.pt... 0: [2022-11-25 18:31:23,525] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_01-model_01-model_states.pt. 0: [2022-11-25 18:31:23,525] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_03-model_01-model_states.pt... 32: [2022-11-25 18:31:23,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_24-model_01-model_states.pt. 32: [2022-11-25 18:31:23,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_24-model_00-model_states.pt. 32: [2022-11-25 18:31:23,600] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_25-model_01-model_states.pt... 32: [2022-11-25 18:31:23,600] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_25-model_00-model_states.pt... 0: [2022-11-25 18:31:23,755] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_03-model_01-model_states.pt. 0: [2022-11-25 18:31:23,755] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_04-model_01-model_states.pt... 0: [2022-11-25 18:31:23,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_03-model_00-model_states.pt. 0: [2022-11-25 18:31:23,760] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_04-model_00-model_states.pt... 32: [2022-11-25 18:31:23,851] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_25-model_01-model_states.pt. 32: [2022-11-25 18:31:23,852] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_26-model_01-model_states.pt... 32: [2022-11-25 18:31:23,853] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_25-model_00-model_states.pt. 32: [2022-11-25 18:31:23,853] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_26-model_00-model_states.pt... 0: [2022-11-25 18:31:24,001] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_04-model_01-model_states.pt. 0: [2022-11-25 18:31:24,001] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_05-model_01-model_states.pt... 0: [2022-11-25 18:31:24,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_04-model_00-model_states.pt. 0: [2022-11-25 18:31:24,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_05-model_00-model_states.pt... 32: [2022-11-25 18:31:24,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_26-model_01-model_states.pt. 32: [2022-11-25 18:31:24,126] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_27-model_01-model_states.pt... 32: [2022-11-25 18:31:24,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_26-model_00-model_states.pt. 32: [2022-11-25 18:31:24,144] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_27-model_00-model_states.pt... 0: [2022-11-25 18:31:24,256] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_05-model_01-model_states.pt. 0: [2022-11-25 18:31:24,257] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_06-model_01-model_states.pt... 0: [2022-11-25 18:31:24,280] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_05-model_00-model_states.pt. 0: [2022-11-25 18:31:24,280] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_06-model_00-model_states.pt... 32: [2022-11-25 18:31:24,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_27-model_01-model_states.pt. 32: [2022-11-25 18:31:24,381] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_28-model_01-model_states.pt... 32: [2022-11-25 18:31:24,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_27-model_00-model_states.pt. 32: [2022-11-25 18:31:24,397] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_28-model_00-model_states.pt... 0: [2022-11-25 18:31:24,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_06-model_00-model_states.pt. 0: [2022-11-25 18:31:24,527] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_06-model_01-model_states.pt. 0: [2022-11-25 18:31:24,527] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_07-model_01-model_states.pt... 0: [2022-11-25 18:31:24,527] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_07-model_00-model_states.pt... 32: [2022-11-25 18:31:24,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_28-model_01-model_states.pt. 32: [2022-11-25 18:31:24,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_29-model_01-model_states.pt... 32: [2022-11-25 18:31:24,659] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_28-model_00-model_states.pt. 32: [2022-11-25 18:31:24,659] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_29-model_00-model_states.pt... 0: [2022-11-25 18:31:24,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_07-model_01-model_states.pt. 0: [2022-11-25 18:31:24,752] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_08-model_01-model_states.pt... 0: [2022-11-25 18:31:24,762] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_07-model_00-model_states.pt. 0: [2022-11-25 18:31:24,763] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_08-model_00-model_states.pt... 32: [2022-11-25 18:31:24,901] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_29-model_00-model_states.pt. 32: [2022-11-25 18:31:24,902] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_30-model_00-model_states.pt... 32: [2022-11-25 18:31:24,914] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_29-model_01-model_states.pt. 32: [2022-11-25 18:31:24,915] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_30-model_01-model_states.pt... 0: [2022-11-25 18:31:24,999] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_08-model_01-model_states.pt. 0: [2022-11-25 18:31:25,000] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_09-model_01-model_states.pt... 0: [2022-11-25 18:31:25,011] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_08-model_00-model_states.pt. 0: [2022-11-25 18:31:25,012] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_09-model_00-model_states.pt... 32: [2022-11-25 18:31:25,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_30-model_01-model_states.pt. 32: [2022-11-25 18:31:25,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_30-model_00-model_states.pt. 32: [2022-11-25 18:31:25,174] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_31-model_01-model_states.pt... 32: [2022-11-25 18:31:25,174] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_31-model_00-model_states.pt... 0: [2022-11-25 18:31:25,242] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_09-model_01-model_states.pt. 0: [2022-11-25 18:31:25,243] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_10-model_01-model_states.pt... 0: [2022-11-25 18:31:25,247] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_09-model_00-model_states.pt. 0: [2022-11-25 18:31:25,248] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_10-model_00-model_states.pt... 32: [2022-11-25 18:31:25,418] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_31-model_00-model_states.pt. 32: [2022-11-25 18:31:25,418] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_32-model_00-model_states.pt... 32: [2022-11-25 18:31:25,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_31-model_01-model_states.pt. 32: [2022-11-25 18:31:25,421] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_32-model_01-model_states.pt... 0: [2022-11-25 18:31:25,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_10-model_00-model_states.pt. 0: [2022-11-25 18:31:25,479] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_11-model_00-model_states.pt... 0: [2022-11-25 18:31:25,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_10-model_01-model_states.pt. 0: [2022-11-25 18:31:25,488] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_11-model_01-model_states.pt... 32: [2022-11-25 18:31:25,670] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_32-model_01-model_states.pt. 32: [2022-11-25 18:31:25,670] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_33-model_01-model_states.pt... 32: [2022-11-25 18:31:25,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_32-model_00-model_states.pt. 32: [2022-11-25 18:31:25,671] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_33-model_00-model_states.pt... 0: [2022-11-25 18:31:25,708] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_11-model_00-model_states.pt. 0: [2022-11-25 18:31:25,709] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_12-model_00-model_states.pt... 0: [2022-11-25 18:31:25,714] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_11-model_01-model_states.pt. 0: [2022-11-25 18:31:25,714] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_12-model_01-model_states.pt... 32: [2022-11-25 18:31:25,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_33-model_01-model_states.pt. 32: [2022-11-25 18:31:25,897] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_34-model_01-model_states.pt... 32: [2022-11-25 18:31:25,919] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_33-model_00-model_states.pt. 32: [2022-11-25 18:31:25,920] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_34-model_00-model_states.pt... 0: [2022-11-25 18:31:25,929] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_12-model_01-model_states.pt. 0: [2022-11-25 18:31:25,930] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_13-model_01-model_states.pt... 0: [2022-11-25 18:31:25,945] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_12-model_00-model_states.pt. 0: [2022-11-25 18:31:25,946] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_13-model_00-model_states.pt... 0: [2022-11-25 18:31:26,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_13-model_01-model_states.pt. 0: [2022-11-25 18:31:26,174] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_14-model_01-model_states.pt... 32: [2022-11-25 18:31:26,179] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_34-model_01-model_states.pt. 32: [2022-11-25 18:31:26,180] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_35-model_01-model_states.pt... 32: [2022-11-25 18:31:26,180] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_34-model_00-model_states.pt. 32: [2022-11-25 18:31:26,181] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_35-model_00-model_states.pt... 0: [2022-11-25 18:31:26,193] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_13-model_00-model_states.pt. 0: [2022-11-25 18:31:26,193] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_14-model_00-model_states.pt... 0: [2022-11-25 18:31:26,409] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_14-model_00-model_states.pt. 0: [2022-11-25 18:31:26,409] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_15-model_00-model_states.pt... 0: [2022-11-25 18:31:26,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_14-model_01-model_states.pt. 0: [2022-11-25 18:31:26,410] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_15-model_01-model_states.pt... 32: [2022-11-25 18:31:26,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_35-model_00-model_states.pt. 32: [2022-11-25 18:31:26,420] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_36-model_00-model_states.pt... 32: [2022-11-25 18:31:26,421] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_35-model_01-model_states.pt. 32: [2022-11-25 18:31:26,421] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_36-model_01-model_states.pt... 0: [2022-11-25 18:31:26,640] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_15-model_01-model_states.pt. 0: [2022-11-25 18:31:26,641] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_16-model_01-model_states.pt... 0: [2022-11-25 18:31:26,643] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_15-model_00-model_states.pt. 0: [2022-11-25 18:31:26,643] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_16-model_00-model_states.pt... 32: [2022-11-25 18:31:26,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_36-model_00-model_states.pt. 32: [2022-11-25 18:31:26,672] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_37-model_00-model_states.pt... 32: [2022-11-25 18:31:26,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_36-model_01-model_states.pt. 32: [2022-11-25 18:31:26,681] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_37-model_01-model_states.pt... 0: [2022-11-25 18:31:26,872] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_16-model_01-model_states.pt. 0: [2022-11-25 18:31:26,873] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_17-model_01-model_states.pt... 0: [2022-11-25 18:31:26,883] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_16-model_00-model_states.pt. 0: [2022-11-25 18:31:26,883] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_17-model_00-model_states.pt... 32: [2022-11-25 18:31:26,890] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_37-model_00-model_states.pt. 32: [2022-11-25 18:31:26,891] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_38-model_00-model_states.pt... 32: [2022-11-25 18:31:26,903] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_37-model_01-model_states.pt. 32: [2022-11-25 18:31:26,904] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_38-model_01-model_states.pt... 0: [2022-11-25 18:31:27,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_17-model_00-model_states.pt. 0: [2022-11-25 18:31:27,125] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_18-model_00-model_states.pt... 0: [2022-11-25 18:31:27,135] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_17-model_01-model_states.pt. 0: [2022-11-25 18:31:27,136] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_18-model_01-model_states.pt... 32: [2022-11-25 18:31:27,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_38-model_00-model_states.pt. 32: [2022-11-25 18:31:27,139] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_39-model_00-model_states.pt... 32: [2022-11-25 18:31:27,161] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_38-model_01-model_states.pt. 32: [2022-11-25 18:31:27,161] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_39-model_01-model_states.pt... 0: [2022-11-25 18:31:27,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_18-model_00-model_states.pt. 0: [2022-11-25 18:31:27,378] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_18-model_01-model_states.pt. 0: [2022-11-25 18:31:27,378] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_19-model_00-model_states.pt... 0: [2022-11-25 18:31:27,379] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_19-model_01-model_states.pt... 32: [2022-11-25 18:31:27,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_39-model_00-model_states.pt. 32: [2022-11-25 18:31:27,392] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_40-model_00-model_states.pt... 32: [2022-11-25 18:31:27,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_39-model_01-model_states.pt. 32: [2022-11-25 18:31:27,411] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_40-model_01-model_states.pt... 32: [2022-11-25 18:31:27,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_40-model_00-model_states.pt. 0: [2022-11-25 18:31:27,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_19-model_01-model_states.pt. 32: [2022-11-25 18:31:27,623] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_41-model_00-model_states.pt... 0: [2022-11-25 18:31:27,623] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_20-model_01-model_states.pt... 32: [2022-11-25 18:31:27,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_40-model_01-model_states.pt. 0: [2022-11-25 18:31:27,634] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_19-model_00-model_states.pt. 32: [2022-11-25 18:31:27,634] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_41-model_01-model_states.pt... 0: [2022-11-25 18:31:27,634] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_20-model_00-model_states.pt... 32: [2022-11-25 18:31:27,856] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_41-model_00-model_states.pt. 32: [2022-11-25 18:31:27,856] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_42-model_00-model_states.pt... 0: [2022-11-25 18:31:27,864] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_20-model_00-model_states.pt. 0: [2022-11-25 18:31:27,864] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_21-model_00-model_states.pt... 32: [2022-11-25 18:31:27,874] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_41-model_01-model_states.pt. 32: [2022-11-25 18:31:27,875] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_42-model_01-model_states.pt... 0: [2022-11-25 18:31:27,880] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_20-model_01-model_states.pt. 0: [2022-11-25 18:31:27,881] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_21-model_01-model_states.pt... 32: [2022-11-25 18:31:28,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_42-model_00-model_states.pt. 32: [2022-11-25 18:31:28,088] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_43-model_00-model_states.pt... 0: [2022-11-25 18:31:28,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_21-model_01-model_states.pt. 0: [2022-11-25 18:31:28,092] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_21-model_00-model_states.pt. 0: [2022-11-25 18:31:28,093] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_22-model_01-model_states.pt... 0: [2022-11-25 18:31:28,093] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_22-model_00-model_states.pt... 32: [2022-11-25 18:31:28,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_42-model_01-model_states.pt. 32: [2022-11-25 18:31:28,097] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_43-model_01-model_states.pt... 0: [2022-11-25 18:31:28,329] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_22-model_01-model_states.pt. 0: [2022-11-25 18:31:28,329] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_23-model_01-model_states.pt... 32: [2022-11-25 18:31:28,330] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_43-model_00-model_states.pt. 32: [2022-11-25 18:31:28,330] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_44-model_00-model_states.pt... 0: [2022-11-25 18:31:28,331] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_22-model_00-model_states.pt. 0: [2022-11-25 18:31:28,331] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_23-model_00-model_states.pt... 32: [2022-11-25 18:31:28,347] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_43-model_01-model_states.pt. 32: [2022-11-25 18:31:28,347] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_44-model_01-model_states.pt... 0: [2022-11-25 18:31:28,553] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_23-model_01-model_states.pt. 0: [2022-11-25 18:31:28,554] [INFO] [logging.py:68:log_dist] [Rank 1] Saving model checkpoint: checkpoints_8b7/global_step5494/mp_rank_01_model_states.pt 0: [2022-11-25 18:31:28,554] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/mp_rank_01_model_states.pt... 0: [2022-11-25 18:31:28,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_23-model_00-model_states.pt. 0: [2022-11-25 18:31:28,565] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: checkpoints_8b7/global_step5494/mp_rank_00_model_states.pt 0: [2022-11-25 18:31:28,565] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/mp_rank_00_model_states.pt... 32: [2022-11-25 18:31:28,581] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_44-model_00-model_states.pt. 32: [2022-11-25 18:31:28,582] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_46-model_00-model_states.pt... 32: [2022-11-25 18:31:28,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_46-model_00-model_states.pt. 32: [2022-11-25 18:31:28,596] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/mp_rank_02_model_states.pt... 0: [2022-11-25 18:31:28,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/mp_rank_00_model_states.pt. 0: [2022-11-25 18:31:28,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/mp_rank_01_model_states.pt. 32: [2022-11-25 18:31:28,603] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_44-model_01-model_states.pt. 32: [2022-11-25 18:31:28,603] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/layer_46-model_01-model_states.pt... 32: [2022-11-25 18:31:28,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/layer_46-model_01-model_states.pt. 32: [2022-11-25 18:31:28,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/mp_rank_02_model_states.pt. 32: [2022-11-25 18:31:28,606] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/mp_rank_03_model_states.pt... 32: [2022-11-25 18:31:28,608] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/mp_rank_03_model_states.pt. 57: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt... 57: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt... 57: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt... 57: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt... 63: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt... 63: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt... 39: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt... 53: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt... 53: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt... 55: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt... 55: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt... 55: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt... 51: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt... 51: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt... 51: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt... 51: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt... 33: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt... 61: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt... 61: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt... 59: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt... 59: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt... 59: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt... 59: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt... 35: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt... 35: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt... 5: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt... 37: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt... 37: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt... 37: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt... 58: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt... 58: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt... 58: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt... 40: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt... 40: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt... 40: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt... 60: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt... 60: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt... 60: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt... 52: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt... 52: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt... 30: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt... 30: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt... 56: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt... 56: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt... 56: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt... 54: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt... 54: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt... 54: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt... 62: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt... 62: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt... 36: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt... 36: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt... 28: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt... 28: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt... 38: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt... 38: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt... 38: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt... 0: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt... 2: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt... 49: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt... 49: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt... 47: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt... 47: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt... 47: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt... 41: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt... 41: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt... 41: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt... 41: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt... 45: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt... 45: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt... 45: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt... 45: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt... 9: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt... 9: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt... 43: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt... 43: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt... 43: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt... 43: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt... 27: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt... 27: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt... 3: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt... 7: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt... 19: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt... 34: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt... 34: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt... 46: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt... 46: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt... 46: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt... 44: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt... 44: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt... 44: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt... 44: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt... 6: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt... 6: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt... 16: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt... 48: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt... 48: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt... 48: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt... 50: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt... 50: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt... 42: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt... 42: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt... 42: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt... 32: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt... 12: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt... 12: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt... 22: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt... 22: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt... 57: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt... 63: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt... 63: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt... 63: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt... 63: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt... 63: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt... 39: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt... 53: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt... 55: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt... 55: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt... 55: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt... 51: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt... 33: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt... 1: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt... 61: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt... 61: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt... 59: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt... 59: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt... 13: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt... 13: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt... 35: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt... 5: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt... 37: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt... 58: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt... 40: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt... 60: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt... 60: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt... 60: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt... 60: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt... 52: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt... 52: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt... 30: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt... 56: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt... 56: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt... 54: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt... 54: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt... 62: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt... 62: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt... 36: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt... 28: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt... 38: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt... 38: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt... 38: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt... 0: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt... 2: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt... 49: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt... 49: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt... 49: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt... 49: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt... 47: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt... 41: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt... 45: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt... 45: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt... 9: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt... 43: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt... 43: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt... 27: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt... 25: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt... 3: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt... 7: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt... 7: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt... 31: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt... 19: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt... 34: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt... 34: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt... 34: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt... 34: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt... 34: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt... 46: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt... 46: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt... 44: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt... 44: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt... 6: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt... 6: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt... 16: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt... 16: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt... 10: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt... 10: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt... 20: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt... 20: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt... 48: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt... 50: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt... 42: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt... 32: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt... 32: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt... 12: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt... 26: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt... 14: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt... 15: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt... 22: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt... 22: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt... 21: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt... 57: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt... 63: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt... 39: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt... 39: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt... 53: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt... 53: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt... 55: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt... 51: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt... 33: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt... 1: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt... 1: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt... 61: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt... 59: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt... 13: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt... 35: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt... 5: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt... 29: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt... 58: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt... 40: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt... 40: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt... 60: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt... 52: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt... 30: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt... 30: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt... 4: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt... 54: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt... 62: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt... 62: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt... 62: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt... 36: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt... 36: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt... 36: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt... 28: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt... 38: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt... 38: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt... 0: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt... 2: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt... 24: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt... 49: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt... 47: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt... 41: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt... 41: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt... 45: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt... 9: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt... 43: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt... 27: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt... 25: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt... 3: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt... 7: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt... 7: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt... 17: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt... 17: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt... 23: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt... 11: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt... 19: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt... 34: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt... 46: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt... 46: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt... 46: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt... 44: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt... 44: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt... 6: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt... 6: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt... 16: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt... 10: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt... 20: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt... 48: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt... 50: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt... 42: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt... 42: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt... 42: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt... 32: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt... 12: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt... 26: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt... 14: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt... 22: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt... 21: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt... 39: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt... 53: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt... 55: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt... 51: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt... 33: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt... 1: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt... 61: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt... 59: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt... 13: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt... 35: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt... 5: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt... 5: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt... 29: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt... 58: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt... 40: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt... 52: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt... 30: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt... 30: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt... 4: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt... 54: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt... 62: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt... 36: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt... 28: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt... 0: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt... 2: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt... 24: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt... 49: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt... 47: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt... 41: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt... 45: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt... 9: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt... 43: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt... 27: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt... 25: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt... 3: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt... 7: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt... 7: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt... 17: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt... 17: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt... 11: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt... 31: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt... 19: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt... 6: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt... 6: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt... 16: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt... 18: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt... 18: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt... 10: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt... 20: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt... 8: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt... 48: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt... 50: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt... 50: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt... 42: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt... 32: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt... 12: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt... 26: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt... 15: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt... 22: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt... 22: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt... 57: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt... 39: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt... 53: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt... 51: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt... 33: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt... 1: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt... 61: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt... 61: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt... 13: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt... 35: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt... 5: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt... 5: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt... 29: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt... 58: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt... 40: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt... 52: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt... 30: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt... 4: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt... 56: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt... 54: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt... 36: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt... 28: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt... 28: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt... 28: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt... 0: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt... 2: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt... 9: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt... 27: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt... 7: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt... 17: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt... 23: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt... 11: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt... 31: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt... 19: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt... 16: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt... 10: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt... 10: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt... 20: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt... 8: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt... 48: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt... 50: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt... 32: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt... 32: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt... 12: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt... 26: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt... 26: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt... 14: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt... 14: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt... 15: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt... 22: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt... 21: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt... 39: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt... 53: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt... 33: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt... 1: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt... 13: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt... 13: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt... 35: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt... 5: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt... 29: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt... 37: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt... 58: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt... 52: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt... 4: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt... 0: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt... 0: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt... 2: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt... 2: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt... 24: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt... 47: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt... 9: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt... 27: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt... 25: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt... 11: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt... 31: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt... 19: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt... 16: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt... 20: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt... 8: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt... 50: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt... 32: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt... 12: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt... 14: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt... 15: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt... 21: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt... 39: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt... 33: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt... 1: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt... 13: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt... 35: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt... 0: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt... 2: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt... 24: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt... 9: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt... 27: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt... 25: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt... 3: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt... 23: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt... 11: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt... 31: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt... 19: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt... 16: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt... 20: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt... 8: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt... 12: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt... 14: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt... 15: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt... 21: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt... 33: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt... 1: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt... 23: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt... 19: [2022-11-25 18:31:28,817] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving checkpoints_8b7/global_step5494/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt... 32: [2022-11-25 18:31:29,061] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt. 32: [2022-11-25 18:31:29,061] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_0_mp_rank_03_optim_states.pt 32: [2022-11-25 18:31:29,061] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 4: [2022-11-25 18:31:29,062] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt. 4: [2022-11-25 18:31:29,062] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_17_mp_rank_00_optim_states.pt 4: [2022-11-25 18:31:29,062] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 20: [2022-11-25 18:31:29,066] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:31:29,066] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_80_mp_rank_00_optim_states.pt 20: [2022-11-25 18:31:29,066] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 13: [2022-11-25 18:31:29,067] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt. 13: [2022-11-25 18:31:29,067] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_52_mp_rank_00_optim_states.pt 13: [2022-11-25 18:31:29,067] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 14: [2022-11-25 18:31:29,070] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt. 14: [2022-11-25 18:31:29,070] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_58_mp_rank_00_optim_states.pt 14: [2022-11-25 18:31:29,070] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 0: [2022-11-25 18:31:29,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt. 0: [2022-11-25 18:31:29,072] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt 0: [2022-11-25 18:31:29,072] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 0: [2022-11-25 18:31:29,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt. 32: [2022-11-25 18:31:29,072] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt. 32: [2022-11-25 18:31:29,073] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_0_mp_rank_02_optim_states.pt 32: [2022-11-25 18:31:29,073] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 16: [2022-11-25 18:31:29,077] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt. 16: [2022-11-25 18:31:29,077] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_64_mp_rank_01_optim_states.pt 16: [2022-11-25 18:31:29,077] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 14: [2022-11-25 18:31:29,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt. 14: [2022-11-25 18:31:29,082] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_59_mp_rank_01_optim_states.pt 14: [2022-11-25 18:31:29,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 16: [2022-11-25 18:31:29,083] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:31:29,083] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_64_mp_rank_00_optim_states.pt 16: [2022-11-25 18:31:29,083] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 52: [2022-11-25 18:31:29,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt. 52: [2022-11-25 18:31:29,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_80_mp_rank_03_optim_states.pt 52: [2022-11-25 18:31:29,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 18: [2022-11-25 18:31:29,084] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:31:29,084] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_72_mp_rank_00_optim_states.pt 18: [2022-11-25 18:31:29,084] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 52: [2022-11-25 18:31:29,085] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt. 52: [2022-11-25 18:31:29,085] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_83_mp_rank_02_optim_states.pt 52: [2022-11-25 18:31:29,085] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 4: [2022-11-25 18:31:29,086] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt. 4: [2022-11-25 18:31:29,086] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_19_mp_rank_00_optim_states.pt 4: [2022-11-25 18:31:29,086] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 20: [2022-11-25 18:31:29,087] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt. 20: [2022-11-25 18:31:29,087] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_82_mp_rank_01_optim_states.pt 20: [2022-11-25 18:31:29,087] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 20: [2022-11-25 18:31:29,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt. 20: [2022-11-25 18:31:29,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_81_mp_rank_01_optim_states.pt 20: [2022-11-25 18:31:29,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 51: [2022-11-25 18:31:29,089] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt. 51: [2022-11-25 18:31:29,089] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_76_mp_rank_02_optim_states.pt 51: [2022-11-25 18:31:29,089] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 44: [2022-11-25 18:31:29,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt. 44: [2022-11-25 18:31:29,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_48_mp_rank_02_optim_states.pt 44: [2022-11-25 18:31:29,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 19: [2022-11-25 18:31:29,090] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt. 49: [2022-11-25 18:31:29,093] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt. 49: [2022-11-25 18:31:29,093] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_69_mp_rank_02_optim_states.pt 49: [2022-11-25 18:31:29,093] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 0: [2022-11-25 18:31:29,095] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt. 15: [2022-11-25 18:31:29,076] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt. 0: [2022-11-25 18:31:29,095] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_0_mp_rank_01_optim_states.pt 15: [2022-11-25 18:31:29,076] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_60_mp_rank_00_optim_states.pt 0: [2022-11-25 18:31:29,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 15: [2022-11-25 18:31:29,076] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 16: [2022-11-25 18:31:29,096] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt. 16: [2022-11-25 18:31:29,096] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_67_mp_rank_01_optim_states.pt 16: [2022-11-25 18:31:29,096] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 24: [2022-11-25 18:31:29,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:31:29,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_98_mp_rank_00_optim_states.pt 24: [2022-11-25 18:31:29,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 14: [2022-11-25 18:31:29,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt. 14: [2022-11-25 18:31:29,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_58_mp_rank_01_optim_states.pt 14: [2022-11-25 18:31:29,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 13: [2022-11-25 18:31:29,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt. 13: [2022-11-25 18:31:29,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_55_mp_rank_00_optim_states.pt 13: [2022-11-25 18:31:29,098] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 32: [2022-11-25 18:31:29,098] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt. 32: [2022-11-25 18:31:29,098] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_3_mp_rank_02_optim_states.pt 32: [2022-11-25 18:31:29,098] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 8: [2022-11-25 18:31:29,098] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:31:29,098] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt. 8: [2022-11-25 18:31:29,098] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_35_mp_rank_00_optim_states.pt 43: [2022-11-25 18:31:29,098] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt. 8: [2022-11-25 18:31:29,098] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 24: [2022-11-25 18:31:29,098] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_99_mp_rank_00_optim_states.pt 43: [2022-11-25 18:31:29,098] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_47_mp_rank_02_optim_states.pt 24: [2022-11-25 18:31:29,098] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 52: [2022-11-25 18:31:29,098] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt. 43: [2022-11-25 18:31:29,098] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 52: [2022-11-25 18:31:29,098] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_82_mp_rank_02_optim_states.pt 52: [2022-11-25 18:31:29,098] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 8: [2022-11-25 18:31:29,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt. 8: [2022-11-25 18:31:29,099] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_32_mp_rank_00_optim_states.pt 8: [2022-11-25 18:31:29,099] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 13: [2022-11-25 18:31:29,099] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt. 13: [2022-11-25 18:31:29,099] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_53_mp_rank_00_optim_states.pt 13: [2022-11-25 18:31:29,099] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 52: [2022-11-25 18:31:29,100] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt. 52: [2022-11-25 18:31:29,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_81_mp_rank_02_optim_states.pt 52: [2022-11-25 18:31:29,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 50: [2022-11-25 18:31:29,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt. 50: [2022-11-25 18:31:29,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_75_mp_rank_02_optim_states.pt 50: [2022-11-25 18:31:29,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 50: [2022-11-25 18:31:29,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt. 50: [2022-11-25 18:31:29,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_72_mp_rank_02_optim_states.pt 50: [2022-11-25 18:31:29,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 14: [2022-11-25 18:31:29,101] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt. 14: [2022-11-25 18:31:29,101] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_56_mp_rank_00_optim_states.pt 14: [2022-11-25 18:31:29,101] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 50: [2022-11-25 18:31:29,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt. 50: [2022-11-25 18:31:29,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_74_mp_rank_03_optim_states.pt 12: [2022-11-25 18:31:29,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt. 12: [2022-11-25 18:31:29,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_48_mp_rank_00_optim_states.pt 12: [2022-11-25 18:31:29,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 45: [2022-11-25 18:31:29,102] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt. 45: [2022-11-25 18:31:29,102] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_52_mp_rank_02_optim_states.pt 45: [2022-11-25 18:31:29,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 50: [2022-11-25 18:31:29,102] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 19: [2022-11-25 18:31:29,090] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_77_mp_rank_01_optim_states.pt 19: [2022-11-25 18:31:29,090] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 19: [2022-11-25 18:31:29,097] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:31:29,097] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_77_mp_rank_00_optim_states.pt 19: [2022-11-25 18:31:29,097] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 47: [2022-11-25 18:31:29,103] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt. 16: [2022-11-25 18:31:29,104] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt. 5: [2022-11-25 18:31:29,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt. 16: [2022-11-25 18:31:29,104] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_65_mp_rank_01_optim_states.pt 5: [2022-11-25 18:31:29,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_23_mp_rank_01_optim_states.pt 16: [2022-11-25 18:31:29,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 5: [2022-11-25 18:31:29,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 18: [2022-11-25 18:31:29,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt. 18: [2022-11-25 18:31:29,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_72_mp_rank_01_optim_states.pt 18: [2022-11-25 18:31:29,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 42: [2022-11-25 18:31:29,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt. 42: [2022-11-25 18:31:29,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_40_mp_rank_02_optim_states.pt 42: [2022-11-25 18:31:29,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 50: [2022-11-25 18:31:29,105] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt. 50: [2022-11-25 18:31:29,105] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_75_mp_rank_03_optim_states.pt 50: [2022-11-25 18:31:29,105] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 6: [2022-11-25 18:31:29,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:31:29,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:31:29,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_27_mp_rank_00_optim_states.pt 6: [2022-11-25 18:31:29,106] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_24_mp_rank_00_optim_states.pt 6: [2022-11-25 18:31:29,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 6: [2022-11-25 18:31:29,106] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 11: [2022-11-25 18:31:29,106] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:31:29,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_44_mp_rank_00_optim_states.pt 11: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 25: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt. 24: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:31:29,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_96_mp_rank_00_optim_states.pt 3: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt. 3: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt. 3: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt. 3: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt. 3: [2022-11-25 18:31:29,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_13_mp_rank_00_optim_states.pt 3: [2022-11-25 18:31:29,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_14_mp_rank_00_optim_states.pt 3: [2022-11-25 18:31:29,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_15_mp_rank_00_optim_states.pt 3: [2022-11-25 18:31:29,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_12_mp_rank_00_optim_states.pt 3: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 3: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 3: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 3: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 11: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt. 11: [2022-11-25 18:31:29,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_46_mp_rank_01_optim_states.pt 49: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt. 11: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 24: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 49: [2022-11-25 18:31:29,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_71_mp_rank_02_optim_states.pt 49: [2022-11-25 18:31:29,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 47: [2022-11-25 18:31:29,103] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_61_mp_rank_02_optim_states.pt 47: [2022-11-25 18:31:29,103] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 45: [2022-11-25 18:31:29,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt. 45: [2022-11-25 18:31:29,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_54_mp_rank_02_optim_states.pt 45: [2022-11-25 18:31:29,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 45: [2022-11-25 18:31:29,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt. 45: [2022-11-25 18:31:29,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_55_mp_rank_02_optim_states.pt 45: [2022-11-25 18:31:29,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 62: [2022-11-25 18:31:29,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt. 62: [2022-11-25 18:31:29,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_121_mp_rank_02_optim_states.pt 62: [2022-11-25 18:31:29,108] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 5: [2022-11-25 18:31:29,108] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt. 5: [2022-11-25 18:31:29,108] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_22_mp_rank_01_optim_states.pt 5: [2022-11-25 18:31:29,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 6: [2022-11-25 18:31:29,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt. 6: [2022-11-25 18:31:29,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt. 5: [2022-11-25 18:31:29,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:31:29,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:31:29,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_20_mp_rank_00_optim_states.pt 5: [2022-11-25 18:31:29,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_21_mp_rank_00_optim_states.pt 49: [2022-11-25 18:31:29,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt. 6: [2022-11-25 18:31:29,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_26_mp_rank_01_optim_states.pt 6: [2022-11-25 18:31:29,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_24_mp_rank_01_optim_states.pt 42: [2022-11-25 18:31:29,109] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt. 5: [2022-11-25 18:31:29,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 49: [2022-11-25 18:31:29,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_69_mp_rank_03_optim_states.pt 42: [2022-11-25 18:31:29,109] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_43_mp_rank_02_optim_states.pt 5: [2022-11-25 18:31:29,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 49: [2022-11-25 18:31:29,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 6: [2022-11-25 18:31:29,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 42: [2022-11-25 18:31:29,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 6: [2022-11-25 18:31:29,109] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 25: [2022-11-25 18:31:29,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_103_mp_rank_01_optim_states.pt 25: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 25: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt. 25: [2022-11-25 18:31:29,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_103_mp_rank_00_optim_states.pt 25: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 31: [2022-11-25 18:31:29,110] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:31:29,110] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_124_mp_rank_00_optim_states.pt 31: [2022-11-25 18:31:29,110] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 25: [2022-11-25 18:31:29,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt. 42: [2022-11-25 18:31:29,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt. 42: [2022-11-25 18:31:29,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_42_mp_rank_02_optim_states.pt 42: [2022-11-25 18:31:29,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 17: [2022-11-25 18:31:29,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt. 42: [2022-11-25 18:31:29,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt. 42: [2022-11-25 18:31:29,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_41_mp_rank_02_optim_states.pt 42: [2022-11-25 18:31:29,112] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 19: [2022-11-25 18:31:29,107] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_78_mp_rank_01_optim_states.pt 17: [2022-11-25 18:31:29,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_71_mp_rank_01_optim_states.pt 19: [2022-11-25 18:31:29,107] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 13: [2022-11-25 18:31:29,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt. 17: [2022-11-25 18:31:29,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 13: [2022-11-25 18:31:29,112] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_54_mp_rank_00_optim_states.pt 13: [2022-11-25 18:31:29,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 28: [2022-11-25 18:31:29,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:31:29,112] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:31:29,113] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:31:29,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_112_mp_rank_00_optim_states.pt 28: [2022-11-25 18:31:29,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_114_mp_rank_00_optim_states.pt 28: [2022-11-25 18:31:29,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 28: [2022-11-25 18:31:29,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 18: [2022-11-25 18:31:29,113] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_75_mp_rank_00_optim_states.pt 18: [2022-11-25 18:31:29,113] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 19: [2022-11-25 18:31:29,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt. 44: [2022-11-25 18:31:29,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt. 44: [2022-11-25 18:31:29,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt. 44: [2022-11-25 18:31:29,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_50_mp_rank_02_optim_states.pt 44: [2022-11-25 18:31:29,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_48_mp_rank_03_optim_states.pt 44: [2022-11-25 18:31:29,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 44: [2022-11-25 18:31:29,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 46: [2022-11-25 18:31:29,115] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt. 43: [2022-11-25 18:31:29,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt. 43: [2022-11-25 18:31:29,116] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_45_mp_rank_02_optim_states.pt 43: [2022-11-25 18:31:29,116] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 46: [2022-11-25 18:31:29,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_57_mp_rank_02_optim_states.pt 46: [2022-11-25 18:31:29,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 41: [2022-11-25 18:31:29,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt. 0: [2022-11-25 18:31:29,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt. 41: [2022-11-25 18:31:29,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_36_mp_rank_02_optim_states.pt 0: [2022-11-25 18:31:29,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_1_mp_rank_01_optim_states.pt 41: [2022-11-25 18:31:29,116] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt. 41: [2022-11-25 18:31:29,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 25: [2022-11-25 18:31:29,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_101_mp_rank_00_optim_states.pt 0: [2022-11-25 18:31:29,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 41: [2022-11-25 18:31:29,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_37_mp_rank_03_optim_states.pt 25: [2022-11-25 18:31:29,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 41: [2022-11-25 18:31:29,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 4: [2022-11-25 18:31:29,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt. 4: [2022-11-25 18:31:29,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt. 33: [2022-11-25 18:31:29,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt. 33: [2022-11-25 18:31:29,117] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt. 4: [2022-11-25 18:31:29,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_16_mp_rank_00_optim_states.pt 4: [2022-11-25 18:31:29,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_18_mp_rank_00_optim_states.pt 33: [2022-11-25 18:31:29,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_6_mp_rank_02_optim_states.pt 33: [2022-11-25 18:31:29,117] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_6_mp_rank_03_optim_states.pt 4: [2022-11-25 18:31:29,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 4: [2022-11-25 18:31:29,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 33: [2022-11-25 18:31:29,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 33: [2022-11-25 18:31:29,117] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 49: [2022-11-25 18:31:29,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt. 46: [2022-11-25 18:31:29,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt. 49: [2022-11-25 18:31:29,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_68_mp_rank_02_optim_states.pt 49: [2022-11-25 18:31:29,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 46: [2022-11-25 18:31:29,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_58_mp_rank_03_optim_states.pt 46: [2022-11-25 18:31:29,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 43: [2022-11-25 18:31:29,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt. 43: [2022-11-25 18:31:29,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_44_mp_rank_03_optim_states.pt 33: [2022-11-25 18:31:29,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt. 43: [2022-11-25 18:31:29,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 33: [2022-11-25 18:31:29,118] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_7_mp_rank_02_optim_states.pt 33: [2022-11-25 18:31:29,118] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 33: [2022-11-25 18:31:29,118] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt. 33: [2022-11-25 18:31:29,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_4_mp_rank_03_optim_states.pt 33: [2022-11-25 18:31:29,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 61: [2022-11-25 18:31:29,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt. 61: [2022-11-25 18:31:29,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_118_mp_rank_02_optim_states.pt 61: [2022-11-25 18:31:29,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 43: [2022-11-25 18:31:29,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt. 17: [2022-11-25 18:31:29,119] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt. 43: [2022-11-25 18:31:29,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_46_mp_rank_02_optim_states.pt 17: [2022-11-25 18:31:29,119] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_69_mp_rank_00_optim_states.pt 43: [2022-11-25 18:31:29,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 17: [2022-11-25 18:31:29,119] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 17: [2022-11-25 18:31:29,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt. 17: [2022-11-25 18:31:29,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_71_mp_rank_00_optim_states.pt 17: [2022-11-25 18:31:29,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 44: [2022-11-25 18:31:29,120] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt. 44: [2022-11-25 18:31:29,120] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_51_mp_rank_02_optim_states.pt 44: [2022-11-25 18:31:29,120] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 47: [2022-11-25 18:31:29,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt. 47: [2022-11-25 18:31:29,121] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_61_mp_rank_03_optim_states.pt 47: [2022-11-25 18:31:29,121] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 25: [2022-11-25 18:31:29,121] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt. 25: [2022-11-25 18:31:29,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_100_mp_rank_00_optim_states.pt 12: [2022-11-25 18:31:29,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt. 12: [2022-11-25 18:31:29,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_50_mp_rank_00_optim_states.pt 12: [2022-11-25 18:31:29,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 25: [2022-11-25 18:31:29,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 12: [2022-11-25 18:31:29,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt. 53: [2022-11-25 18:31:29,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt. 12: [2022-11-25 18:31:29,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_49_mp_rank_01_optim_states.pt 12: [2022-11-25 18:31:29,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 53: [2022-11-25 18:31:29,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_85_mp_rank_02_optim_states.pt 53: [2022-11-25 18:31:29,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 51: [2022-11-25 18:31:29,122] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt. 51: [2022-11-25 18:31:29,122] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_76_mp_rank_03_optim_states.pt 51: [2022-11-25 18:31:29,122] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 46: [2022-11-25 18:31:29,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt. 46: [2022-11-25 18:31:29,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_58_mp_rank_02_optim_states.pt 26: [2022-11-25 18:31:29,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt. 46: [2022-11-25 18:31:29,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 26: [2022-11-25 18:31:29,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_106_mp_rank_00_optim_states.pt 26: [2022-11-25 18:31:29,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 26: [2022-11-25 18:31:29,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt. 26: [2022-11-25 18:31:29,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_107_mp_rank_01_optim_states.pt 26: [2022-11-25 18:31:29,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 8: [2022-11-25 18:31:29,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt. 8: [2022-11-25 18:31:29,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt. 8: [2022-11-25 18:31:29,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_35_mp_rank_01_optim_states.pt 8: [2022-11-25 18:31:29,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_34_mp_rank_00_optim_states.pt 8: [2022-11-25 18:31:29,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 28: [2022-11-25 18:31:29,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt. 8: [2022-11-25 18:31:29,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 28: [2022-11-25 18:31:29,124] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:31:29,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_112_mp_rank_01_optim_states.pt 28: [2022-11-25 18:31:29,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_115_mp_rank_00_optim_states.pt 28: [2022-11-25 18:31:29,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 28: [2022-11-25 18:31:29,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 51: [2022-11-25 18:31:29,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt. 51: [2022-11-25 18:31:29,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_78_mp_rank_02_optim_states.pt 31: [2022-11-25 18:31:29,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:31:29,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_127_mp_rank_00_optim_states.pt 31: [2022-11-25 18:31:29,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 51: [2022-11-25 18:31:29,125] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 45: [2022-11-25 18:31:29,125] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt. 45: [2022-11-25 18:31:29,125] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_53_mp_rank_02_optim_states.pt 45: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 61: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt. 31: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt. 61: [2022-11-25 18:31:29,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_118_mp_rank_03_optim_states.pt 61: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 31: [2022-11-25 18:31:29,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_125_mp_rank_00_optim_states.pt 31: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 61: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt. 56: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt. 56: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt. 56: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt. 61: [2022-11-25 18:31:29,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_117_mp_rank_02_optim_states.pt 61: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 51: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt. 53: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt. 20: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt. 53: [2022-11-25 18:31:29,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_85_mp_rank_03_optim_states.pt 51: [2022-11-25 18:31:29,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_77_mp_rank_02_optim_states.pt 51: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 20: [2022-11-25 18:31:29,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_81_mp_rank_00_optim_states.pt 20: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 53: [2022-11-25 18:31:29,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 46: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt. 56: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt. 46: [2022-11-25 18:31:29,127] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_56_mp_rank_02_optim_states.pt 56: [2022-11-25 18:31:29,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_96_mp_rank_03_optim_states.pt 56: [2022-11-25 18:31:29,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_98_mp_rank_02_optim_states.pt 56: [2022-11-25 18:31:29,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_96_mp_rank_02_optim_states.pt 46: [2022-11-25 18:31:29,127] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 56: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 56: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 56: [2022-11-25 18:31:29,126] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_99_mp_rank_02_optim_states.pt 56: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 56: [2022-11-25 18:31:29,126] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 31: [2022-11-25 18:31:29,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt. 31: [2022-11-25 18:31:29,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_126_mp_rank_00_optim_states.pt 31: [2022-11-25 18:31:29,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 10: [2022-11-25 18:31:29,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt. 10: [2022-11-25 18:31:29,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt. 10: [2022-11-25 18:31:29,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt. 10: [2022-11-25 18:31:29,128] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt. 10: [2022-11-25 18:31:29,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_40_mp_rank_00_optim_states.pt 10: [2022-11-25 18:31:29,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_41_mp_rank_00_optim_states.pt 10: [2022-11-25 18:31:29,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_43_mp_rank_00_optim_states.pt 10: [2022-11-25 18:31:29,128] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_43_mp_rank_01_optim_states.pt 10: [2022-11-25 18:31:29,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 10: [2022-11-25 18:31:29,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 10: [2022-11-25 18:31:29,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 10: [2022-11-25 18:31:29,128] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 36: [2022-11-25 18:31:29,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt. 36: [2022-11-25 18:31:29,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt. 36: [2022-11-25 18:31:29,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_17_mp_rank_03_optim_states.pt 36: [2022-11-25 18:31:29,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_17_mp_rank_02_optim_states.pt 36: [2022-11-25 18:31:29,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 36: [2022-11-25 18:31:29,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 36: [2022-11-25 18:31:29,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt. 36: [2022-11-25 18:31:29,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt. 36: [2022-11-25 18:31:29,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_16_mp_rank_02_optim_states.pt 17: [2022-11-25 18:31:29,129] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt. 36: [2022-11-25 18:31:29,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 36: [2022-11-25 18:31:29,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_19_mp_rank_02_optim_states.pt 17: [2022-11-25 18:31:29,129] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_70_mp_rank_00_optim_states.pt 36: [2022-11-25 18:31:29,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 17: [2022-11-25 18:31:29,129] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 57: [2022-11-25 18:31:29,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt. 57: [2022-11-25 18:31:29,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_101_mp_rank_02_optim_states.pt 57: [2022-11-25 18:31:29,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 32: [2022-11-25 18:31:29,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt. 32: [2022-11-25 18:31:29,130] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_2_mp_rank_02_optim_states.pt 32: [2022-11-25 18:31:29,130] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 41: [2022-11-25 18:31:29,130] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt. 41: [2022-11-25 18:31:29,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_38_mp_rank_02_optim_states.pt 41: [2022-11-25 18:31:29,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 41: [2022-11-25 18:31:29,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt. 41: [2022-11-25 18:31:29,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_37_mp_rank_02_optim_states.pt 41: [2022-11-25 18:31:29,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 7: [2022-11-25 18:31:29,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:31:29,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt. 7: [2022-11-25 18:31:29,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:31:29,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_30_mp_rank_00_optim_states.pt 7: [2022-11-25 18:31:29,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_31_mp_rank_01_optim_states.pt 7: [2022-11-25 18:31:29,131] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_29_mp_rank_00_optim_states.pt 7: [2022-11-25 18:31:29,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 7: [2022-11-25 18:31:29,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 7: [2022-11-25 18:31:29,131] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 7: [2022-11-25 18:31:29,131] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:31:29,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_28_mp_rank_00_optim_states.pt 7: [2022-11-25 18:31:29,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 47: [2022-11-25 18:31:29,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt. 47: [2022-11-25 18:31:29,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_62_mp_rank_02_optim_states.pt 47: [2022-11-25 18:31:29,132] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 26: [2022-11-25 18:31:29,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:31:29,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_104_mp_rank_00_optim_states.pt 26: [2022-11-25 18:31:29,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 26: [2022-11-25 18:31:29,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt. 26: [2022-11-25 18:31:29,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_106_mp_rank_01_optim_states.pt 26: [2022-11-25 18:31:29,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 60: [2022-11-25 18:31:29,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt. 62: [2022-11-25 18:31:29,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt. 60: [2022-11-25 18:31:29,133] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt. 60: [2022-11-25 18:31:29,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_114_mp_rank_02_optim_states.pt 60: [2022-11-25 18:31:29,133] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_115_mp_rank_02_optim_states.pt 60: [2022-11-25 18:31:29,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 62: [2022-11-25 18:31:29,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_123_mp_rank_03_optim_states.pt 60: [2022-11-25 18:31:29,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 62: [2022-11-25 18:31:29,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 60: [2022-11-25 18:31:29,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt. 60: [2022-11-25 18:31:29,134] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt. 60: [2022-11-25 18:31:29,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_113_mp_rank_02_optim_states.pt 60: [2022-11-25 18:31:29,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 60: [2022-11-25 18:31:29,134] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_112_mp_rank_02_optim_states.pt 60: [2022-11-25 18:31:29,134] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 11: [2022-11-25 18:31:29,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:31:29,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt. 62: [2022-11-25 18:31:29,136] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt. 24: [2022-11-25 18:31:29,136] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_99_mp_rank_01_optim_states.pt 24: [2022-11-25 18:31:29,136] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 53: [2022-11-25 18:31:29,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt. 53: [2022-11-25 18:31:29,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_86_mp_rank_03_optim_states.pt 53: [2022-11-25 18:31:29,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 19: [2022-11-25 18:31:29,115] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_78_mp_rank_00_optim_states.pt 19: [2022-11-25 18:31:29,115] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 19: [2022-11-25 18:31:29,132] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt. 57: [2022-11-25 18:31:29,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt. 19: [2022-11-25 18:31:29,132] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_76_mp_rank_00_optim_states.pt 57: [2022-11-25 18:31:29,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_100_mp_rank_03_optim_states.pt 19: [2022-11-25 18:31:29,133] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 62: [2022-11-25 18:31:29,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_120_mp_rank_02_optim_states.pt 57: [2022-11-25 18:31:29,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 62: [2022-11-25 18:31:29,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 62: [2022-11-25 18:31:29,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt. 57: [2022-11-25 18:31:29,137] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt. 62: [2022-11-25 18:31:29,137] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_122_mp_rank_02_optim_states.pt 62: [2022-11-25 18:31:29,137] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 57: [2022-11-25 18:31:29,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_103_mp_rank_02_optim_states.pt 57: [2022-11-25 18:31:29,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 57: [2022-11-25 18:31:29,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt. 57: [2022-11-25 18:31:29,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_100_mp_rank_02_optim_states.pt 57: [2022-11-25 18:31:29,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 35: [2022-11-25 18:31:29,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt. 35: [2022-11-25 18:31:29,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt. 35: [2022-11-25 18:31:29,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt. 35: [2022-11-25 18:31:29,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt. 35: [2022-11-25 18:31:29,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_14_mp_rank_02_optim_states.pt 35: [2022-11-25 18:31:29,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_13_mp_rank_03_optim_states.pt 35: [2022-11-25 18:31:29,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_15_mp_rank_02_optim_states.pt 35: [2022-11-25 18:31:29,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_12_mp_rank_02_optim_states.pt 35: [2022-11-25 18:31:29,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 35: [2022-11-25 18:31:29,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 35: [2022-11-25 18:31:29,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 35: [2022-11-25 18:31:29,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 12: [2022-11-25 18:31:29,138] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt. 53: [2022-11-25 18:31:29,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt. 53: [2022-11-25 18:31:29,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_84_mp_rank_02_optim_states.pt 53: [2022-11-25 18:31:29,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 47: [2022-11-25 18:31:29,139] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt. 12: [2022-11-25 18:31:29,138] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_49_mp_rank_00_optim_states.pt 12: [2022-11-25 18:31:29,138] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 47: [2022-11-25 18:31:29,139] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_63_mp_rank_02_optim_states.pt 47: [2022-11-25 18:31:29,139] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 11: [2022-11-25 18:31:29,123] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_47_mp_rank_00_optim_states.pt 11: [2022-11-25 18:31:29,123] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 11: [2022-11-25 18:31:29,123] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt. 11: [2022-11-25 18:31:29,124] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_44_mp_rank_01_optim_states.pt 11: [2022-11-25 18:31:29,124] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 15: [2022-11-25 18:31:29,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt. 15: [2022-11-25 18:31:29,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_60_mp_rank_01_optim_states.pt 15: [2022-11-25 18:31:29,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 15: [2022-11-25 18:31:29,111] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt. 15: [2022-11-25 18:31:29,111] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_61_mp_rank_01_optim_states.pt 15: [2022-11-25 18:31:29,111] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 18: [2022-11-25 18:31:29,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt. 18: [2022-11-25 18:31:29,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_75_mp_rank_01_optim_states.pt 18: [2022-11-25 18:31:29,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 15: [2022-11-25 18:31:29,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt. 15: [2022-11-25 18:31:29,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_61_mp_rank_00_optim_states.pt 15: [2022-11-25 18:31:29,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 11: [2022-11-25 18:31:29,141] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt. 15: [2022-11-25 18:31:29,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt. 3: [2022-11-25 18:31:29,142] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt. 3: [2022-11-25 18:31:29,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_13_mp_rank_01_optim_states.pt 11: [2022-11-25 18:31:29,141] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_47_mp_rank_01_optim_states.pt 15: [2022-11-25 18:31:29,142] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_63_mp_rank_01_optim_states.pt 3: [2022-11-25 18:31:29,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 11: [2022-11-25 18:31:29,141] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 15: [2022-11-25 18:31:29,142] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 61: [2022-11-25 18:31:29,143] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt. 61: [2022-11-25 18:31:29,143] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_117_mp_rank_03_optim_states.pt 61: [2022-11-25 18:31:29,143] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 1: [2022-11-25 18:31:29,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:31:29,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:31:29,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt. 1: [2022-11-25 18:31:29,144] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt. 1: [2022-11-25 18:31:29,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt 1: [2022-11-25 18:31:29,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt 1: [2022-11-25 18:31:29,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_4_mp_rank_01_optim_states.pt 1: [2022-11-25 18:31:29,144] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_6_mp_rank_01_optim_states.pt 1: [2022-11-25 18:31:29,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 1: [2022-11-25 18:31:29,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 1: [2022-11-25 18:31:29,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 1: [2022-11-25 18:31:29,144] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 9: [2022-11-25 18:31:29,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt. 9: [2022-11-25 18:31:29,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt. 9: [2022-11-25 18:31:29,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt. 9: [2022-11-25 18:31:29,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt. 9: [2022-11-25 18:31:29,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_39_mp_rank_01_optim_states.pt 9: [2022-11-25 18:31:29,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_37_mp_rank_00_optim_states.pt 9: [2022-11-25 18:31:29,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_36_mp_rank_00_optim_states.pt 9: [2022-11-25 18:31:29,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_38_mp_rank_00_optim_states.pt 9: [2022-11-25 18:31:29,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 9: [2022-11-25 18:31:29,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 9: [2022-11-25 18:31:29,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 9: [2022-11-25 18:31:29,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 18: [2022-11-25 18:31:29,145] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:31:29,145] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_73_mp_rank_00_optim_states.pt 18: [2022-11-25 18:31:29,145] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 25: [2022-11-25 18:31:29,149] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt. 25: [2022-11-25 18:31:29,149] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_100_mp_rank_01_optim_states.pt 25: [2022-11-25 18:31:29,149] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 22: [2022-11-25 18:31:29,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:31:29,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:31:29,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:31:29,150] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt. 22: [2022-11-25 18:31:29,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_91_mp_rank_00_optim_states.pt 22: [2022-11-25 18:31:29,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_90_mp_rank_00_optim_states.pt 22: [2022-11-25 18:31:29,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_91_mp_rank_01_optim_states.pt 22: [2022-11-25 18:31:29,150] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_88_mp_rank_00_optim_states.pt 22: [2022-11-25 18:31:29,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 22: [2022-11-25 18:31:29,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 22: [2022-11-25 18:31:29,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 22: [2022-11-25 18:31:29,150] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 0: [2022-11-25 18:31:29,152] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt 0: [2022-11-25 18:31:29,152] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 23: [2022-11-25 18:31:29,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:31:29,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt. 23: [2022-11-25 18:31:29,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:31:29,156] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:31:29,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_95_mp_rank_00_optim_states.pt 23: [2022-11-25 18:31:29,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_94_mp_rank_01_optim_states.pt 23: [2022-11-25 18:31:29,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_92_mp_rank_00_optim_states.pt 23: [2022-11-25 18:31:29,156] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_94_mp_rank_00_optim_states.pt 23: [2022-11-25 18:31:29,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 23: [2022-11-25 18:31:29,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 23: [2022-11-25 18:31:29,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 23: [2022-11-25 18:31:29,156] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 39: [2022-11-25 18:31:29,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt. 39: [2022-11-25 18:31:29,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt. 39: [2022-11-25 18:31:29,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt. 39: [2022-11-25 18:31:29,159] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt. 39: [2022-11-25 18:31:29,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_28_mp_rank_03_optim_states.pt 39: [2022-11-25 18:31:29,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_31_mp_rank_02_optim_states.pt 39: [2022-11-25 18:31:29,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_30_mp_rank_02_optim_states.pt 39: [2022-11-25 18:31:29,159] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_29_mp_rank_03_optim_states.pt 39: [2022-11-25 18:31:29,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 39: [2022-11-25 18:31:29,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 39: [2022-11-25 18:31:29,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 39: [2022-11-25 18:31:29,159] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 40: [2022-11-25 18:31:29,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt. 40: [2022-11-25 18:31:29,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt. 40: [2022-11-25 18:31:29,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt. 40: [2022-11-25 18:31:29,160] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt. 40: [2022-11-25 18:31:29,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_34_mp_rank_03_optim_states.pt 40: [2022-11-25 18:31:29,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_33_mp_rank_02_optim_states.pt 40: [2022-11-25 18:31:29,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_33_mp_rank_03_optim_states.pt 40: [2022-11-25 18:31:29,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 40: [2022-11-25 18:31:29,160] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_32_mp_rank_02_optim_states.pt 40: [2022-11-25 18:31:29,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 40: [2022-11-25 18:31:29,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 40: [2022-11-25 18:31:29,160] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 0: [2022-11-25 18:31:29,162] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt. 0: [2022-11-25 18:31:29,162] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt 0: [2022-11-25 18:31:29,162] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 30: [2022-11-25 18:31:29,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:31:29,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:31:29,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt. 30: [2022-11-25 18:31:29,172] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:31:29,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_120_mp_rank_00_optim_states.pt 30: [2022-11-25 18:31:29,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_123_mp_rank_00_optim_states.pt 30: [2022-11-25 18:31:29,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_122_mp_rank_00_optim_states.pt 30: [2022-11-25 18:31:29,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 30: [2022-11-25 18:31:29,172] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_120_mp_rank_01_optim_states.pt 30: [2022-11-25 18:31:29,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 30: [2022-11-25 18:31:29,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 30: [2022-11-25 18:31:29,172] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 22: [2022-11-25 18:31:29,174] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt. 22: [2022-11-25 18:31:29,175] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_90_mp_rank_01_optim_states.pt 22: [2022-11-25 18:31:29,175] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 55: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt. 55: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt. 55: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt. 55: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt. 2: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt. 2: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt. 55: [2022-11-25 18:31:29,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_95_mp_rank_02_optim_states.pt 2: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt. 55: [2022-11-25 18:31:29,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_94_mp_rank_02_optim_states.pt 55: [2022-11-25 18:31:29,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_92_mp_rank_02_optim_states.pt 2: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt. 2: [2022-11-25 18:31:29,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_10_mp_rank_00_optim_states.pt 55: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 2: [2022-11-25 18:31:29,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_8_mp_rank_01_optim_states.pt 55: [2022-11-25 18:31:29,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_92_mp_rank_03_optim_states.pt 55: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 2: [2022-11-25 18:31:29,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_9_mp_rank_01_optim_states.pt 2: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 2: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 2: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt. 55: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 2: [2022-11-25 18:31:29,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_11_mp_rank_00_optim_states.pt 55: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 2: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 2: [2022-11-25 18:31:29,178] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_9_mp_rank_00_optim_states.pt 2: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 2: [2022-11-25 18:31:29,178] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 14: [2022-11-25 18:31:29,182] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt. 14: [2022-11-25 18:31:29,182] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_56_mp_rank_01_optim_states.pt 14: [2022-11-25 18:31:29,182] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 24: [2022-11-25 18:31:29,188] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt. 24: [2022-11-25 18:31:29,188] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_98_mp_rank_01_optim_states.pt 24: [2022-11-25 18:31:29,188] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 41: [2022-11-25 18:31:29,189] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt. 41: [2022-11-25 18:31:29,189] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_39_mp_rank_03_optim_states.pt 41: [2022-11-25 18:31:29,189] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 10: [2022-11-25 18:31:29,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt. 8: [2022-11-25 18:31:29,198] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt. 10: [2022-11-25 18:31:29,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_41_mp_rank_01_optim_states.pt 8: [2022-11-25 18:31:29,198] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_34_mp_rank_01_optim_states.pt 10: [2022-11-25 18:31:29,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 8: [2022-11-25 18:31:29,198] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 38: [2022-11-25 18:31:29,204] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt. 38: [2022-11-25 18:31:29,204] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_24_mp_rank_03_optim_states.pt 38: [2022-11-25 18:31:29,204] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 16: [2022-11-25 18:31:29,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt. 16: [2022-11-25 18:31:29,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_66_mp_rank_01_optim_states.pt 16: [2022-11-25 18:31:29,205] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 58: [2022-11-25 18:31:29,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt. 58: [2022-11-25 18:31:29,205] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_104_mp_rank_02_optim_states.pt 58: [2022-11-25 18:31:29,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 9: [2022-11-25 18:31:29,205] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt. 9: [2022-11-25 18:31:29,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_36_mp_rank_01_optim_states.pt 9: [2022-11-25 18:31:29,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 37: [2022-11-25 18:31:29,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt. 34: [2022-11-25 18:31:29,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt. 37: [2022-11-25 18:31:29,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_21_mp_rank_03_optim_states.pt 34: [2022-11-25 18:31:29,206] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_10_mp_rank_03_optim_states.pt 37: [2022-11-25 18:31:29,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 34: [2022-11-25 18:31:29,206] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 54: [2022-11-25 18:31:29,206] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt. 29: [2022-11-25 18:31:29,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt. 29: [2022-11-25 18:31:29,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_118_mp_rank_01_optim_states.pt 29: [2022-11-25 18:31:29,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 54: [2022-11-25 18:31:29,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_91_mp_rank_03_optim_states.pt 54: [2022-11-25 18:31:29,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 63: [2022-11-25 18:31:29,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt. 63: [2022-11-25 18:31:29,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_127_mp_rank_02_optim_states.pt 63: [2022-11-25 18:31:29,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 21: [2022-11-25 18:31:29,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt. 48: [2022-11-25 18:31:29,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt. 21: [2022-11-25 18:31:29,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_87_mp_rank_01_optim_states.pt 48: [2022-11-25 18:31:29,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_65_mp_rank_03_optim_states.pt 48: [2022-11-25 18:31:29,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 21: [2022-11-25 18:31:29,207] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 7: [2022-11-25 18:31:29,207] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt. 7: [2022-11-25 18:31:29,207] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_30_mp_rank_01_optim_states.pt 7: [2022-11-25 18:31:29,208] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 20: [2022-11-25 18:31:29,220] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt. 20: [2022-11-25 18:31:29,221] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_83_mp_rank_01_optim_states.pt 20: [2022-11-25 18:31:29,221] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 13: [2022-11-25 18:31:29,225] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt. 13: [2022-11-25 18:31:29,225] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_53_mp_rank_01_optim_states.pt 13: [2022-11-25 18:31:29,225] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 27: [2022-11-25 18:31:29,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt. 27: [2022-11-25 18:31:29,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt. 27: [2022-11-25 18:31:29,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_108_mp_rank_01_optim_states.pt 27: [2022-11-25 18:31:29,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_111_mp_rank_00_optim_states.pt 27: [2022-11-25 18:31:29,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt. 27: [2022-11-25 18:31:29,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt. 27: [2022-11-25 18:31:29,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 27: [2022-11-25 18:31:29,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 27: [2022-11-25 18:31:29,237] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt. 27: [2022-11-25 18:31:29,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_110_mp_rank_00_optim_states.pt 27: [2022-11-25 18:31:29,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_108_mp_rank_00_optim_states.pt 27: [2022-11-25 18:31:29,237] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_109_mp_rank_00_optim_states.pt 27: [2022-11-25 18:31:29,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 27: [2022-11-25 18:31:29,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 27: [2022-11-25 18:31:29,237] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 23: [2022-11-25 18:31:29,258] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt. 23: [2022-11-25 18:31:29,258] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_95_mp_rank_01_optim_states.pt 23: [2022-11-25 18:31:29,259] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 52: [2022-11-25 18:31:29,277] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt. 52: [2022-11-25 18:31:29,277] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_81_mp_rank_03_optim_states.pt 52: [2022-11-25 18:31:29,277] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 30: [2022-11-25 18:31:29,281] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt. 30: [2022-11-25 18:31:29,281] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_123_mp_rank_01_optim_states.pt 30: [2022-11-25 18:31:29,281] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 55: [2022-11-25 18:31:29,282] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt. 55: [2022-11-25 18:31:29,282] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_93_mp_rank_02_optim_states.pt 55: [2022-11-25 18:31:29,282] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 42: [2022-11-25 18:31:29,332] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt. 42: [2022-11-25 18:31:29,333] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_40_mp_rank_03_optim_states.pt 42: [2022-11-25 18:31:29,333] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 59: [2022-11-25 18:31:29,346] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt. 59: [2022-11-25 18:31:29,346] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_111_mp_rank_02_optim_states.pt 59: [2022-11-25 18:31:29,346] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 1: [2022-11-25 18:31:29,354] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt. 1: [2022-11-25 18:31:29,354] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_7_mp_rank_01_optim_states.pt 1: [2022-11-25 18:31:29,354] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 32: [2022-11-25 18:31:29,360] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt. 32: [2022-11-25 18:31:29,360] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_2_mp_rank_03_optim_states.pt 32: [2022-11-25 18:31:29,360] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 17: [2022-11-25 18:31:29,368] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt. 17: [2022-11-25 18:31:29,368] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_68_mp_rank_01_optim_states.pt 17: [2022-11-25 18:31:29,368] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 45: [2022-11-25 18:31:29,371] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt. 45: [2022-11-25 18:31:29,371] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_55_mp_rank_03_optim_states.pt 45: [2022-11-25 18:31:29,371] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 5: [2022-11-25 18:31:29,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:31:29,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_22_mp_rank_00_optim_states.pt 5: [2022-11-25 18:31:29,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 4: [2022-11-25 18:31:29,376] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt. 4: [2022-11-25 18:31:29,376] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_16_mp_rank_01_optim_states.pt 4: [2022-11-25 18:31:29,376] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 43: [2022-11-25 18:31:29,380] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt. 43: [2022-11-25 18:31:29,381] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_45_mp_rank_03_optim_states.pt 43: [2022-11-25 18:31:29,381] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 38: [2022-11-25 18:31:29,382] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt. 38: [2022-11-25 18:31:29,382] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_24_mp_rank_02_optim_states.pt 38: [2022-11-25 18:31:29,382] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 51: [2022-11-25 18:31:29,387] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt. 51: [2022-11-25 18:31:29,387] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_78_mp_rank_03_optim_states.pt 51: [2022-11-25 18:31:29,387] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 40: [2022-11-25 18:31:29,391] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt. 40: [2022-11-25 18:31:29,391] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_35_mp_rank_03_optim_states.pt 40: [2022-11-25 18:31:29,391] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 54: [2022-11-25 18:31:29,392] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt. 54: [2022-11-25 18:31:29,392] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_89_mp_rank_03_optim_states.pt 54: [2022-11-25 18:31:29,392] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 57: [2022-11-25 18:31:29,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt. 50: [2022-11-25 18:31:29,395] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt. 57: [2022-11-25 18:31:29,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_102_mp_rank_03_optim_states.pt 50: [2022-11-25 18:31:29,395] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_72_mp_rank_03_optim_states.pt 57: [2022-11-25 18:31:29,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 50: [2022-11-25 18:31:29,395] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 6: [2022-11-25 18:31:29,397] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt. 6: [2022-11-25 18:31:29,397] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_25_mp_rank_01_optim_states.pt 44: [2022-11-25 18:31:29,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt. 6: [2022-11-25 18:31:29,397] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 44: [2022-11-25 18:31:29,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_51_mp_rank_03_optim_states.pt 44: [2022-11-25 18:31:29,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 48: [2022-11-25 18:31:29,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt. 48: [2022-11-25 18:31:29,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_64_mp_rank_02_optim_states.pt 48: [2022-11-25 18:31:29,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 12: [2022-11-25 18:31:29,398] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt. 12: [2022-11-25 18:31:29,398] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_50_mp_rank_01_optim_states.pt 12: [2022-11-25 18:31:29,398] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 60: [2022-11-25 18:31:29,399] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt. 60: [2022-11-25 18:31:29,399] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_113_mp_rank_03_optim_states.pt 60: [2022-11-25 18:31:29,399] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 26: [2022-11-25 18:31:29,400] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:31:29,400] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_107_mp_rank_00_optim_states.pt 26: [2022-11-25 18:31:29,400] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 62: [2022-11-25 18:31:29,401] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt. 62: [2022-11-25 18:31:29,401] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_123_mp_rank_02_optim_states.pt 62: [2022-11-25 18:31:29,401] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 35: [2022-11-25 18:31:29,402] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt. 35: [2022-11-25 18:31:29,402] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_12_mp_rank_03_optim_states.pt 35: [2022-11-25 18:31:29,402] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 21: [2022-11-25 18:31:29,404] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt. 21: [2022-11-25 18:31:29,404] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_85_mp_rank_01_optim_states.pt 21: [2022-11-25 18:31:29,404] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 19: [2022-11-25 18:31:29,405] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt. 19: [2022-11-25 18:31:29,405] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_79_mp_rank_01_optim_states.pt 19: [2022-11-25 18:31:29,405] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 46: [2022-11-25 18:31:29,408] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt. 46: [2022-11-25 18:31:29,408] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_57_mp_rank_03_optim_states.pt 46: [2022-11-25 18:31:29,408] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 49: [2022-11-25 18:31:29,410] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt. 49: [2022-11-25 18:31:29,410] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_70_mp_rank_02_optim_states.pt 49: [2022-11-25 18:31:29,410] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 37: [2022-11-25 18:31:29,413] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt. 37: [2022-11-25 18:31:29,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_22_mp_rank_03_optim_states.pt 37: [2022-11-25 18:31:29,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 53: [2022-11-25 18:31:29,414] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt. 53: [2022-11-25 18:31:29,414] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_86_mp_rank_02_optim_states.pt 53: [2022-11-25 18:31:29,414] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 33: [2022-11-25 18:31:29,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt. 33: [2022-11-25 18:31:29,415] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_5_mp_rank_03_optim_states.pt 33: [2022-11-25 18:31:29,415] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 58: [2022-11-25 18:31:29,415] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt. 58: [2022-11-25 18:31:29,416] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_104_mp_rank_03_optim_states.pt 58: [2022-11-25 18:31:29,416] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 28: [2022-11-25 18:31:29,420] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt. 28: [2022-11-25 18:31:29,420] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_113_mp_rank_01_optim_states.pt 28: [2022-11-25 18:31:29,420] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 47: [2022-11-25 18:31:29,422] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt. 47: [2022-11-25 18:31:29,422] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_62_mp_rank_03_optim_states.pt 61: [2022-11-25 18:31:29,426] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt. 61: [2022-11-25 18:31:29,426] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_116_mp_rank_02_optim_states.pt 61: [2022-11-25 18:31:29,426] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 36: [2022-11-25 18:31:29,427] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt. 36: [2022-11-25 18:31:29,427] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_18_mp_rank_02_optim_states.pt 36: [2022-11-25 18:31:29,427] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 15: [2022-11-25 18:31:29,428] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt. 15: [2022-11-25 18:31:29,428] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_63_mp_rank_00_optim_states.pt 15: [2022-11-25 18:31:29,428] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 47: [2022-11-25 18:31:29,422] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 63: [2022-11-25 18:31:29,429] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt. 63: [2022-11-25 18:31:29,430] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_127_mp_rank_03_optim_states.pt 63: [2022-11-25 18:31:29,430] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 18: [2022-11-25 18:31:29,435] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt. 18: [2022-11-25 18:31:29,435] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_73_mp_rank_01_optim_states.pt 18: [2022-11-25 18:31:29,435] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 29: [2022-11-25 18:31:29,440] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt. 29: [2022-11-25 18:31:29,440] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_117_mp_rank_01_optim_states.pt 29: [2022-11-25 18:31:29,440] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 11: [2022-11-25 18:31:29,445] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt. 11: [2022-11-25 18:31:29,445] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_45_mp_rank_01_optim_states.pt 11: [2022-11-25 18:31:29,445] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 56: [2022-11-25 18:31:29,447] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt. 56: [2022-11-25 18:31:29,447] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_97_mp_rank_03_optim_states.pt 56: [2022-11-25 18:31:29,447] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 39: [2022-11-25 18:31:29,452] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt. 39: [2022-11-25 18:31:29,452] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_29_mp_rank_02_optim_states.pt 39: [2022-11-25 18:31:29,452] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 31: [2022-11-25 18:31:29,454] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt. 31: [2022-11-25 18:31:29,454] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_125_mp_rank_01_optim_states.pt 31: [2022-11-25 18:31:29,454] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 27: [2022-11-25 18:31:29,457] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt. 27: [2022-11-25 18:31:29,457] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_110_mp_rank_01_optim_states.pt 27: [2022-11-25 18:31:29,457] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 24: [2022-11-25 18:31:29,461] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt. 24: [2022-11-25 18:31:29,461] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_97_mp_rank_01_optim_states.pt 24: [2022-11-25 18:31:29,461] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 0: [2022-11-25 18:31:29,464] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt. 0: [2022-11-25 18:31:29,464] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt 0: [2022-11-25 18:31:29,464] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 3: [2022-11-25 18:31:29,467] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt. 3: [2022-11-25 18:31:29,467] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_14_mp_rank_01_optim_states.pt 3: [2022-11-25 18:31:29,467] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 22: [2022-11-25 18:31:29,469] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt. 22: [2022-11-25 18:31:29,470] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_88_mp_rank_01_optim_states.pt 22: [2022-11-25 18:31:29,470] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 34: [2022-11-25 18:31:29,471] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt. 34: [2022-11-25 18:31:29,471] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_9_mp_rank_03_optim_states.pt 34: [2022-11-25 18:31:29,471] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 41: [2022-11-25 18:31:29,472] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt. 41: [2022-11-25 18:31:29,472] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_36_mp_rank_03_optim_states.pt 41: [2022-11-25 18:31:29,472] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 14: [2022-11-25 18:31:29,473] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt. 14: [2022-11-25 18:31:29,474] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_57_mp_rank_01_optim_states.pt 14: [2022-11-25 18:31:29,474] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 2: [2022-11-25 18:31:29,476] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt. 2: [2022-11-25 18:31:29,476] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_8_mp_rank_00_optim_states.pt 2: [2022-11-25 18:31:29,476] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 8: [2022-11-25 18:31:29,477] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt. 8: [2022-11-25 18:31:29,477] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_33_mp_rank_01_optim_states.pt 8: [2022-11-25 18:31:29,477] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 16: [2022-11-25 18:31:29,478] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:31:29,478] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_66_mp_rank_00_optim_states.pt 16: [2022-11-25 18:31:29,478] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 25: [2022-11-25 18:31:29,479] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt. 25: [2022-11-25 18:31:29,479] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_102_mp_rank_00_optim_states.pt 25: [2022-11-25 18:31:29,479] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 10: [2022-11-25 18:31:29,486] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt. 10: [2022-11-25 18:31:29,486] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_42_mp_rank_01_optim_states.pt 10: [2022-11-25 18:31:29,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 59: [2022-11-25 18:31:29,487] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt. 59: [2022-11-25 18:31:29,487] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_108_mp_rank_02_optim_states.pt 59: [2022-11-25 18:31:29,487] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 9: [2022-11-25 18:31:29,495] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt. 9: [2022-11-25 18:31:29,495] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_38_mp_rank_01_optim_states.pt 9: [2022-11-25 18:31:29,495] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 20: [2022-11-25 18:31:29,498] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt. 20: [2022-11-25 18:31:29,499] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_80_mp_rank_01_optim_states.pt 20: [2022-11-25 18:31:29,499] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 1: [2022-11-25 18:31:29,501] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:31:29,501] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt 1: [2022-11-25 18:31:29,502] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 55: [2022-11-25 18:31:29,503] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt. 55: [2022-11-25 18:31:29,503] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_93_mp_rank_03_optim_states.pt 55: [2022-11-25 18:31:29,503] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 23: [2022-11-25 18:31:29,504] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt. 23: [2022-11-25 18:31:29,504] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_93_mp_rank_00_optim_states.pt 23: [2022-11-25 18:31:29,504] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 5: [2022-11-25 18:31:29,506] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt. 5: [2022-11-25 18:31:29,506] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_20_mp_rank_01_optim_states.pt 5: [2022-11-25 18:31:29,506] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 51: [2022-11-25 18:31:29,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt. 51: [2022-11-25 18:31:29,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_79_mp_rank_03_optim_states.pt 51: [2022-11-25 18:31:29,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 7: [2022-11-25 18:31:29,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt. 7: [2022-11-25 18:31:29,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_28_mp_rank_01_optim_states.pt 7: [2022-11-25 18:31:29,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 61: [2022-11-25 18:31:29,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt. 61: [2022-11-25 18:31:29,508] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_119_mp_rank_03_optim_states.pt 61: [2022-11-25 18:31:29,508] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 35: [2022-11-25 18:31:29,508] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt. 35: [2022-11-25 18:31:29,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_14_mp_rank_03_optim_states.pt 35: [2022-11-25 18:31:29,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 30: [2022-11-25 18:31:29,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt. 30: [2022-11-25 18:31:29,509] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_122_mp_rank_01_optim_states.pt 30: [2022-11-25 18:31:29,509] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 62: [2022-11-25 18:31:29,509] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt. 62: [2022-11-25 18:31:29,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_122_mp_rank_03_optim_states.pt 62: [2022-11-25 18:31:29,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 57: [2022-11-25 18:31:29,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt. 57: [2022-11-25 18:31:29,510] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_102_mp_rank_02_optim_states.pt 57: [2022-11-25 18:31:29,510] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 32: [2022-11-25 18:31:29,510] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt. 32: [2022-11-25 18:31:29,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_1_mp_rank_02_optim_states.pt 32: [2022-11-25 18:31:29,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 12: [2022-11-25 18:31:29,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt. 12: [2022-11-25 18:31:29,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_51_mp_rank_01_optim_states.pt 12: [2022-11-25 18:31:29,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 13: [2022-11-25 18:31:29,511] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt. 13: [2022-11-25 18:31:29,511] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_52_mp_rank_01_optim_states.pt 13: [2022-11-25 18:31:29,511] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 4: [2022-11-25 18:31:29,513] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt. 4: [2022-11-25 18:31:29,513] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_18_mp_rank_01_optim_states.pt 4: [2022-11-25 18:31:29,514] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 40: [2022-11-25 18:31:29,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt. 40: [2022-11-25 18:31:29,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_32_mp_rank_03_optim_states.pt 40: [2022-11-25 18:31:29,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 17: [2022-11-25 18:31:29,516] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt. 17: [2022-11-25 18:31:29,516] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_70_mp_rank_01_optim_states.pt 17: [2022-11-25 18:31:29,516] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 52: [2022-11-25 18:31:29,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt. 52: [2022-11-25 18:31:29,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_82_mp_rank_03_optim_states.pt 52: [2022-11-25 18:31:29,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 6: [2022-11-25 18:31:29,517] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:31:29,517] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_25_mp_rank_00_optim_states.pt 6: [2022-11-25 18:31:29,517] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 45: [2022-11-25 18:31:29,520] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt. 45: [2022-11-25 18:31:29,520] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_54_mp_rank_03_optim_states.pt 45: [2022-11-25 18:31:29,520] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 26: [2022-11-25 18:31:29,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt. 26: [2022-11-25 18:31:29,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_105_mp_rank_01_optim_states.pt 26: [2022-11-25 18:31:29,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 53: [2022-11-25 18:31:29,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt. 47: [2022-11-25 18:31:29,521] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt. 53: [2022-11-25 18:31:29,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_84_mp_rank_03_optim_states.pt 47: [2022-11-25 18:31:29,521] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_60_mp_rank_02_optim_states.pt 53: [2022-11-25 18:31:29,521] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 47: [2022-11-25 18:31:29,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 43: [2022-11-25 18:31:29,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt. 42: [2022-11-25 18:31:29,522] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt. 43: [2022-11-25 18:31:29,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_47_mp_rank_03_optim_states.pt 43: [2022-11-25 18:31:29,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 42: [2022-11-25 18:31:29,522] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_41_mp_rank_03_optim_states.pt 42: [2022-11-25 18:31:29,522] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 50: [2022-11-25 18:31:29,526] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt. 50: [2022-11-25 18:31:29,526] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_74_mp_rank_02_optim_states.pt 50: [2022-11-25 18:31:29,526] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 56: [2022-11-25 18:31:29,528] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt. 56: [2022-11-25 18:31:29,528] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_98_mp_rank_03_optim_states.pt 56: [2022-11-25 18:31:29,528] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 44: [2022-11-25 18:31:29,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt. 44: [2022-11-25 18:31:29,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_50_mp_rank_03_optim_states.pt 33: [2022-11-25 18:31:29,534] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt. 44: [2022-11-25 18:31:29,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 33: [2022-11-25 18:31:29,534] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_5_mp_rank_02_optim_states.pt 33: [2022-11-25 18:31:29,534] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 46: [2022-11-25 18:31:29,539] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt. 46: [2022-11-25 18:31:29,539] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_59_mp_rank_03_optim_states.pt 46: [2022-11-25 18:31:29,539] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 11: [2022-11-25 18:31:29,540] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:31:29,540] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_46_mp_rank_00_optim_states.pt 11: [2022-11-25 18:31:29,540] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 60: [2022-11-25 18:31:29,541] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt. 60: [2022-11-25 18:31:29,541] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_114_mp_rank_03_optim_states.pt 60: [2022-11-25 18:31:29,541] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 15: [2022-11-25 18:31:29,542] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt. 15: [2022-11-25 18:31:29,542] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_62_mp_rank_00_optim_states.pt 15: [2022-11-25 18:31:29,542] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 54: [2022-11-25 18:31:29,543] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt. 54: [2022-11-25 18:31:29,543] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_89_mp_rank_02_optim_states.pt 54: [2022-11-25 18:31:29,543] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 22: [2022-11-25 18:31:29,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt. 22: [2022-11-25 18:31:29,548] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_89_mp_rank_00_optim_states.pt 22: [2022-11-25 18:31:29,548] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 49: [2022-11-25 18:31:29,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt. 49: [2022-11-25 18:31:29,548] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_71_mp_rank_03_optim_states.pt 49: [2022-11-25 18:31:29,548] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 3: [2022-11-25 18:31:29,548] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt. 3: [2022-11-25 18:31:29,549] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_12_mp_rank_01_optim_states.pt 3: [2022-11-25 18:31:29,549] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 18: [2022-11-25 18:31:29,549] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt. 18: [2022-11-25 18:31:29,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_74_mp_rank_01_optim_states.pt 18: [2022-11-25 18:31:29,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 58: [2022-11-25 18:31:29,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt. 58: [2022-11-25 18:31:29,550] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_105_mp_rank_03_optim_states.pt 58: [2022-11-25 18:31:29,550] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 0: [2022-11-25 18:31:29,550] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt. 0: [2022-11-25 18:31:29,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_2_mp_rank_01_optim_states.pt 0: [2022-11-25 18:31:29,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 2: [2022-11-25 18:31:29,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt. 2: [2022-11-25 18:31:29,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_10_mp_rank_01_optim_states.pt 2: [2022-11-25 18:31:29,551] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 41: [2022-11-25 18:31:29,551] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt. 41: [2022-11-25 18:31:29,551] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_39_mp_rank_02_optim_states.pt 41: [2022-11-25 18:31:29,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 28: [2022-11-25 18:31:29,552] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt. 28: [2022-11-25 18:31:29,552] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_114_mp_rank_01_optim_states.pt 28: [2022-11-25 18:31:29,552] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 19: [2022-11-25 18:31:29,554] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt. 19: [2022-11-25 18:31:29,554] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_76_mp_rank_01_optim_states.pt 19: [2022-11-25 18:31:29,554] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 16: [2022-11-25 18:31:29,556] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:31:29,556] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_65_mp_rank_00_optim_states.pt 16: [2022-11-25 18:31:29,556] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 14: [2022-11-25 18:31:29,558] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt. 14: [2022-11-25 18:31:29,558] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_59_mp_rank_00_optim_states.pt 14: [2022-11-25 18:31:29,558] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 48: [2022-11-25 18:31:29,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt. 48: [2022-11-25 18:31:29,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_65_mp_rank_02_optim_states.pt 48: [2022-11-25 18:31:29,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 31: [2022-11-25 18:31:29,560] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt. 31: [2022-11-25 18:31:29,560] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_127_mp_rank_01_optim_states.pt 31: [2022-11-25 18:31:29,560] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 25: [2022-11-25 18:31:29,561] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt. 25: [2022-11-25 18:31:29,561] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_102_mp_rank_01_optim_states.pt 25: [2022-11-25 18:31:29,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 8: [2022-11-25 18:31:29,562] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt. 8: [2022-11-25 18:31:29,562] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_33_mp_rank_00_optim_states.pt 8: [2022-11-25 18:31:29,562] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 10: [2022-11-25 18:31:29,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt. 10: [2022-11-25 18:31:29,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_40_mp_rank_01_optim_states.pt 10: [2022-11-25 18:31:29,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 37: [2022-11-25 18:31:29,563] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt. 37: [2022-11-25 18:31:29,563] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_20_mp_rank_02_optim_states.pt 37: [2022-11-25 18:31:29,563] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 55: [2022-11-25 18:31:29,564] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt. 55: [2022-11-25 18:31:29,564] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_95_mp_rank_03_optim_states.pt 55: [2022-11-25 18:31:29,564] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 27: [2022-11-25 18:31:29,565] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt. 27: [2022-11-25 18:31:29,565] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_109_mp_rank_01_optim_states.pt 27: [2022-11-25 18:31:29,565] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 23: [2022-11-25 18:31:29,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt. 32: [2022-11-25 18:31:29,566] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt. 32: [2022-11-25 18:31:29,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_1_mp_rank_03_optim_states.pt 23: [2022-11-25 18:31:29,566] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_93_mp_rank_01_optim_states.pt 32: [2022-11-25 18:31:29,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 23: [2022-11-25 18:31:29,566] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 29: [2022-11-25 18:31:29,567] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt. 29: [2022-11-25 18:31:29,567] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_119_mp_rank_01_optim_states.pt 29: [2022-11-25 18:31:29,567] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 13: [2022-11-25 18:31:29,568] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt. 13: [2022-11-25 18:31:29,568] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_54_mp_rank_01_optim_states.pt 13: [2022-11-25 18:31:29,568] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 21: [2022-11-25 18:31:29,569] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:31:29,569] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_87_mp_rank_00_optim_states.pt 21: [2022-11-25 18:31:29,569] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 39: [2022-11-25 18:31:29,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt. 39: [2022-11-25 18:31:29,570] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_28_mp_rank_02_optim_states.pt 39: [2022-11-25 18:31:29,570] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 30: [2022-11-25 18:31:29,570] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt. 30: [2022-11-25 18:31:29,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_121_mp_rank_01_optim_states.pt 30: [2022-11-25 18:31:29,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 24: [2022-11-25 18:31:29,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt. 24: [2022-11-25 18:31:29,571] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_96_mp_rank_01_optim_states.pt 24: [2022-11-25 18:31:29,571] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 36: [2022-11-25 18:31:29,571] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt. 36: [2022-11-25 18:31:29,572] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_18_mp_rank_03_optim_states.pt 36: [2022-11-25 18:31:29,572] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 52: [2022-11-25 18:31:29,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt. 52: [2022-11-25 18:31:29,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_83_mp_rank_03_optim_states.pt 52: [2022-11-25 18:31:29,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 40: [2022-11-25 18:31:29,573] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt. 40: [2022-11-25 18:31:29,573] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_34_mp_rank_02_optim_states.pt 40: [2022-11-25 18:31:29,573] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 20: [2022-11-25 18:31:29,575] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:31:29,575] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_83_mp_rank_00_optim_states.pt 20: [2022-11-25 18:31:29,575] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 6: [2022-11-25 18:31:29,576] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt. 6: [2022-11-25 18:31:29,576] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_27_mp_rank_01_optim_states.pt 6: [2022-11-25 18:31:29,576] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 19: [2022-11-25 18:31:29,578] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt. 19: [2022-11-25 18:31:29,578] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_79_mp_rank_00_optim_states.pt 19: [2022-11-25 18:31:29,578] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 51: [2022-11-25 18:31:29,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt. 51: [2022-11-25 18:31:29,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_77_mp_rank_03_optim_states.pt 51: [2022-11-25 18:31:29,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 1: [2022-11-25 18:31:29,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt. 60: [2022-11-25 18:31:29,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt. 57: [2022-11-25 18:31:29,579] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt. 1: [2022-11-25 18:31:29,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_5_mp_rank_01_optim_states.pt 61: [2022-11-25 18:31:29,580] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt. 60: [2022-11-25 18:31:29,579] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_112_mp_rank_03_optim_states.pt 57: [2022-11-25 18:31:29,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_103_mp_rank_03_optim_states.pt 1: [2022-11-25 18:31:29,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 61: [2022-11-25 18:31:29,580] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_119_mp_rank_02_optim_states.pt 60: [2022-11-25 18:31:29,579] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 57: [2022-11-25 18:31:29,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 61: [2022-11-25 18:31:29,580] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 7: [2022-11-25 18:31:29,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt. 42: [2022-11-25 18:31:29,582] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt. 7: [2022-11-25 18:31:29,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_29_mp_rank_01_optim_states.pt 42: [2022-11-25 18:31:29,582] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_43_mp_rank_03_optim_states.pt 42: [2022-11-25 18:31:29,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 7: [2022-11-25 18:31:29,582] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 62: [2022-11-25 18:31:29,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt. 62: [2022-11-25 18:31:29,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_120_mp_rank_03_optim_states.pt 62: [2022-11-25 18:31:29,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 9: [2022-11-25 18:31:29,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:31:29,583] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt. 5: [2022-11-25 18:31:29,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_21_mp_rank_01_optim_states.pt 9: [2022-11-25 18:31:29,583] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_39_mp_rank_00_optim_states.pt 5: [2022-11-25 18:31:29,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 9: [2022-11-25 18:31:29,583] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 53: [2022-11-25 18:31:29,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt. 53: [2022-11-25 18:31:29,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_87_mp_rank_03_optim_states.pt 53: [2022-11-25 18:31:29,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 49: [2022-11-25 18:31:29,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt. 49: [2022-11-25 18:31:29,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_68_mp_rank_03_optim_states.pt 49: [2022-11-25 18:31:29,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 44: [2022-11-25 18:31:29,584] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt. 44: [2022-11-25 18:31:29,584] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_49_mp_rank_03_optim_states.pt 44: [2022-11-25 18:31:29,584] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 26: [2022-11-25 18:31:29,585] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt. 26: [2022-11-25 18:31:29,585] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_105_mp_rank_00_optim_states.pt 26: [2022-11-25 18:31:29,585] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 45: [2022-11-25 18:31:29,586] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt. 45: [2022-11-25 18:31:29,586] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_53_mp_rank_03_optim_states.pt 45: [2022-11-25 18:31:29,586] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 35: [2022-11-25 18:31:29,587] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt. 35: [2022-11-25 18:31:29,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_15_mp_rank_03_optim_states.pt 35: [2022-11-25 18:31:29,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 34: [2022-11-25 18:31:29,588] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt. 34: [2022-11-25 18:31:29,588] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_9_mp_rank_02_optim_states.pt 34: [2022-11-25 18:31:29,588] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 47: [2022-11-25 18:31:29,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt. 28: [2022-11-25 18:31:29,589] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt. 47: [2022-11-25 18:31:29,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_60_mp_rank_03_optim_states.pt 28: [2022-11-25 18:31:29,589] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_115_mp_rank_01_optim_states.pt 47: [2022-11-25 18:31:29,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 28: [2022-11-25 18:31:29,589] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 50: [2022-11-25 18:31:29,590] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt. 50: [2022-11-25 18:31:29,590] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_73_mp_rank_02_optim_states.pt 50: [2022-11-25 18:31:29,590] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 17: [2022-11-25 18:31:29,591] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt. 17: [2022-11-25 18:31:29,591] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_68_mp_rank_00_optim_states.pt 17: [2022-11-25 18:31:29,591] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 12: [2022-11-25 18:31:29,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt. 12: [2022-11-25 18:31:29,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_48_mp_rank_01_optim_states.pt 12: [2022-11-25 18:31:29,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 31: [2022-11-25 18:31:29,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt. 31: [2022-11-25 18:31:29,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_124_mp_rank_01_optim_states.pt 15: [2022-11-25 18:31:29,592] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt. 31: [2022-11-25 18:31:29,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 15: [2022-11-25 18:31:29,592] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_62_mp_rank_01_optim_states.pt 15: [2022-11-25 18:31:29,592] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 33: [2022-11-25 18:31:29,593] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt. 33: [2022-11-25 18:31:29,593] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_4_mp_rank_02_optim_states.pt 33: [2022-11-25 18:31:29,593] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 36: [2022-11-25 18:31:29,594] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt. 36: [2022-11-25 18:31:29,594] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_19_mp_rank_03_optim_states.pt 36: [2022-11-25 18:31:29,594] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 11: [2022-11-25 18:31:29,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt. 11: [2022-11-25 18:31:29,595] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_45_mp_rank_00_optim_states.pt 11: [2022-11-25 18:31:29,595] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 4: [2022-11-25 18:31:29,596] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt. 4: [2022-11-25 18:31:29,596] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_17_mp_rank_01_optim_states.pt 4: [2022-11-25 18:31:29,596] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 14: [2022-11-25 18:31:29,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt. 48: [2022-11-25 18:31:29,597] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt. 14: [2022-11-25 18:31:29,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_57_mp_rank_00_optim_states.pt 48: [2022-11-25 18:31:29,597] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_66_mp_rank_02_optim_states.pt 14: [2022-11-25 18:31:29,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 48: [2022-11-25 18:31:29,597] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 56: [2022-11-25 18:31:29,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt. 56: [2022-11-25 18:31:29,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_99_mp_rank_03_optim_states.pt 56: [2022-11-25 18:31:29,598] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 43: [2022-11-25 18:31:29,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt. 3: [2022-11-25 18:31:29,598] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt. 43: [2022-11-25 18:31:29,598] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_46_mp_rank_03_optim_states.pt 43: [2022-11-25 18:31:29,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 3: [2022-11-25 18:31:29,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_15_mp_rank_01_optim_states.pt 3: [2022-11-25 18:31:29,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 17: [2022-11-25 18:31:29,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt. 17: [2022-11-25 18:31:29,599] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_69_mp_rank_01_optim_states.pt 17: [2022-11-25 18:31:29,599] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 54: [2022-11-25 18:31:29,599] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt. 54: [2022-11-25 18:31:29,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_88_mp_rank_03_optim_states.pt 54: [2022-11-25 18:31:29,600] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 18: [2022-11-25 18:31:29,600] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt. 18: [2022-11-25 18:31:29,600] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_74_mp_rank_00_optim_states.pt 18: [2022-11-25 18:31:29,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 24: [2022-11-25 18:31:29,601] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt. 24: [2022-11-25 18:31:29,601] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_97_mp_rank_00_optim_states.pt 24: [2022-11-25 18:31:29,601] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 27: [2022-11-25 18:31:29,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt. 27: [2022-11-25 18:31:29,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_111_mp_rank_01_optim_states.pt 27: [2022-11-25 18:31:29,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 41: [2022-11-25 18:31:29,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt. 41: [2022-11-25 18:31:29,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_38_mp_rank_03_optim_states.pt 41: [2022-11-25 18:31:29,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 0: [2022-11-25 18:31:29,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt. 0: [2022-11-25 18:31:29,602] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_3_mp_rank_01_optim_states.pt 0: [2022-11-25 18:31:29,602] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 38: [2022-11-25 18:31:29,602] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt. 38: [2022-11-25 18:31:29,603] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_27_mp_rank_03_optim_states.pt 38: [2022-11-25 18:31:29,603] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 2: [2022-11-25 18:31:29,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt. 2: [2022-11-25 18:31:29,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_11_mp_rank_01_optim_states.pt 2: [2022-11-25 18:31:29,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 39: [2022-11-25 18:31:29,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt. 39: [2022-11-25 18:31:29,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_31_mp_rank_03_optim_states.pt 39: [2022-11-25 18:31:29,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 61: [2022-11-25 18:31:29,604] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt. 61: [2022-11-25 18:31:29,604] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_116_mp_rank_03_optim_states.pt 61: [2022-11-25 18:31:29,604] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 22: [2022-11-25 18:31:29,605] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt. 22: [2022-11-25 18:31:29,605] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_89_mp_rank_01_optim_states.pt 22: [2022-11-25 18:31:29,605] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 38: [2022-11-25 18:31:29,606] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt. 38: [2022-11-25 18:31:29,606] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_27_mp_rank_02_optim_states.pt 38: [2022-11-25 18:31:29,606] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 51: [2022-11-25 18:31:29,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt. 51: [2022-11-25 18:31:29,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_79_mp_rank_02_optim_states.pt 51: [2022-11-25 18:31:29,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 25: [2022-11-25 18:31:29,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt. 25: [2022-11-25 18:31:29,607] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_101_mp_rank_01_optim_states.pt 25: [2022-11-25 18:31:29,607] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 10: [2022-11-25 18:31:29,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt. 28: [2022-11-25 18:31:29,607] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt. 10: [2022-11-25 18:31:29,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_42_mp_rank_00_optim_states.pt 28: [2022-11-25 18:31:29,608] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_113_mp_rank_00_optim_states.pt 10: [2022-11-25 18:31:29,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 28: [2022-11-25 18:31:29,608] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 45: [2022-11-25 18:31:29,609] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt. 45: [2022-11-25 18:31:29,609] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_52_mp_rank_03_optim_states.pt 45: [2022-11-25 18:31:29,609] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 8: [2022-11-25 18:31:29,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt. 8: [2022-11-25 18:31:29,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_32_mp_rank_01_optim_states.pt 8: [2022-11-25 18:31:29,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 16: [2022-11-25 18:31:29,610] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt. 16: [2022-11-25 18:31:29,610] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_67_mp_rank_00_optim_states.pt 16: [2022-11-25 18:31:29,610] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 53: [2022-11-25 18:31:29,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt. 62: [2022-11-25 18:31:29,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt. 53: [2022-11-25 18:31:29,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_87_mp_rank_02_optim_states.pt 62: [2022-11-25 18:31:29,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_121_mp_rank_03_optim_states.pt 43: [2022-11-25 18:31:29,611] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt. 53: [2022-11-25 18:31:29,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 62: [2022-11-25 18:31:29,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 43: [2022-11-25 18:31:29,611] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_44_mp_rank_02_optim_states.pt 43: [2022-11-25 18:31:29,611] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 5: [2022-11-25 18:31:29,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt. 5: [2022-11-25 18:31:29,612] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_23_mp_rank_00_optim_states.pt 5: [2022-11-25 18:31:29,612] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 7: [2022-11-25 18:31:29,612] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt. 7: [2022-11-25 18:31:29,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_31_mp_rank_00_optim_states.pt 7: [2022-11-25 18:31:29,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 1: [2022-11-25 18:31:29,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt. 1: [2022-11-25 18:31:29,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt 1: [2022-11-25 18:31:29,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 20: [2022-11-25 18:31:29,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt. 20: [2022-11-25 18:31:29,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_82_mp_rank_00_optim_states.pt 20: [2022-11-25 18:31:29,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 46: [2022-11-25 18:31:29,613] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt. 46: [2022-11-25 18:31:29,613] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_56_mp_rank_03_optim_states.pt 46: [2022-11-25 18:31:29,613] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 57: [2022-11-25 18:31:29,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt. 57: [2022-11-25 18:31:29,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_101_mp_rank_03_optim_states.pt 57: [2022-11-25 18:31:29,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 32: [2022-11-25 18:31:29,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt. 32: [2022-11-25 18:31:29,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_3_mp_rank_03_optim_states.pt 32: [2022-11-25 18:31:29,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 40: [2022-11-25 18:31:29,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt. 40: [2022-11-25 18:31:29,614] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_35_mp_rank_02_optim_states.pt 40: [2022-11-25 18:31:29,614] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 58: [2022-11-25 18:31:29,614] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt. 58: [2022-11-25 18:31:29,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_106_mp_rank_03_optim_states.pt 58: [2022-11-25 18:31:29,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 13: [2022-11-25 18:31:29,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt. 13: [2022-11-25 18:31:29,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_55_mp_rank_01_optim_states.pt 13: [2022-11-25 18:31:29,615] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 23: [2022-11-25 18:31:29,615] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt. 23: [2022-11-25 18:31:29,615] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_92_mp_rank_01_optim_states.pt 23: [2022-11-25 18:31:29,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 37: [2022-11-25 18:31:29,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt. 37: [2022-11-25 18:31:29,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_20_mp_rank_03_optim_states.pt 37: [2022-11-25 18:31:29,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 42: [2022-11-25 18:31:29,616] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt. 42: [2022-11-25 18:31:29,616] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_42_mp_rank_03_optim_states.pt 42: [2022-11-25 18:31:29,616] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 30: [2022-11-25 18:31:29,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt. 30: [2022-11-25 18:31:29,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_121_mp_rank_00_optim_states.pt 30: [2022-11-25 18:31:29,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 47: [2022-11-25 18:31:29,617] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt. 47: [2022-11-25 18:31:29,617] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_63_mp_rank_03_optim_states.pt 47: [2022-11-25 18:31:29,617] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 63: [2022-11-25 18:31:29,618] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt. 63: [2022-11-25 18:31:29,618] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_124_mp_rank_03_optim_states.pt 63: [2022-11-25 18:31:29,618] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 44: [2022-11-25 18:31:29,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt. 44: [2022-11-25 18:31:29,619] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_49_mp_rank_02_optim_states.pt 44: [2022-11-25 18:31:29,619] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 21: [2022-11-25 18:31:29,619] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:31:29,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_86_mp_rank_00_optim_states.pt 21: [2022-11-25 18:31:29,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 26: [2022-11-25 18:31:29,620] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt. 26: [2022-11-25 18:31:29,620] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_104_mp_rank_01_optim_states.pt 26: [2022-11-25 18:31:29,620] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 49: [2022-11-25 18:31:29,621] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt. 49: [2022-11-25 18:31:29,621] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_70_mp_rank_03_optim_states.pt 49: [2022-11-25 18:31:29,621] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 52: [2022-11-25 18:31:29,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt. 56: [2022-11-25 18:31:29,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt. 55: [2022-11-25 18:31:29,622] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt. 52: [2022-11-25 18:31:29,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_80_mp_rank_02_optim_states.pt 56: [2022-11-25 18:31:29,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_97_mp_rank_02_optim_states.pt 55: [2022-11-25 18:31:29,622] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_94_mp_rank_03_optim_states.pt 52: [2022-11-25 18:31:29,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 55: [2022-11-25 18:31:29,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 4: [2022-11-25 18:31:29,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt. 4: [2022-11-25 18:31:29,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_19_mp_rank_01_optim_states.pt 9: [2022-11-25 18:31:29,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt. 9: [2022-11-25 18:31:29,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_37_mp_rank_01_optim_states.pt 4: [2022-11-25 18:31:29,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 9: [2022-11-25 18:31:29,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 6: [2022-11-25 18:31:29,623] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt. 6: [2022-11-25 18:31:29,623] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_26_mp_rank_00_optim_states.pt 6: [2022-11-25 18:31:29,623] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 59: [2022-11-25 18:31:29,624] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt. 59: [2022-11-25 18:31:29,624] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_110_mp_rank_03_optim_states.pt 59: [2022-11-25 18:31:29,624] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 12: [2022-11-25 18:31:29,625] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt. 12: [2022-11-25 18:31:29,625] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_51_mp_rank_00_optim_states.pt 12: [2022-11-25 18:31:29,625] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 50: [2022-11-25 18:31:29,626] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt. 50: [2022-11-25 18:31:29,626] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_73_mp_rank_03_optim_states.pt 50: [2022-11-25 18:31:29,626] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 36: [2022-11-25 18:31:29,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt. 36: [2022-11-25 18:31:29,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_16_mp_rank_03_optim_states.pt 36: [2022-11-25 18:31:29,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 35: [2022-11-25 18:31:29,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt. 35: [2022-11-25 18:31:29,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_13_mp_rank_02_optim_states.pt 29: [2022-11-25 18:31:29,627] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt. 29: [2022-11-25 18:31:29,627] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_116_mp_rank_00_optim_states.pt 29: [2022-11-25 18:31:29,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 35: [2022-11-25 18:31:29,627] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 34: [2022-11-25 18:31:29,628] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt. 34: [2022-11-25 18:31:29,628] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_8_mp_rank_02_optim_states.pt 34: [2022-11-25 18:31:29,628] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 56: [2022-11-25 18:31:29,622] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 60: [2022-11-25 18:31:29,632] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt. 60: [2022-11-25 18:31:29,632] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_115_mp_rank_03_optim_states.pt 60: [2022-11-25 18:31:29,632] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 33: [2022-11-25 18:31:29,639] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt. 33: [2022-11-25 18:31:29,639] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_7_mp_rank_03_optim_states.pt 33: [2022-11-25 18:31:29,639] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 21: [2022-11-25 18:31:29,641] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt. 39: [2022-11-25 18:31:29,650] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt. 39: [2022-11-25 18:31:29,650] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_30_mp_rank_03_optim_states.pt 39: [2022-11-25 18:31:29,650] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 31: [2022-11-25 18:31:29,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt. 31: [2022-11-25 18:31:29,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_126_mp_rank_01_optim_states.pt 31: [2022-11-25 18:31:29,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 46: [2022-11-25 18:31:29,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt. 46: [2022-11-25 18:31:29,651] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_59_mp_rank_02_optim_states.pt 46: [2022-11-25 18:31:29,651] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 38: [2022-11-25 18:31:29,651] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt. 38: [2022-11-25 18:31:29,652] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_25_mp_rank_03_optim_states.pt 38: [2022-11-25 18:31:29,652] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 21: [2022-11-25 18:31:29,641] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_86_mp_rank_01_optim_states.pt 21: [2022-11-25 18:31:29,641] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 63: [2022-11-25 18:31:29,665] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt. 63: [2022-11-25 18:31:29,665] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_124_mp_rank_02_optim_states.pt 63: [2022-11-25 18:31:29,665] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 59: [2022-11-25 18:31:29,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt. 59: [2022-11-25 18:31:29,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_111_mp_rank_03_optim_states.pt 59: [2022-11-25 18:31:29,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 58: [2022-11-25 18:31:29,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt. 58: [2022-11-25 18:31:29,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_106_mp_rank_02_optim_states.pt 59: [2022-11-25 18:31:29,671] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt. 58: [2022-11-25 18:31:29,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 59: [2022-11-25 18:31:29,671] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_109_mp_rank_02_optim_states.pt 59: [2022-11-25 18:31:29,671] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 29: [2022-11-25 18:31:29,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt. 29: [2022-11-25 18:31:29,672] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_116_mp_rank_01_optim_states.pt 29: [2022-11-25 18:31:29,672] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 37: [2022-11-25 18:31:29,672] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt. 37: [2022-11-25 18:31:29,673] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_23_mp_rank_03_optim_states.pt 37: [2022-11-25 18:31:29,673] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 48: [2022-11-25 18:31:29,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt. 48: [2022-11-25 18:31:29,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_67_mp_rank_02_optim_states.pt 48: [2022-11-25 18:31:29,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 34: [2022-11-25 18:31:29,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt. 34: [2022-11-25 18:31:29,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_11_mp_rank_03_optim_states.pt 34: [2022-11-25 18:31:29,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 38: [2022-11-25 18:31:29,674] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt. 38: [2022-11-25 18:31:29,674] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_25_mp_rank_02_optim_states.pt 38: [2022-11-25 18:31:29,674] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 63: [2022-11-25 18:31:29,680] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt. 63: [2022-11-25 18:31:29,680] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_126_mp_rank_02_optim_states.pt 63: [2022-11-25 18:31:29,680] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 54: [2022-11-25 18:31:29,684] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt. 54: [2022-11-25 18:31:29,684] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_90_mp_rank_02_optim_states.pt 54: [2022-11-25 18:31:29,684] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 48: [2022-11-25 18:31:29,697] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt. 48: [2022-11-25 18:31:29,697] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_66_mp_rank_03_optim_states.pt 48: [2022-11-25 18:31:29,698] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 37: [2022-11-25 18:31:29,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt. 37: [2022-11-25 18:31:29,700] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_21_mp_rank_02_optim_states.pt 37: [2022-11-25 18:31:29,700] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 59: [2022-11-25 18:31:29,700] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt. 59: [2022-11-25 18:31:29,701] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_108_mp_rank_03_optim_states.pt 59: [2022-11-25 18:31:29,701] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 63: [2022-11-25 18:31:29,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt. 63: [2022-11-25 18:31:29,702] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_126_mp_rank_03_optim_states.pt 63: [2022-11-25 18:31:29,702] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 34: [2022-11-25 18:31:29,702] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt. 34: [2022-11-25 18:31:29,703] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_8_mp_rank_03_optim_states.pt 34: [2022-11-25 18:31:29,703] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 29: [2022-11-25 18:31:29,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt. 29: [2022-11-25 18:31:29,704] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_117_mp_rank_00_optim_states.pt 29: [2022-11-25 18:31:29,704] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 54: [2022-11-25 18:31:29,704] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt. 54: [2022-11-25 18:31:29,705] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_90_mp_rank_03_optim_states.pt 54: [2022-11-25 18:31:29,705] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 21: [2022-11-25 18:31:29,709] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:31:29,709] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_85_mp_rank_00_optim_states.pt 21: [2022-11-25 18:31:29,709] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 59: [2022-11-25 18:31:29,723] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt. 59: [2022-11-25 18:31:29,723] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_109_mp_rank_03_optim_states.pt 59: [2022-11-25 18:31:29,723] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 21: [2022-11-25 18:31:29,725] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt. 21: [2022-11-25 18:31:29,725] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_84_mp_rank_00_optim_states.pt 21: [2022-11-25 18:31:29,725] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 34: [2022-11-25 18:31:29,727] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt. 34: [2022-11-25 18:31:29,727] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_10_mp_rank_02_optim_states.pt 34: [2022-11-25 18:31:29,727] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 29: [2022-11-25 18:31:29,729] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt. 29: [2022-11-25 18:31:29,729] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_118_mp_rank_00_optim_states.pt 29: [2022-11-25 18:31:29,729] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 21: [2022-11-25 18:31:29,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt. 21: [2022-11-25 18:31:29,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_84_mp_rank_01_optim_states.pt 58: [2022-11-25 18:31:29,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt. 21: [2022-11-25 18:31:29,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 58: [2022-11-25 18:31:29,732] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_107_mp_rank_03_optim_states.pt 58: [2022-11-25 18:31:29,732] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 54: [2022-11-25 18:31:29,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt. 54: [2022-11-25 18:31:29,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_88_mp_rank_02_optim_states.pt 54: [2022-11-25 18:31:29,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 38: [2022-11-25 18:31:29,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt. 38: [2022-11-25 18:31:29,733] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_26_mp_rank_02_optim_states.pt 38: [2022-11-25 18:31:29,733] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 58: [2022-11-25 18:31:29,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt. 58: [2022-11-25 18:31:29,733] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt. 58: [2022-11-25 18:31:29,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_107_mp_rank_02_optim_states.pt 58: [2022-11-25 18:31:29,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 58: [2022-11-25 18:31:29,734] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_105_mp_rank_02_optim_states.pt 58: [2022-11-25 18:31:29,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 54: [2022-11-25 18:31:29,737] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt. 54: [2022-11-25 18:31:29,737] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_91_mp_rank_02_optim_states.pt 54: [2022-11-25 18:31:29,737] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 37: [2022-11-25 18:31:29,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt. 37: [2022-11-25 18:31:29,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_22_mp_rank_02_optim_states.pt 37: [2022-11-25 18:31:29,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 34: [2022-11-25 18:31:29,739] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt. 34: [2022-11-25 18:31:29,739] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_11_mp_rank_02_optim_states.pt 34: [2022-11-25 18:31:29,739] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 37: [2022-11-25 18:31:29,747] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt. 37: [2022-11-25 18:31:29,747] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_23_mp_rank_02_optim_states.pt 37: [2022-11-25 18:31:29,747] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 38: [2022-11-25 18:31:29,752] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt. 38: [2022-11-25 18:31:29,752] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_26_mp_rank_03_optim_states.pt 38: [2022-11-25 18:31:29,752] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 48: [2022-11-25 18:31:29,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt. 48: [2022-11-25 18:31:29,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_64_mp_rank_03_optim_states.pt 48: [2022-11-25 18:31:29,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 48: [2022-11-25 18:31:29,753] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt. 48: [2022-11-25 18:31:29,753] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_67_mp_rank_03_optim_states.pt 48: [2022-11-25 18:31:29,753] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 63: [2022-11-25 18:31:29,759] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt. 63: [2022-11-25 18:31:29,759] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_125_mp_rank_03_optim_states.pt 63: [2022-11-25 18:31:29,759] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 29: [2022-11-25 18:31:29,767] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt. 29: [2022-11-25 18:31:29,767] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_119_mp_rank_00_optim_states.pt 29: [2022-11-25 18:31:29,767] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 63: [2022-11-25 18:31:29,795] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt. 63: [2022-11-25 18:31:29,795] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_125_mp_rank_02_optim_states.pt 63: [2022-11-25 18:31:29,795] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 59: [2022-11-25 18:31:29,924] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt. 59: [2022-11-25 18:31:29,924] [INFO] [engine.py:3213:_save_zero_checkpoint] bf16_zero checkpoint saved checkpoints_8b7/global_step5494/bf16_zero_pp_rank_110_mp_rank_02_optim_states.pt 59: [2022-11-25 18:31:29,924] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5494 is ready now! 0: successfully saved checkpoint at iteration 5494 to checkpoints_8b7 63: ------------------------------------------------------------------------------------------------------------ 63: test loss at the end of training for test data | lm loss value: 2.137998E+00 | lm loss PPL: 8.482440E+00 | 63: ------------------------------------------------------------------------------------------------------------ 61: END 2059276: Fri Nov 25 18:31:42 EET 2022 57: END 2059276: Fri Nov 25 18:31:42 EET 2022 63: END 2059276: Fri Nov 25 18:31:42 EET 2022 39: END 2059276: Fri Nov 25 18:31:42 EET 2022 53: END 2059276: Fri Nov 25 18:31:42 EET 2022 55: END 2059276: Fri Nov 25 18:31:42 EET 2022 51: END 2059276: Fri Nov 25 18:31:42 EET 2022 33: END 2059276: Fri Nov 25 18:31:42 EET 2022 1: END 2059276: Fri Nov 25 18:31:42 EET 2022 59: END 2059276: Fri Nov 25 18:31:42 EET 2022 13: END 2059276: Fri Nov 25 18:31:42 EET 2022 35: END 2059276: Fri Nov 25 18:31:42 EET 2022 5: END 2059276: Fri Nov 25 18:31:42 EET 2022 29: END 2059276: Fri Nov 25 18:31:42 EET 2022 37: END 2059276: Fri Nov 25 18:31:42 EET 2022 58: END 2059276: Fri Nov 25 18:31:42 EET 2022 40: END 2059276: Fri Nov 25 18:31:42 EET 2022 60: END 2059276: Fri Nov 25 18:31:42 EET 2022 52: END 2059276: Fri Nov 25 18:31:42 EET 2022 30: END 2059276: Fri Nov 25 18:31:42 EET 2022 4: END 2059276: Fri Nov 25 18:31:42 EET 2022 56: END 2059276: Fri Nov 25 18:31:42 EET 2022 54: END 2059276: Fri Nov 25 18:31:42 EET 2022 62: END 2059276: Fri Nov 25 18:31:42 EET 2022 36: END 2059276: Fri Nov 25 18:31:42 EET 2022 38: END 2059276: Fri Nov 25 18:31:42 EET 2022 0: END 2059276: Fri Nov 25 18:31:42 EET 2022 24: END 2059276: Fri Nov 25 18:31:42 EET 2022 49: END 2059276: Fri Nov 25 18:31:42 EET 2022 47: END 2059276: Fri Nov 25 18:31:42 EET 2022 41: END 2059276: Fri Nov 25 18:31:42 EET 2022 45: END 2059276: Fri Nov 25 18:31:42 EET 2022 9: END 2059276: Fri Nov 25 18:31:42 EET 2022 43: END 2059276: Fri Nov 25 18:31:42 EET 2022 27: END 2059276: Fri Nov 25 18:31:42 EET 2022 25: END 2059276: Fri Nov 25 18:31:42 EET 2022 3: END 2059276: Fri Nov 25 18:31:42 EET 2022 7: END 2059276: Fri Nov 25 18:31:42 EET 2022 17: END 2059276: Fri Nov 25 18:31:42 EET 2022 23: END 2059276: Fri Nov 25 18:31:42 EET 2022 31: END 2059276: Fri Nov 25 18:31:42 EET 2022 19: END 2059276: Fri Nov 25 18:31:42 EET 2022 34: END 2059276: Fri Nov 25 18:31:42 EET 2022 46: END 2059276: Fri Nov 25 18:31:42 EET 2022 44: END 2059276: Fri Nov 25 18:31:42 EET 2022 6: END 2059276: Fri Nov 25 18:31:42 EET 2022 16: END 2059276: Fri Nov 25 18:31:42 EET 2022 18: END 2059276: Fri Nov 25 18:31:42 EET 2022 10: END 2059276: Fri Nov 25 18:31:42 EET 2022 20: END 2059276: Fri Nov 25 18:31:42 EET 2022 8: END 2059276: Fri Nov 25 18:31:42 EET 2022 48: END 2059276: Fri Nov 25 18:31:42 EET 2022 50: END 2059276: Fri Nov 25 18:31:42 EET 2022 42: END 2059276: Fri Nov 25 18:31:42 EET 2022 32: END 2059276: Fri Nov 25 18:31:42 EET 2022 26: END 2059276: Fri Nov 25 18:31:42 EET 2022 14: END 2059276: Fri Nov 25 18:31:42 EET 2022 15: END 2059276: Fri Nov 25 18:31:42 EET 2022 21: END 2059276: Fri Nov 25 18:31:42 EET 2022 28: END 2059276: Fri Nov 25 18:31:42 EET 2022 11: END 2059276: Fri Nov 25 18:31:42 EET 2022 22: END 2059276: Fri Nov 25 18:31:42 EET 2022 12: END 2059276: Fri Nov 25 18:31:42 EET 2022 2: END 2059276: Fri Nov 25 18:31:42 EET 2022