diff --git "a/training_log.txt" "b/training_log.txt"
new file mode 100644--- /dev/null
+++ "b/training_log.txt"
@@ -0,0 +1,4434 @@
+[2024-06-18 19:24:24,793] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[93m [WARNING] [0m async_io requires the dev libaio .so object and headers but these were not found.
+[93m [WARNING] [0m async_io: please install the libaio-dev package with apt
+[93m [WARNING] [0m If libaio is already installed (perhaps from source), try setting the CFLAGS and LDFLAGS environment variables to where it can be found.
+[93m [WARNING] [0m Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
+[93m [WARNING] [0m sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.3
+[93m [WARNING] [0m using untested triton version (2.3.1), only 1.0.0 is known to be compatible
+/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/intern_clean/lib/python3.10/site-packages/transformers/utils/generic.py:311: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
+  torch.utils._pytree._register_pytree_node(
+/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/intern_clean/lib/python3.10/site-packages/transformers/utils/generic.py:311: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
+  torch.utils._pytree._register_pytree_node(
+FlashAttention is not installed.
+Traceback (most recent call last):
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 19, in <module>
+    from internvl.model.internvl_chat import (InternVisionConfig,
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/InternVL/internvl_chat/internvl/model/internvl_chat/__init__.py", line 10, in <module>
+    from .modeling_internvl_chat import InternVLChatModel
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/InternVL/internvl_chat/internvl/model/internvl_chat/modeling_internvl_chat.py", line 11, in <module>
+    from internvl.model.phi3.modeling_phi3 import Phi3ForCausalLM
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/InternVL/internvl_chat/internvl/model/phi3/modeling_phi3.py", line 28, in <module>
+    from transformers.cache_utils import Cache, DynamicCache
+ModuleNotFoundError: No module named 'transformers.cache_utils'
+[2024-06-18 19:27:39,142] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+[2024-06-18 19:27:46,998] [INFO] [comm.py:637:init_distributed] cdb=None
+[2024-06-18 19:27:46,998] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2024-06-18 19:39:27,279] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+[2024-06-18 19:39:30,548] [INFO] [comm.py:637:init_distributed] cdb=None
+[2024-06-18 19:39:30,548] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2024-06-18 19:45:53,175] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+[2024-06-18 19:45:56,443] [INFO] [comm.py:637:init_distributed] cdb=None
+[2024-06-18 19:45:56,443] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2024-06-18 19:54:33,951] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+[2024-06-18 19:54:37,230] [INFO] [comm.py:637:init_distributed] cdb=None
+[2024-06-18 19:54:37,230] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2024-06-18 20:16:50,604] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+[2024-06-18 20:16:53,931] [INFO] [comm.py:637:init_distributed] cdb=None
+[2024-06-18 20:16:53,931] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+Traceback (most recent call last):
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 12, in <module>
+    import orjson as json
+ModuleNotFoundError: No module named 'orjson'
+[2024-06-18 20:37:56,715] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+Traceback (most recent call last):
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 19, in <module>
+    from internvl.model.internvl_chat import (InternVisionConfig,
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/InternVL/internvl_chat/internvl/model/internvl_chat/__init__.py", line 10, in <module>
+    from .modeling_internvl_chat import InternVLChatModel
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/InternVL/internvl_chat/internvl/model/internvl_chat/modeling_internvl_chat.py", line 12, in <module>
+    from peft import LoraConfig, get_peft_model
+ModuleNotFoundError: No module named 'peft'
+[2024-06-18 20:38:14,879] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+[2024-06-18 20:38:18,227] [INFO] [comm.py:637:init_distributed] cdb=None
+[2024-06-18 20:38:18,227] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2024-06-18 20:44:30,812] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+[2024-06-18 20:44:38,626] [INFO] [comm.py:637:init_distributed] cdb=None
+[2024-06-18 20:44:38,626] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[E socket.cpp:860] [c10d] The client socket has timed out after 1800s while trying to connect to (lrz-hgx-a100-004, 34229).
+Traceback (most recent call last):
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 655, in <module>
+    main()
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/InternVL/internvl_chat/internvl/train/internvl_chat_finetune.py", line 421, in main
+    init_dist(launcher=launcher, backend='nccl')
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/InternVL/internvl_chat/internvl/dist_utils.py", line 40, in init_dist
+    _init_dist_slurm(backend, **kwargs)
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/InternVL/internvl_chat/internvl/dist_utils.py", line 104, in _init_dist_slurm
+    deepspeed.init_distributed(dist_backend=backend)
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/internvl_zhw/lib/python3.10/site-packages/deepspeed/comm/comm.py", line 670, in init_distributed
+    cdb = TorchBackend(dist_backend, timeout, init_method, rank, world_size)
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/internvl_zhw/lib/python3.10/site-packages/deepspeed/comm/torch.py", line 121, in __init__
+    self.init_process_group(backend, timeout, init_method, rank, world_size)
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/internvl_zhw/lib/python3.10/site-packages/deepspeed/comm/torch.py", line 149, in init_process_group
+    torch.distributed.init_process_group(backend,
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/internvl_zhw/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 900, in init_process_group
+    store, rank, world_size = next(rendezvous_iterator)
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/internvl_zhw/lib/python3.10/site-packages/torch/distributed/rendezvous.py", line 245, in _env_rendezvous_handler
+    store = _create_c10d_store(master_addr, master_port, rank, world_size, timeout)
+  File "/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/internvl_zhw/lib/python3.10/site-packages/torch/distributed/rendezvous.py", line 176, in _create_c10d_store
+    return TCPStore(
+TimeoutError: The client socket has timed out after 1800s while trying to connect to (lrz-hgx-a100-004, 34229).
+[2024-06-18 22:01:37,577] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+[2024-06-18 22:01:40,800] [INFO] [comm.py:637:init_distributed] cdb=None
+[2024-06-18 22:01:40,800] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+[2024-06-18 22:04:58,931] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+[2024-06-18 22:05:02,275] [INFO] [comm.py:637:init_distributed] cdb=None
+[2024-06-18 22:05:02,275] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
+06/18/2024 22:05:02 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: False
+06/18/2024 22:05:02 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
+_n_gpu=1,
+adafactor=False,
+adam_beta1=0.9,
+adam_beta2=0.999,
+adam_epsilon=1e-08,
+auto_find_batch_size=False,
+bf16=True,
+bf16_full_eval=False,
+data_seed=None,
+dataloader_drop_last=False,
+dataloader_num_workers=4,
+dataloader_persistent_workers=False,
+dataloader_pin_memory=True,
+ddp_backend=None,
+ddp_broadcast_buffers=None,
+ddp_bucket_cap_mb=None,
+ddp_find_unused_parameters=None,
+ddp_timeout=1800,
+debug=[],
+deepspeed=/dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/InternVL/internvl_chat/zero_stage3_config.json,
+disable_tqdm=False,
+dispatch_batches=None,
+do_eval=False,
+do_predict=False,
+do_train=True,
+eval_accumulation_steps=None,
+eval_delay=0,
+eval_steps=None,
+evaluation_strategy=no,
+fp16=False,
+fp16_backend=auto,
+fp16_full_eval=False,
+fp16_opt_level=O1,
+fsdp=[],
+fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
+fsdp_min_num_params=0,
+fsdp_transformer_layer_cls_to_wrap=None,
+full_determinism=False,
+gradient_accumulation_steps=2,
+gradient_checkpointing=False,
+gradient_checkpointing_kwargs=None,
+greater_is_better=None,
+group_by_length=True,
+half_precision_backend=auto,
+hub_always_push=False,
+hub_model_id=None,
+hub_private_repo=False,
+hub_strategy=every_save,
+hub_token=<HUB_TOKEN>,
+ignore_data_skip=False,
+include_inputs_for_metrics=False,
+include_num_input_tokens_seen=False,
+include_tokens_per_second=False,
+jit_mode_eval=False,
+label_names=None,
+label_smoothing_factor=0.0,
+learning_rate=0.0001,
+length_column_name=length,
+load_best_model_at_end=False,
+local_rank=0,
+log_level=passive,
+log_level_replica=warning,
+log_on_each_node=True,
+logging_dir=ckpts/baseline3_7_epochs/runs/Jun18_22-05-02_lrz-hgx-a100-003,
+logging_first_step=False,
+logging_nan_inf_filter=True,
+logging_steps=1.0,
+logging_strategy=steps,
+lr_scheduler_kwargs={},
+lr_scheduler_type=cosine,
+max_grad_norm=1.0,
+max_steps=-1,
+metric_for_best_model=None,
+mp_parameters=,
+neftune_noise_alpha=None,
+no_cuda=False,
+num_train_epochs=7.0,
+optim=adamw_torch,
+optim_args=None,
+output_dir=ckpts/baseline3_7_epochs/,
+overwrite_output_dir=True,
+past_index=-1,
+per_device_eval_batch_size=8,
+per_device_train_batch_size=2,
+prediction_loss_only=False,
+push_to_hub=False,
+push_to_hub_model_id=None,
+push_to_hub_organization=None,
+push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
+ray_scope=last,
+remove_unused_columns=True,
+report_to=[],
+resume_from_checkpoint=None,
+run_name=ckpts/baseline3_7_epochs/,
+save_on_each_node=False,
+save_only_model=False,
+save_safetensors=True,
+save_steps=10000,
+save_strategy=steps,
+save_total_limit=3,
+seed=42,
+skip_memory_metrics=True,
+split_batches=False,
+tf32=None,
+torch_compile=False,
+torch_compile_backend=None,
+torch_compile_mode=None,
+torchdynamo=None,
+tpu_metrics_debug=False,
+tpu_num_cores=None,
+use_cpu=False,
+use_ipex=False,
+use_legacy_prediction_loop=False,
+use_mps_device=False,
+warmup_ratio=0.03,
+warmup_steps=0,
+weight_decay=0.05,
+)
+06/18/2024 22:05:02 - INFO - __main__ - Loading Tokenizer: /dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/HF_models/InternVL-Chat-V1-5
+[INFO|tokenization_utils_base.py:2025] 2024-06-18 22:05:02,384 >> loading file ./tokenizer.model
+[INFO|tokenization_utils_base.py:2025] 2024-06-18 22:05:02,384 >> loading file added_tokens.json
+[INFO|tokenization_utils_base.py:2025] 2024-06-18 22:05:02,384 >> loading file special_tokens_map.json
+[INFO|tokenization_utils_base.py:2025] 2024-06-18 22:05:02,384 >> loading file tokenizer_config.json
+[INFO|tokenization_utils_base.py:2025] 2024-06-18 22:05:02,384 >> loading file tokenizer.json
+[WARNING|logging.py:314] 2024-06-18 22:05:02,612 >> Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
+06/18/2024 22:05:02 - INFO - __main__ - Loading InternVLChatModel...
+[INFO|configuration_utils.py:727] 2024-06-18 22:05:02,759 >> loading configuration file /dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/HF_models/InternVL-Chat-V1-5/config.json
+[INFO|configuration_utils.py:792] 2024-06-18 22:05:02,760 >> Model config InternVLChatConfig {
+  "_commit_hash": null,
+  "_name_or_path": "OpenGVLab/InternVL-Chat-V1-5",
+  "architectures": [
+    "InternVLChatModel"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_internvl_chat.InternVLChatConfig",
+    "AutoModel": "modeling_internvl_chat.InternVLChatModel",
+    "AutoModelForCausalLM": "modeling_internvl_chat.InternVLChatModel"
+  },
+  "downsample_ratio": 0.5,
+  "dynamic_image_size": true,
+  "force_image_size": 448,
+  "llm_config": {
+    "_name_or_path": "pretrained/internlm2-chat-20b/",
+    "add_cross_attention": false,
+    "architectures": [
+      "InternLM2ForCausalLM"
+    ],
+    "attn_implementation": "flash_attention_2",
+    "auto_map": {
+      "AutoConfig": "configuration_internlm2.InternLM2Config",
+      "AutoModel": "modeling_internlm2.InternLM2ForCausalLM",
+      "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM"
+    },
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bias": false,
+    "bos_token_id": 1,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "silu",
+    "hidden_size": 6144,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_range": 0.02,
+    "intermediate_size": 16384,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 32768,
+    "min_length": 0,
+    "model_type": "internlm2",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 48,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 48,
+    "num_key_value_heads": 8,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 2,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": {
+      "factor": 3.0,
+      "type": "dynamic"
+    },
+    "rope_theta": 1000000,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": false,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": "bfloat16",
+    "torchscript": false,
+    "transformers_version": "4.37.2",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 92553
+  },
+  "max_dynamic_patch": 12,
+  "min_dynamic_patch": 1,
+  "model_type": "internvl_chat",
+  "pad2square": false,
+  "ps_version": "v2",
+  "select_layer": -1,
+  "template": "internlm2-chat",
+  "torch_dtype": "bfloat16",
+  "transformers_version": null,
+  "use_backbone_lora": 0,
+  "use_llm_lora": 0,
+  "use_thumbnail": true,
+  "vision_config": {
+    "_name_or_path": "OpenGVLab/InternViT-6B-448px-V1-5",
+    "add_cross_attention": false,
+    "architectures": [
+      "InternVisionModel"
+    ],
+    "attention_dropout": 0.0,
+    "auto_map": {
+      "AutoConfig": "configuration_intern_vit.InternVisionConfig",
+      "AutoModel": "modeling_intern_vit.InternVisionModel"
+    },
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "drop_path_rate": 0.4,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "gelu",
+    "hidden_size": 3200,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": 448,
+    "initializer_factor": 0.1,
+    "initializer_range": 1e-10,
+    "intermediate_size": 12800,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-06,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "intern_vit_6b",
+    "no_repeat_ngram_size": 0,
+    "norm_type": "rms_norm",
+    "num_attention_heads": 25,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_hidden_layers": 45,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 14,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "qk_normalization": true,
+    "qkv_bias": false,
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": "bfloat16",
+    "torchscript": false,
+    "transformers_version": "4.37.2",
+    "typical_p": 1.0,
+    "use_bfloat16": true,
+    "use_flash_attn": true
+  }
+}
+
+06/18/2024 22:05:02 - INFO - __main__ - Using flash_attention_2 for InternLM
+[INFO|modeling_utils.py:3473] 2024-06-18 22:05:02,762 >> loading weights file /dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/HF_models/InternVL-Chat-V1-5/model.safetensors.index.json
+[INFO|modeling_utils.py:1426] 2024-06-18 22:05:02,764 >> Instantiating InternVLChatModel model under default dtype torch.bfloat16.
+[INFO|modeling_utils.py:3582] 2024-06-18 22:05:02,764 >> Detected DeepSpeed ZeRO-3: activating zero.init() for this model
+[INFO|configuration_utils.py:826] 2024-06-18 22:05:02,773 >> Generate config GenerationConfig {}
+
+[INFO|configuration_utils.py:826] 2024-06-18 22:05:03,711 >> Generate config GenerationConfig {
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 2
+}
+
+[2024-06-18 22:05:04,030] [INFO] [partition_parameters.py:343:__exit__] finished initializing model - num_params = 934, num_elems = 25.51B
+Loading checkpoint shards:   0%|          | 0/11 [00:00<?, ?it/s]Loading checkpoint shards:   9%|▉         | 1/11 [00:22<03:47, 22.76s/it]Loading checkpoint shards:  18%|█▊        | 2/11 [00:45<03:23, 22.64s/it]Loading checkpoint shards:  27%|██▋       | 3/11 [00:59<02:28, 18.56s/it]Loading checkpoint shards:  36%|███▋      | 4/11 [01:04<01:34, 13.54s/it]Loading checkpoint shards:  45%|████▌     | 5/11 [01:10<01:04, 10.82s/it]Loading checkpoint shards:  55%|█████▍    | 6/11 [01:16<00:45,  9.16s/it]Loading checkpoint shards:  64%|██████▎   | 7/11 [01:22<00:32,  8.10s/it]Loading checkpoint shards:  73%|███████▎  | 8/11 [01:29<00:22,  7.57s/it]Loading checkpoint shards:  82%|████████▏ | 9/11 [01:35<00:14,  7.06s/it]Loading checkpoint shards:  91%|█████████ | 10/11 [01:41<00:06,  6.71s/it]Loading checkpoint shards: 100%|██████████| 11/11 [01:43<00:00,  5.51s/it]Loading checkpoint shards: 100%|██████████| 11/11 [01:43<00:00,  9.44s/it]
+[INFO|modeling_utils.py:4350] 2024-06-18 22:06:48,106 >> All model checkpoint weights were used when initializing InternVLChatModel.
+
+[INFO|modeling_utils.py:4358] 2024-06-18 22:06:48,106 >> All the weights of InternVLChatModel were initialized from the model checkpoint at /dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/HF_models/InternVL-Chat-V1-5.
+If your task is similar to the task the model of the checkpoint was trained on, you can already use InternVLChatModel for predictions without further training.
+[INFO|configuration_utils.py:779] 2024-06-18 22:06:48,117 >> loading configuration file /dss/dssmcmlfs01/pn34sa/pn34sa-dss-0000/haowei/HF_models/InternVL-Chat-V1-5/generation_config.json
+[INFO|configuration_utils.py:826] 2024-06-18 22:06:48,117 >> Generate config GenerationConfig {}
+
+06/18/2024 22:06:48 - INFO - __main__ - Finished
+06/18/2024 22:06:48 - INFO - __main__ - model.config.force_image_size: 448
+06/18/2024 22:06:48 - INFO - __main__ - data_args.force_image_size: 448
+06/18/2024 22:06:48 - INFO - __main__ - model.config.vision_config.image_size: 448
+06/18/2024 22:06:48 - INFO - __main__ - [Dataset] num_image_token: 256
+06/18/2024 22:06:48 - INFO - __main__ - [Dataset] dynamic_image_size: True
+06/18/2024 22:06:48 - INFO - __main__ - [Dataset] use_thumbnail: True
+06/18/2024 22:06:48 - INFO - __main__ - [Dataset] min_dynamic_patch: 1, max_dynamic_patch: 6
+06/18/2024 22:06:48 - INFO - __main__ - Formatting inputs...Skip in lazy mode
+06/18/2024 22:06:48 - INFO - __main__ - Add dataset:decovqa_0 with length: 400
+trainable params: 11,010,048 || all params: 19,872,270,336 || trainable%: 0.0554
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.0.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.0.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.1.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.1.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.2.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.2.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.3.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.3.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.4.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.4.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.5.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.5.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.6.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.6.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.7.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.7.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.8.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.8.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.9.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.9.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.10.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.10.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.11.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.11.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.12.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.12.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.13.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.13.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.14.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.14.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.15.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.15.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.16.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.16.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.17.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.17.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.18.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.18.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.19.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.19.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.20.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.20.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.21.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.21.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.22.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.22.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.23.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.23.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.24.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.24.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.25.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.25.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.26.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.26.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.27.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.27.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.28.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.28.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.29.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.29.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.30.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.30.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.31.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.31.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.32.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.32.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.33.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.33.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.34.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.34.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.35.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.35.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.36.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.36.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.37.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.37.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.38.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.38.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.39.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.39.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.40.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.40.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.41.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.41.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.42.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.42.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.43.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.43.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.44.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.44.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.45.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.45.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.46.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.46.attention.wqkv.lora_B.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.47.attention.wqkv.lora_A.default.weight
+06/18/2024 22:06:48 - INFO - __main__ - language_model.base_model.model.model.layers.47.attention.wqkv.lora_B.default.weight
+[INFO|trainer.py:571] 2024-06-18 22:06:48,872 >> Using auto half precision backend
+[2024-06-18 22:06:48,985] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.13.5, git-hash=unknown, git-branch=unknown
+[2024-06-18 22:06:49,017] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
+Using /dss/dsshome1/00/di93zun/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
+Detected CUDA files, patching ldflags
+Emitting ninja build file /dss/dsshome1/00/di93zun/.cache/torch_extensions/py310_cu118/fused_adam/build.ninja...
+Building extension module fused_adam...
+Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
+ninja: no work to do.
+Loading extension module fused_adam...
+Time to load fused_adam op: 0.1629350185394287 seconds
+[2024-06-18 22:06:49,190] [INFO] [logging.py:96:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer
+[2024-06-18 22:06:49,190] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer
+[2024-06-18 22:06:49,229] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam
+[2024-06-18 22:06:49,229] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type=<class 'deepspeed.ops.adam.fused_adam.FusedAdam'>
+[2024-06-18 22:06:49,229] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False
+[2024-06-18 22:06:49,229] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.bfloat16 ZeRO stage 3 optimizer
+[2024-06-18 22:06:49,338] [INFO] [utils.py:800:see_memory_usage] Stage 3 initialize beginning
+[2024-06-18 22:06:49,338] [INFO] [utils.py:801:see_memory_usage] MA 48.29 GB         Max_MA 50.39 GB         CA 49.98 GB         Max_CA 52 GB 
+[2024-06-18 22:06:49,339] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 35.61 GB, percent = 3.5%
+[2024-06-18 22:06:49,345] [INFO] [stage3.py:130:__init__] Reduce bucket size 1000000000
+[2024-06-18 22:06:49,345] [INFO] [stage3.py:131:__init__] Prefetch bucket size 1000000000
+[2024-06-18 22:06:49,438] [INFO] [utils.py:800:see_memory_usage] DeepSpeedZeRoOffload initialize [begin]
+[2024-06-18 22:06:49,439] [INFO] [utils.py:801:see_memory_usage] MA 48.29 GB         Max_MA 48.29 GB         CA 49.98 GB         Max_CA 50 GB 
+[2024-06-18 22:06:49,439] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 35.61 GB, percent = 3.5%
+Parameter Offload: Total persistent parameters: 18539904 in 606 params
+[2024-06-18 22:06:49,639] [INFO] [utils.py:800:see_memory_usage] DeepSpeedZeRoOffload initialize [end]
+[2024-06-18 22:06:49,639] [INFO] [utils.py:801:see_memory_usage] MA 48.29 GB         Max_MA 48.3 GB         CA 49.98 GB         Max_CA 50 GB 
+[2024-06-18 22:06:49,640] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 35.62 GB, percent = 3.5%
+[2024-06-18 22:06:49,741] [INFO] [utils.py:800:see_memory_usage] Before creating fp16 partitions
+[2024-06-18 22:06:49,742] [INFO] [utils.py:801:see_memory_usage] MA 48.29 GB         Max_MA 48.29 GB         CA 49.98 GB         Max_CA 50 GB 
+[2024-06-18 22:06:49,742] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 35.62 GB, percent = 3.5%
+[2024-06-18 22:06:49,965] [INFO] [utils.py:800:see_memory_usage] After creating fp16 partitions: 1
+[2024-06-18 22:06:49,966] [INFO] [utils.py:801:see_memory_usage] MA 48.29 GB         Max_MA 48.29 GB         CA 49.96 GB         Max_CA 50 GB 
+[2024-06-18 22:06:49,966] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 35.61 GB, percent = 3.5%
+[2024-06-18 22:06:50,067] [INFO] [utils.py:800:see_memory_usage] Before creating fp32 partitions
+[2024-06-18 22:06:50,068] [INFO] [utils.py:801:see_memory_usage] MA 48.29 GB         Max_MA 48.29 GB         CA 49.96 GB         Max_CA 50 GB 
+[2024-06-18 22:06:50,068] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 35.61 GB, percent = 3.5%
+[2024-06-18 22:06:50,169] [INFO] [utils.py:800:see_memory_usage] After creating fp32 partitions
+[2024-06-18 22:06:50,169] [INFO] [utils.py:801:see_memory_usage] MA 48.34 GB         Max_MA 48.36 GB         CA 49.96 GB         Max_CA 50 GB 
+[2024-06-18 22:06:50,170] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 35.61 GB, percent = 3.5%
+[2024-06-18 22:06:50,271] [INFO] [utils.py:800:see_memory_usage] Before initializing optimizer states
+[2024-06-18 22:06:50,271] [INFO] [utils.py:801:see_memory_usage] MA 48.34 GB         Max_MA 48.34 GB         CA 49.96 GB         Max_CA 50 GB 
+[2024-06-18 22:06:50,271] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 35.61 GB, percent = 3.5%
+[2024-06-18 22:06:50,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | init_optimizer_state: 0.08
+[2024-06-18 22:06:50,373] [INFO] [utils.py:800:see_memory_usage] After initializing optimizer states
+[2024-06-18 22:06:50,374] [INFO] [utils.py:801:see_memory_usage] MA 48.34 GB         Max_MA 48.38 GB         CA 49.96 GB         Max_CA 50 GB 
+[2024-06-18 22:06:50,374] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 35.61 GB, percent = 3.5%
+[2024-06-18 22:06:50,374] [INFO] [stage3.py:486:_setup_for_real_optimizer] optimizer state initialized
+[2024-06-18 22:06:50,503] [INFO] [utils.py:800:see_memory_usage] After initializing ZeRO optimizer
+[2024-06-18 22:06:50,504] [INFO] [utils.py:801:see_memory_usage] MA 50.22 GB         Max_MA 50.22 GB         CA 51.82 GB         Max_CA 52 GB 
+[2024-06-18 22:06:50,504] [INFO] [utils.py:808:see_memory_usage] CPU Virtual Memory:  used = 35.61 GB, percent = 3.5%
+[2024-06-18 22:06:50,504] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = adamw
+[2024-06-18 22:06:50,504] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client callable to create LR scheduler
+[2024-06-18 22:06:50,504] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = <torch.optim.lr_scheduler.LambdaLR object at 0x7f54dc69d0c0>
+[2024-06-18 22:06:50,504] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.0], mom=[[0.9, 0.999]]
+[2024-06-18 22:06:50,508] [INFO] [config.py:996:print] DeepSpeedEngine configuration:
+[2024-06-18 22:06:50,508] [INFO] [config.py:1000:print]   activation_checkpointing_config  {
+    "partition_activations": false, 
+    "contiguous_memory_optimization": false, 
+    "cpu_checkpointing": false, 
+    "number_checkpoints": null, 
+    "synchronize_checkpoint_boundary": false, 
+    "profile": false
+}
+[2024-06-18 22:06:50,508] [INFO] [config.py:1000:print]   aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
+[2024-06-18 22:06:50,508] [INFO] [config.py:1000:print]   amp_enabled .................. False
+[2024-06-18 22:06:50,508] [INFO] [config.py:1000:print]   amp_params ................... False
+[2024-06-18 22:06:50,508] [INFO] [config.py:1000:print]   autotuning_config ............ {
+    "enabled": false, 
+    "start_step": null, 
+    "end_step": null, 
+    "metric_path": null, 
+    "arg_mappings": null, 
+    "metric": "throughput", 
+    "model_info": null, 
+    "results_dir": "autotuning_results", 
+    "exps_dir": "autotuning_exps", 
+    "overwrite": true, 
+    "fast": true, 
+    "start_profile_step": 3, 
+    "end_profile_step": 5, 
+    "tuner_type": "gridsearch", 
+    "tuner_early_stopping": 5, 
+    "tuner_num_trials": 50, 
+    "model_info_path": null, 
+    "mp_size": 1, 
+    "max_train_batch_size": null, 
+    "min_train_batch_size": 1, 
+    "max_train_micro_batch_size_per_gpu": 1.024000e+03, 
+    "min_train_micro_batch_size_per_gpu": 1, 
+    "num_tuning_micro_batch_sizes": 3
+}
+[2024-06-18 22:06:50,508] [INFO] [config.py:1000:print]   bfloat16_enabled ............. True
+[2024-06-18 22:06:50,508] [INFO] [config.py:1000:print]   bfloat16_immediate_grad_update  False
+[2024-06-18 22:06:50,508] [INFO] [config.py:1000:print]   checkpoint_parallel_write_pipeline  False
+[2024-06-18 22:06:50,508] [INFO] [config.py:1000:print]   checkpoint_tag_validation_enabled  True
+[2024-06-18 22:06:50,508] [INFO] [config.py:1000:print]   checkpoint_tag_validation_fail  False
+[2024-06-18 22:06:50,508] [INFO] [config.py:1000:print]   comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7f5542af1090>
+[2024-06-18 22:06:50,508] [INFO] [config.py:1000:print]   communication_data_type ...... None
+[2024-06-18 22:06:50,508] [INFO] [config.py:1000:print]   compile_config ............... enabled=False backend='inductor' kwargs={}
+[2024-06-18 22:06:50,508] [INFO] [config.py:1000:print]   compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
+[2024-06-18 22:06:50,508] [INFO] [config.py:1000:print]   curriculum_enabled_legacy .... False
+[2024-06-18 22:06:50,508] [INFO] [config.py:1000:print]   curriculum_params_legacy ..... False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   data_efficiency_enabled ...... False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   dataloader_drop_last ......... False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   disable_allgather ............ False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   dump_state ................... False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   dynamic_loss_scale_args ...... None
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   eigenvalue_enabled ........... False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   eigenvalue_gas_boundary_resolution  1
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   eigenvalue_layer_name ........ bert.encoder.layer
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   eigenvalue_layer_num ......... 0
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   eigenvalue_max_iter .......... 100
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   eigenvalue_stability ......... 1e-06
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   eigenvalue_tol ............... 0.01
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   eigenvalue_verbose ........... False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   elasticity_enabled ........... False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   flops_profiler_config ........ {
+    "enabled": false, 
+    "recompute_fwd_factor": 0.0, 
+    "profile_step": 1, 
+    "module_depth": -1, 
+    "top_modules": 1, 
+    "detailed": true, 
+    "output_file": null
+}
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   fp16_auto_cast ............... None
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   fp16_enabled ................. False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   fp16_master_weights_and_gradients  False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   global_rank .................. 0
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   grad_accum_dtype ............. None
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   gradient_accumulation_steps .. 2
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   gradient_clipping ............ 1.0
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   gradient_predivide_factor .... 1.0
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   graph_harvesting ............. False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   initial_dynamic_scale ........ 1
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   load_universal_checkpoint .... False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   loss_scale ................... 1.0
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   memory_breakdown ............. False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   mics_hierarchial_params_gather  False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   mics_shard_size .............. -1
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   nebula_config ................ {
+    "enabled": false, 
+    "persistent_storage_path": null, 
+    "persistent_time_interval": 100, 
+    "num_of_version_in_retention": 2, 
+    "enable_nebula_load": true, 
+    "load_path": null
+}
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   optimizer_legacy_fusion ...... False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   optimizer_name ............... adamw
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   optimizer_params ............. {'lr': 0.0001, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.05}
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True}
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   pld_enabled .................. False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   pld_params ................... False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   prescale_gradients ........... False
+[2024-06-18 22:06:50,509] [INFO] [config.py:1000:print]   scheduler_name ............... None
+[2024-06-18 22:06:50,510] [INFO] [config.py:1000:print]   scheduler_params ............. None
+[2024-06-18 22:06:50,510] [INFO] [config.py:1000:print]   seq_parallel_communication_data_type  torch.float32
+[2024-06-18 22:06:50,510] [INFO] [config.py:1000:print]   sparse_attention ............. None
+[2024-06-18 22:06:50,510] [INFO] [config.py:1000:print]   sparse_gradients_enabled ..... False
+[2024-06-18 22:06:50,510] [INFO] [config.py:1000:print]   steps_per_print .............. inf
+[2024-06-18 22:06:50,510] [INFO] [config.py:1000:print]   train_batch_size ............. 4
+[2024-06-18 22:06:50,510] [INFO] [config.py:1000:print]   train_micro_batch_size_per_gpu  2
+[2024-06-18 22:06:50,510] [INFO] [config.py:1000:print]   use_data_before_expert_parallel_  False
+[2024-06-18 22:06:50,510] [INFO] [config.py:1000:print]   use_node_local_storage ....... False
+[2024-06-18 22:06:50,510] [INFO] [config.py:1000:print]   wall_clock_breakdown ......... True
+[2024-06-18 22:06:50,510] [INFO] [config.py:1000:print]   weight_quantization_config ... None
+[2024-06-18 22:06:50,510] [INFO] [config.py:1000:print]   world_size ................... 1
+[2024-06-18 22:06:50,510] [INFO] [config.py:1000:print]   zero_allow_untested_optimizer  False
+[2024-06-18 22:06:50,510] [INFO] [config.py:1000:print]   zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=1000000000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=None sub_group_size=1000000000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=1000000000 param_persistence_threshold=10000000 model_persistence_threshold=sys.maxsize max_live_parameters=1000000000 max_reuse_distance=1000000000 gather_16bit_weights_on_model_save=True stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=True pipeline_loading_checkpoint=False override_module_apply=True
+[2024-06-18 22:06:50,510] [INFO] [config.py:1000:print]   zero_enabled ................. True
+[2024-06-18 22:06:50,510] [INFO] [config.py:1000:print]   zero_force_ds_cpu_optimizer .. True
+[2024-06-18 22:06:50,510] [INFO] [config.py:1000:print]   zero_optimization_stage ...... 3
+[2024-06-18 22:06:50,510] [INFO] [config.py:986:print_user_config]   json = {
+    "zero_optimization": {
+        "stage": 3, 
+        "overlap_comm": true, 
+        "contiguous_gradients": true, 
+        "sub_group_size": 1.000000e+09, 
+        "reduce_bucket_size": 1.000000e+09, 
+        "stage3_prefetch_bucket_size": 1.000000e+09, 
+        "stage3_param_persistence_threshold": 1.000000e+07, 
+        "stage3_max_live_parameters": 1.000000e+09, 
+        "stage3_max_reuse_distance": 1.000000e+09, 
+        "stage3_gather_16bit_weights_on_model_save": true
+    }, 
+    "fp16": {
+        "enabled": false, 
+        "auto_cast": true, 
+        "loss_scale": 0, 
+        "initial_scale_power": 32, 
+        "loss_scale_window": 1000, 
+        "hysteresis": 2, 
+        "min_loss_scale": 1
+    }, 
+    "bf16": {
+        "enabled": true
+    }, 
+    "optimizer": {
+        "type": "AdamW", 
+        "params": {
+            "lr": 0.0001, 
+            "betas": [0.9, 0.999], 
+            "eps": 1e-08, 
+            "weight_decay": 0.05
+        }
+    }, 
+    "gradient_accumulation_steps": 2, 
+    "gradient_clipping": 1.0, 
+    "steps_per_print": inf, 
+    "train_batch_size": 4, 
+    "train_micro_batch_size_per_gpu": 2, 
+    "wall_clock_breakdown": true
+}
+[INFO|trainer.py:1721] 2024-06-18 22:06:50,510 >> ***** Running training *****
+[INFO|trainer.py:1722] 2024-06-18 22:06:50,510 >>   Num examples = 400
+[INFO|trainer.py:1723] 2024-06-18 22:06:50,510 >>   Num Epochs = 7
+[INFO|trainer.py:1724] 2024-06-18 22:06:50,510 >>   Instantaneous batch size per device = 2
+[INFO|trainer.py:1727] 2024-06-18 22:06:50,510 >>   Total train batch size (w. parallel, distributed & accumulation) = 4
+[INFO|trainer.py:1728] 2024-06-18 22:06:50,510 >>   Gradient Accumulation steps = 2
+[INFO|trainer.py:1729] 2024-06-18 22:06:50,510 >>   Total optimization steps = 700
+[INFO|trainer.py:1730] 2024-06-18 22:06:50,515 >>   Number of trainable parameters = 11,010,048
+  0%|          | 0/700 [00:00<?, ?it/s][2024-06-18 22:06:53,288] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:06:59,028] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:07:04,785] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:07:10,524] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:07:19,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 3513.35 | bwd_microstep: 2111.15 | bwd_inner_microstep: 2047.92 | bwd_allreduce_microstep: 63.05 | step_microstep: 0.05
+[2024-06-18 22:07:23,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 3.04
+[2024-06-18 22:07:23,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1955.52 | bwd_microstep: 1906.07 | bwd_inner_microstep: 1900.62 | bwd_allreduce_microstep: 5.34 | step_microstep: 120.48
+[2024-06-18 22:07:23,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 5468.86 | bwd: 4017.23 | bwd_inner: 3948.67 | bwd_allreduce: 68.39 | step: 120.54
+  0%|          | 1/700 [00:33<6:29:33, 33.44s/it]                                                 {'loss': 1.4031, 'learning_rate': 4.7619047619047615e-06, 'epoch': 0.01}
+  0%|          | 1/700 [00:33<6:29:33, 33.44s/it][2024-06-18 22:07:27,416] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1814.25 | bwd_microstep: 1632.74 | bwd_inner_microstep: 1627.96 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.04
+[2024-06-18 22:07:31,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:07:31,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.28 | bwd_microstep: 1918.50 | bwd_inner_microstep: 1913.06 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.28
+[2024-06-18 22:07:31,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3776.50 | bwd: 3551.27 | bwd_inner: 3541.07 | bwd_allreduce: 10.08 | step: 61.33
+  0%|          | 2/700 [00:40<3:31:01, 18.14s/it]                                                 {'loss': 1.0378, 'learning_rate': 9.523809523809523e-06, 'epoch': 0.02}
+  0%|          | 2/700 [00:40<3:31:01, 18.14s/it][2024-06-18 22:07:35,230] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1950.75 | bwd_microstep: 1881.22 | bwd_inner_microstep: 1876.10 | bwd_allreduce_microstep: 4.99 | step_microstep: 0.08
+[2024-06-18 22:07:39,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 22:07:39,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2042.21 | bwd_microstep: 2094.85 | bwd_inner_microstep: 2089.39 | bwd_allreduce_microstep: 5.29 | step_microstep: 61.68
+[2024-06-18 22:07:39,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3992.92 | bwd: 3976.09 | bwd_inner: 3965.64 | bwd_allreduce: 10.26 | step: 61.77
+  0%|          | 3/700 [00:48<2:37:18, 13.54s/it]                                                 {'loss': 1.419, 'learning_rate': 1.4285714285714285e-05, 'epoch': 0.03}
+  0%|          | 3/700 [00:48<2:37:18, 13.54s/it][2024-06-18 22:07:43,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1879.57 | bwd_microstep: 1733.54 | bwd_inner_microstep: 1728.59 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.08
+[2024-06-18 22:07:46,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.92
+[2024-06-18 22:07:46,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1874.53 | bwd_microstep: 1726.10 | bwd_inner_microstep: 1720.37 | bwd_allreduce_microstep: 5.65 | step_microstep: 62.77
+[2024-06-18 22:07:46,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3754.07 | bwd: 3459.66 | bwd_inner: 3449.05 | bwd_allreduce: 10.42 | step: 62.86
+  1%|          | 4/700 [00:56<2:08:34, 11.08s/it]                                                 {'loss': 0.8751, 'learning_rate': 1.9047619047619046e-05, 'epoch': 0.04}
+  1%|          | 4/700 [00:56<2:08:34, 11.08s/it][2024-06-18 22:07:50,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1872.16 | bwd_microstep: 1718.85 | bwd_inner_microstep: 1714.02 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 22:07:54,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:07:54,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.20 | bwd_microstep: 1983.88 | bwd_inner_microstep: 1978.42 | bwd_allreduce_microstep: 5.31 | step_microstep: 60.92
+[2024-06-18 22:07:54,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3867.33 | bwd: 3702.76 | bwd_inner: 3692.54 | bwd_allreduce: 10.04 | step: 61.00
+  1%|          | 5/700 [01:03<1:54:07,  9.85s/it]                                                 {'loss': 0.8246, 'learning_rate': 2.380952380952381e-05, 'epoch': 0.05}
+  1%|          | 5/700 [01:03<1:54:07,  9.85s/it][2024-06-18 22:07:57,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1520.85 | bwd_microstep: 1876.11 | bwd_inner_microstep: 1870.94 | bwd_allreduce_microstep: 5.08 | step_microstep: 0.09
+[2024-06-18 22:08:01,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.91
+[2024-06-18 22:08:01,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1957.05 | bwd_microstep: 1896.44 | bwd_inner_microstep: 1890.92 | bwd_allreduce_microstep: 5.34 | step_microstep: 61.70
+[2024-06-18 22:08:01,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3477.88 | bwd: 3772.57 | bwd_inner: 3761.97 | bwd_allreduce: 10.39 | step: 61.80
+  1%|          | 6/700 [01:11<1:44:08,  9.00s/it]                                                 {'loss': 1.5731, 'learning_rate': 2.857142857142857e-05, 'epoch': 0.06}
+  1%|          | 6/700 [01:11<1:44:08,  9.00s/it][2024-06-18 22:08:05,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1957.83 | bwd_microstep: 1903.58 | bwd_inner_microstep: 1898.45 | bwd_allreduce_microstep: 4.99 | step_microstep: 0.14
+[2024-06-18 22:08:09,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:08:09,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1843.43 | bwd_microstep: 1960.11 | bwd_inner_microstep: 1954.66 | bwd_allreduce_microstep: 5.38 | step_microstep: 61.30
+[2024-06-18 22:08:09,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3801.22 | bwd: 3863.71 | bwd_inner: 3853.18 | bwd_allreduce: 10.35 | step: 61.45
+  1%|          | 7/700 [01:19<1:39:19,  8.60s/it]                                                 {'loss': 1.4364, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.07}
+  1%|          | 7/700 [01:19<1:39:19,  8.60s/it][2024-06-18 22:08:12,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1499.01 | bwd_microstep: 1839.44 | bwd_inner_microstep: 1834.29 | bwd_allreduce_microstep: 5.06 | step_microstep: 0.09
+[2024-06-18 22:08:16,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:08:16,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1990.92 | bwd_microstep: 1973.23 | bwd_inner_microstep: 1967.84 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.09
+[2024-06-18 22:08:16,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3489.90 | bwd: 3812.70 | bwd_inner: 3802.19 | bwd_allreduce: 10.37 | step: 61.19
+  1%|          | 8/700 [01:26<1:34:47,  8.22s/it]                                                 {'loss': 1.3878, 'learning_rate': 3.809523809523809e-05, 'epoch': 0.08}
+  1%|          | 8/700 [01:26<1:34:47,  8.22s/it][2024-06-18 22:08:20,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1901.07 | bwd_microstep: 1803.29 | bwd_inner_microstep: 1798.23 | bwd_allreduce_microstep: 4.96 | step_microstep: 0.09
+[2024-06-18 22:08:24,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 22:08:24,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1959.15 | bwd_microstep: 1904.95 | bwd_inner_microstep: 1899.57 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.73
+[2024-06-18 22:08:24,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3860.18 | bwd: 3708.26 | bwd_inner: 3697.88 | bwd_allreduce: 10.21 | step: 61.83
+  1%|▏         | 9/700 [01:34<1:32:40,  8.05s/it]                                                 {'loss': 1.265, 'learning_rate': 4.2857142857142856e-05, 'epoch': 0.09}
+  1%|▏         | 9/700 [01:34<1:32:40,  8.05s/it][2024-06-18 22:08:28,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.29 | bwd_microstep: 1927.41 | bwd_inner_microstep: 1922.27 | bwd_allreduce_microstep: 5.01 | step_microstep: 0.14
+[2024-06-18 22:08:32,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:08:32,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1959.60 | bwd_microstep: 1897.89 | bwd_inner_microstep: 1892.38 | bwd_allreduce_microstep: 5.42 | step_microstep: 62.18
+[2024-06-18 22:08:32,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3926.86 | bwd: 3825.32 | bwd_inner: 3814.75 | bwd_allreduce: 10.43 | step: 62.33
+  1%|▏         | 10/700 [01:41<1:31:51,  7.99s/it]                                                  {'loss': 1.3973, 'learning_rate': 4.761904761904762e-05, 'epoch': 0.1}
+  1%|▏         | 10/700 [01:41<1:31:51,  7.99s/it][2024-06-18 22:08:36,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1987.10 | bwd_microstep: 1960.27 | bwd_inner_microstep: 1955.34 | bwd_allreduce_microstep: 4.85 | step_microstep: 0.07
+[2024-06-18 22:08:40,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:08:40,376] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1956.36 | bwd_microstep: 1887.26 | bwd_inner_microstep: 1882.00 | bwd_allreduce_microstep: 5.20 | step_microstep: 61.34
+[2024-06-18 22:08:40,377] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3943.43 | bwd: 3847.56 | bwd_inner: 3837.37 | bwd_allreduce: 10.06 | step: 61.42
+  2%|▏         | 11/700 [01:49<1:31:23,  7.96s/it]                                                  {'loss': 1.6149, 'learning_rate': 5.2380952380952384e-05, 'epoch': 0.11}
+  2%|▏         | 11/700 [01:49<1:31:23,  7.96s/it][2024-06-18 22:08:43,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1641.18 | bwd_microstep: 1826.97 | bwd_inner_microstep: 1821.92 | bwd_allreduce_microstep: 4.97 | step_microstep: 0.09
+[2024-06-18 22:08:47,902] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.91
+[2024-06-18 22:08:47,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1988.84 | bwd_microstep: 1966.10 | bwd_inner_microstep: 1960.58 | bwd_allreduce_microstep: 5.37 | step_microstep: 61.67
+[2024-06-18 22:08:47,903] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3629.99 | bwd: 3793.10 | bwd_inner: 3782.57 | bwd_allreduce: 10.32 | step: 61.77
+  2%|▏         | 12/700 [01:57<1:29:45,  7.83s/it]                                                  {'loss': 1.6563, 'learning_rate': 5.714285714285714e-05, 'epoch': 0.12}
+  2%|▏         | 12/700 [01:57<1:29:45,  7.83s/it][2024-06-18 22:08:49,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 832.95 | bwd_microstep: 1069.91 | bwd_inner_microstep: 1065.09 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.07
+[2024-06-18 22:08:53,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:08:53,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1836.71 | bwd_microstep: 1941.32 | bwd_inner_microstep: 1935.83 | bwd_allreduce_microstep: 5.41 | step_microstep: 61.42
+[2024-06-18 22:08:53,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2669.63 | bwd: 3011.26 | bwd_inner: 3000.96 | bwd_allreduce: 10.16 | step: 61.50
+  2%|▏         | 13/700 [02:03<1:22:31,  7.21s/it]                                                  {'loss': 1.7338, 'learning_rate': 6.19047619047619e-05, 'epoch': 0.13}
+  2%|▏         | 13/700 [02:03<1:22:31,  7.21s/it][2024-06-18 22:08:56,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1542.98 | bwd_microstep: 1642.75 | bwd_inner_microstep: 1637.86 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 22:09:00,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:09:00,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.77 | bwd_microstep: 1934.57 | bwd_inner_microstep: 1929.10 | bwd_allreduce_microstep: 5.37 | step_microstep: 61.61
+[2024-06-18 22:09:00,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3512.72 | bwd: 3577.34 | bwd_inner: 3567.03 | bwd_allreduce: 10.10 | step: 61.69
+  2%|▏         | 14/700 [02:10<1:22:20,  7.20s/it]                                                  {'loss': 1.1037, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.14}
+  2%|▏         | 14/700 [02:10<1:22:20,  7.20s/it][2024-06-18 22:09:03,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1415.34 | bwd_microstep: 1653.73 | bwd_inner_microstep: 1648.76 | bwd_allreduce_microstep: 4.89 | step_microstep: 0.07
+[2024-06-18 22:09:07,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:09:07,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1726.75 | bwd_microstep: 1855.65 | bwd_inner_microstep: 1850.25 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.05
+[2024-06-18 22:09:07,627] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3142.06 | bwd: 3509.41 | bwd_inner: 3499.07 | bwd_allreduce: 10.20 | step: 61.13
+  2%|▏         | 15/700 [02:17<1:20:40,  7.07s/it]                                                  {'loss': 0.7607, 'learning_rate': 7.142857142857143e-05, 'epoch': 0.15}
+  2%|▏         | 15/700 [02:17<1:20:40,  7.07s/it][2024-06-18 22:09:11,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1991.65 | bwd_microstep: 1970.55 | bwd_inner_microstep: 1965.54 | bwd_allreduce_microstep: 4.97 | step_microstep: 0.09
+[2024-06-18 22:09:15,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:09:15,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1989.93 | bwd_microstep: 1968.64 | bwd_inner_microstep: 1963.33 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.53
+[2024-06-18 22:09:15,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3981.54 | bwd: 3939.21 | bwd_inner: 3928.89 | bwd_allreduce: 10.21 | step: 61.63
+  2%|▏         | 16/700 [02:25<1:23:50,  7.35s/it]                                                  {'loss': 1.5945, 'learning_rate': 7.619047619047618e-05, 'epoch': 0.16}
+  2%|▏         | 16/700 [02:25<1:23:50,  7.35s/it][2024-06-18 22:09:18,664] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1385.13 | bwd_microstep: 1607.68 | bwd_inner_microstep: 1602.91 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 22:09:22,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:09:22,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.53 | bwd_microstep: 1922.04 | bwd_inner_microstep: 1916.56 | bwd_allreduce_microstep: 5.33 | step_microstep: 61.29
+[2024-06-18 22:09:22,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3351.63 | bwd: 3529.74 | bwd_inner: 3519.55 | bwd_allreduce: 10.00 | step: 61.37
+  2%|▏         | 17/700 [02:32<1:22:26,  7.24s/it]                                                  {'loss': 0.9, 'learning_rate': 8.095238095238096e-05, 'epoch': 0.17}
+  2%|▏         | 17/700 [02:32<1:22:26,  7.24s/it][2024-06-18 22:09:26,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1853.69 | bwd_microstep: 1691.97 | bwd_inner_microstep: 1687.26 | bwd_allreduce_microstep: 4.67 | step_microstep: 0.07
+[2024-06-18 22:09:29,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:09:29,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1903.39 | bwd_microstep: 1802.27 | bwd_inner_microstep: 1796.80 | bwd_allreduce_microstep: 5.40 | step_microstep: 61.69
+[2024-06-18 22:09:29,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3757.04 | bwd: 3494.26 | bwd_inner: 3484.07 | bwd_allreduce: 10.08 | step: 61.77
+  3%|▎         | 18/700 [02:39<1:22:42,  7.28s/it]                                                  {'loss': 0.2399, 'learning_rate': 8.571428571428571e-05, 'epoch': 0.18}
+  3%|▎         | 18/700 [02:39<1:22:42,  7.28s/it][2024-06-18 22:09:33,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.53 | bwd_microstep: 1912.26 | bwd_inner_microstep: 1907.44 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 22:09:37,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:09:37,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1905.34 | bwd_microstep: 1808.04 | bwd_inner_microstep: 1802.52 | bwd_allreduce_microstep: 5.38 | step_microstep: 61.60
+[2024-06-18 22:09:37,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3867.84 | bwd: 3720.31 | bwd_inner: 3710.03 | bwd_allreduce: 10.12 | step: 61.68
+  3%|▎         | 19/700 [02:47<1:23:59,  7.40s/it]                                                  {'loss': 0.5496, 'learning_rate': 9.047619047619048e-05, 'epoch': 0.19}
+  3%|▎         | 19/700 [02:47<1:23:59,  7.40s/it][2024-06-18 22:09:41,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1983.29 | bwd_microstep: 1952.34 | bwd_inner_microstep: 1947.63 | bwd_allreduce_microstep: 4.65 | step_microstep: 0.07
+[2024-06-18 22:09:45,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:09:45,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.46 | bwd_microstep: 1904.22 | bwd_inner_microstep: 1898.78 | bwd_allreduce_microstep: 5.29 | step_microstep: 60.96
+[2024-06-18 22:09:45,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3945.71 | bwd: 3856.58 | bwd_inner: 3846.46 | bwd_allreduce: 9.92 | step: 61.03
+  3%|▎         | 20/700 [02:55<1:25:34,  7.55s/it]                                                  {'loss': 1.388, 'learning_rate': 9.523809523809524e-05, 'epoch': 0.2}
+  3%|▎         | 20/700 [02:55<1:25:34,  7.55s/it][2024-06-18 22:09:48,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1488.52 | bwd_microstep: 1803.96 | bwd_inner_microstep: 1799.18 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:09:52,677] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.92
+[2024-06-18 22:09:52,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1902.96 | bwd_microstep: 1802.71 | bwd_inner_microstep: 1797.10 | bwd_allreduce_microstep: 5.52 | step_microstep: 63.12
+[2024-06-18 22:09:52,678] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3391.45 | bwd: 3606.70 | bwd_inner: 3596.32 | bwd_allreduce: 10.23 | step: 63.20
+  3%|▎         | 21/700 [03:02<1:23:55,  7.42s/it]                                                  {'loss': 0.7568, 'learning_rate': 0.0001, 'epoch': 0.21}
+  3%|▎         | 21/700 [03:02<1:23:55,  7.42s/it][2024-06-18 22:09:56,542] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1955.38 | bwd_microstep: 1886.59 | bwd_inner_microstep: 1880.93 | bwd_allreduce_microstep: 5.53 | step_microstep: 0.09
+[2024-06-18 22:10:00,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:10:00,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.69 | bwd_microstep: 1973.74 | bwd_inner_microstep: 1968.29 | bwd_allreduce_microstep: 5.36 | step_microstep: 62.13
+[2024-06-18 22:10:00,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3951.03 | bwd: 3860.35 | bwd_inner: 3849.32 | bwd_allreduce: 10.89 | step: 62.23
+  3%|▎         | 22/700 [03:10<1:25:29,  7.57s/it]                                                  {'loss': 1.1276, 'learning_rate': 9.999946482054772e-05, 'epoch': 0.22}
+  3%|▎         | 22/700 [03:10<1:25:29,  7.57s/it][2024-06-18 22:10:04,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.16 | bwd_microstep: 1924.31 | bwd_inner_microstep: 1919.45 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.07
+[2024-06-18 22:10:08,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.93
+[2024-06-18 22:10:08,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.30 | bwd_microstep: 1933.56 | bwd_inner_microstep: 1927.60 | bwd_allreduce_microstep: 5.86 | step_microstep: 62.89
+[2024-06-18 22:10:08,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3939.42 | bwd: 3857.89 | bwd_inner: 3847.10 | bwd_allreduce: 10.65 | step: 62.97
+  3%|▎         | 23/700 [03:17<1:26:30,  7.67s/it]                                                  {'loss': 1.4121, 'learning_rate': 9.999785929364756e-05, 'epoch': 0.23}
+  3%|▎         | 23/700 [03:17<1:26:30,  7.67s/it][2024-06-18 22:10:11,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1389.77 | bwd_microstep: 1611.53 | bwd_inner_microstep: 1606.59 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.07
+[2024-06-18 22:10:15,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:10:15,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1908.40 | bwd_microstep: 1808.90 | bwd_inner_microstep: 1803.52 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.38
+[2024-06-18 22:10:15,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3298.14 | bwd: 3420.45 | bwd_inner: 3410.17 | bwd_allreduce: 10.08 | step: 61.46
+  3%|▎         | 24/700 [03:24<1:23:30,  7.41s/it]                                                  {'loss': 0.9086, 'learning_rate': 9.999518345366932e-05, 'epoch': 0.24}
+  3%|▎         | 24/700 [03:24<1:23:30,  7.41s/it][2024-06-18 22:10:18,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1882.55 | bwd_microstep: 1738.79 | bwd_inner_microstep: 1733.92 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.09
+[2024-06-18 22:10:22,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:10:22,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.48 | bwd_microstep: 1927.16 | bwd_inner_microstep: 1921.81 | bwd_allreduce_microstep: 5.27 | step_microstep: 60.78
+[2024-06-18 22:10:22,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3850.00 | bwd: 3665.97 | bwd_inner: 3655.77 | bwd_allreduce: 10.06 | step: 60.88
+  4%|▎         | 25/700 [03:32<1:24:04,  7.47s/it]                                                  {'loss': 0.5801, 'learning_rate': 9.999143735789518e-05, 'epoch': 0.25}
+  4%|▎         | 25/700 [03:32<1:24:04,  7.47s/it][2024-06-18 22:10:26,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1956.93 | bwd_microstep: 1888.36 | bwd_inner_microstep: 1883.60 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:10:30,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:10:30,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1993.45 | bwd_microstep: 1977.25 | bwd_inner_microstep: 1971.85 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.27
+[2024-06-18 22:10:30,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3950.35 | bwd: 3865.63 | bwd_inner: 3855.51 | bwd_allreduce: 9.93 | step: 61.35
+  4%|▎         | 26/700 [03:40<1:25:26,  7.61s/it]                                                  {'loss': 1.0704, 'learning_rate': 9.998662108651848e-05, 'epoch': 0.26}
+  4%|▎         | 26/700 [03:40<1:25:26,  7.61s/it][2024-06-18 22:10:34,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1994.38 | bwd_microstep: 1972.70 | bwd_inner_microstep: 1967.59 | bwd_allreduce_microstep: 5.01 | step_microstep: 0.10
+[2024-06-18 22:10:38,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:10:38,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1955.48 | bwd_microstep: 1886.71 | bwd_inner_microstep: 1881.43 | bwd_allreduce_microstep: 5.21 | step_microstep: 61.73
+[2024-06-18 22:10:38,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3949.83 | bwd: 3859.43 | bwd_inner: 3849.05 | bwd_allreduce: 10.23 | step: 61.85
+  4%|▍         | 27/700 [03:48<1:26:21,  7.70s/it]                                                  {'loss': 1.0435, 'learning_rate': 9.9980734742642e-05, 'epoch': 0.27}
+  4%|▍         | 27/700 [03:48<1:26:21,  7.70s/it][2024-06-18 22:10:42,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1884.44 | bwd_microstep: 1739.45 | bwd_inner_microstep: 1734.67 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:10:46,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:10:46,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.46 | bwd_microstep: 1916.32 | bwd_inner_microstep: 1911.03 | bwd_allreduce_microstep: 5.21 | step_microstep: 61.47
+[2024-06-18 22:10:46,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3847.87 | bwd: 3655.80 | bwd_inner: 3645.74 | bwd_allreduce: 9.92 | step: 61.55
+  4%|▍         | 28/700 [03:55<1:25:54,  7.67s/it]                                                  {'loss': 0.6334, 'learning_rate': 9.997377845227576e-05, 'epoch': 0.28}
+  4%|▍         | 28/700 [03:55<1:25:54,  7.67s/it][2024-06-18 22:10:50,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.18 | bwd_microstep: 1913.66 | bwd_inner_microstep: 1908.91 | bwd_allreduce_microstep: 4.66 | step_microstep: 0.07
+[2024-06-18 22:10:54,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.80
+[2024-06-18 22:10:54,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.95 | bwd_microstep: 1938.38 | bwd_inner_microstep: 1933.03 | bwd_allreduce_microstep: 5.28 | step_microstep: 60.94
+[2024-06-18 22:10:54,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3936.09 | bwd: 3852.06 | bwd_inner: 3841.98 | bwd_allreduce: 9.95 | step: 61.02
+  4%|▍         | 29/700 [04:03<1:26:30,  7.74s/it]                                                  {'loss': 1.0811, 'learning_rate': 9.996575236433428e-05, 'epoch': 0.29}
+  4%|▍         | 29/700 [04:03<1:26:30,  7.74s/it][2024-06-18 22:10:58,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1987.46 | bwd_microstep: 1959.47 | bwd_inner_microstep: 1954.52 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.14
+[2024-06-18 22:11:00,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:11:00,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1005.67 | bwd_microstep: 1270.82 | bwd_inner_microstep: 1265.42 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.27
+[2024-06-18 22:11:00,580] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2993.11 | bwd: 3230.32 | bwd_inner: 3220.06 | bwd_allreduce: 10.01 | step: 61.41
+  4%|▍         | 30/700 [04:10<1:21:39,  7.31s/it]                                                  {'loss': 1.2314, 'learning_rate': 9.995665665063349e-05, 'epoch': 0.3}
+  4%|▍         | 30/700 [04:10<1:21:39,  7.31s/it][2024-06-18 22:11:04,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1955.72 | bwd_microstep: 1889.53 | bwd_inner_microstep: 1884.74 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:11:07,525] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:11:07,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1387.71 | bwd_microstep: 1612.18 | bwd_inner_microstep: 1604.44 | bwd_allreduce_microstep: 7.57 | step_microstep: 61.22
+[2024-06-18 22:11:07,526] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3343.40 | bwd: 3501.74 | bwd_inner: 3489.28 | bwd_allreduce: 12.25 | step: 61.30
+  4%|▍         | 31/700 [04:17<1:20:18,  7.20s/it]                                                  {'loss': 0.4723, 'learning_rate': 9.994649150588693e-05, 'epoch': 0.31}
+  4%|▍         | 31/700 [04:17<1:20:18,  7.20s/it][2024-06-18 22:11:11,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1903.77 | bwd_microstep: 1806.04 | bwd_inner_microstep: 1801.26 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.09
+[2024-06-18 22:11:14,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:11:14,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1887.38 | bwd_microstep: 1745.42 | bwd_inner_microstep: 1740.13 | bwd_allreduce_microstep: 5.13 | step_microstep: 60.54
+[2024-06-18 22:11:14,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3791.13 | bwd: 3551.49 | bwd_inner: 3541.49 | bwd_allreduce: 9.80 | step: 60.63
+  5%|▍         | 32/700 [04:24<1:20:59,  7.27s/it]                                                  {'loss': 0.0684, 'learning_rate': 9.993525714770166e-05, 'epoch': 0.32}
+  5%|▍         | 32/700 [04:24<1:20:59,  7.27s/it][2024-06-18 22:11:18,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1417.25 | bwd_microstep: 1657.37 | bwd_inner_microstep: 1652.62 | bwd_allreduce_microstep: 4.66 | step_microstep: 0.07
+[2024-06-18 22:11:22,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:11:22,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.65 | bwd_microstep: 1925.25 | bwd_inner_microstep: 1919.96 | bwd_allreduce_microstep: 5.20 | step_microstep: 60.95
+[2024-06-18 22:11:22,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3383.87 | bwd: 3582.64 | bwd_inner: 3572.63 | bwd_allreduce: 9.87 | step: 61.02
+  5%|▍         | 33/700 [04:31<1:20:10,  7.21s/it]                                                  {'loss': 0.5988, 'learning_rate': 9.992295381657361e-05, 'epoch': 0.33}
+  5%|▍         | 33/700 [04:31<1:20:10,  7.21s/it][2024-06-18 22:11:25,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1906.84 | bwd_microstep: 1804.85 | bwd_inner_microstep: 1800.12 | bwd_allreduce_microstep: 4.65 | step_microstep: 0.07
+[2024-06-18 22:11:29,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:11:29,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1887.14 | bwd_microstep: 1746.42 | bwd_inner_microstep: 1740.99 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.31
+[2024-06-18 22:11:29,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3793.95 | bwd: 3551.30 | bwd_inner: 3541.21 | bwd_allreduce: 9.90 | step: 61.39
+  5%|▍         | 34/700 [04:38<1:20:50,  7.28s/it]                                                  {'loss': 0.0821, 'learning_rate': 9.990958177588236e-05, 'epoch': 0.34}
+  5%|▍         | 34/700 [04:38<1:20:50,  7.28s/it][2024-06-18 22:11:33,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1853.37 | bwd_microstep: 1692.56 | bwd_inner_microstep: 1687.87 | bwd_allreduce_microstep: 4.63 | step_microstep: 0.07
+[2024-06-18 22:11:36,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.92
+[2024-06-18 22:11:36,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1649.14 | bwd_microstep: 1858.31 | bwd_inner_microstep: 1852.77 | bwd_allreduce_microstep: 5.43 | step_microstep: 63.55
+[2024-06-18 22:11:36,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3502.49 | bwd: 3550.89 | bwd_inner: 3540.68 | bwd_allreduce: 10.07 | step: 63.62
+  5%|▌         | 35/700 [04:46<1:20:17,  7.24s/it]                                                  {'loss': 0.6222, 'learning_rate': 9.989514131188559e-05, 'epoch': 0.35}
+  5%|▌         | 35/700 [04:46<1:20:17,  7.24s/it][2024-06-18 22:11:39,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1439.38 | bwd_microstep: 1718.88 | bwd_inner_microstep: 1714.03 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 22:11:43,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:11:43,095] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1547.52 | bwd_microstep: 1651.30 | bwd_inner_microstep: 1646.02 | bwd_allreduce_microstep: 5.19 | step_microstep: 61.20
+[2024-06-18 22:11:43,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2986.87 | bwd: 3370.20 | bwd_inner: 3360.15 | bwd_allreduce: 9.86 | step: 61.28
+  5%|▌         | 36/700 [04:52<1:17:33,  7.01s/it]                                                  {'loss': 0.4488, 'learning_rate': 9.987963273371286e-05, 'epoch': 0.36}
+  5%|▌         | 36/700 [04:52<1:17:33,  7.01s/it][2024-06-18 22:11:46,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1957.87 | bwd_microstep: 1889.73 | bwd_inner_microstep: 1884.94 | bwd_allreduce_microstep: 4.65 | step_microstep: 0.08
+[2024-06-18 22:11:51,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:11:51,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1997.77 | bwd_microstep: 1978.00 | bwd_inner_microstep: 1972.64 | bwd_allreduce_microstep: 5.20 | step_microstep: 60.95
+[2024-06-18 22:11:51,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3955.60 | bwd: 3867.76 | bwd_inner: 3857.71 | bwd_allreduce: 9.83 | step: 61.03
+  5%|▌         | 37/700 [05:00<1:20:28,  7.28s/it]                                                  {'loss': 1.4852, 'learning_rate': 9.986305637335907e-05, 'epoch': 0.37}
+  5%|▌         | 37/700 [05:00<1:20:28,  7.28s/it][2024-06-18 22:11:54,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1985.82 | bwd_microstep: 1955.22 | bwd_inner_microstep: 1950.51 | bwd_allreduce_microstep: 4.66 | step_microstep: 0.07
+[2024-06-18 22:11:58,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:11:58,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1705.03 | bwd_microstep: 1681.19 | bwd_inner_microstep: 1675.79 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.46
+[2024-06-18 22:11:58,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3690.81 | bwd: 3636.43 | bwd_inner: 3626.32 | bwd_allreduce: 9.97 | step: 61.54
+  5%|▌         | 38/700 [05:07<1:20:50,  7.33s/it]                                                  {'loss': 1.4569, 'learning_rate': 9.984541258567731e-05, 'epoch': 0.38}
+  5%|▌         | 38/700 [05:07<1:20:50,  7.33s/it][2024-06-18 22:12:02,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1960.93 | bwd_microstep: 1897.98 | bwd_inner_microstep: 1893.25 | bwd_allreduce_microstep: 4.65 | step_microstep: 0.07
+[2024-06-18 22:12:05,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:12:05,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1497.76 | bwd_microstep: 1831.07 | bwd_inner_microstep: 1825.83 | bwd_allreduce_microstep: 5.15 | step_microstep: 61.27
+[2024-06-18 22:12:05,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3458.66 | bwd: 3729.07 | bwd_inner: 3719.13 | bwd_allreduce: 9.80 | step: 61.35
+  6%|▌         | 39/700 [05:15<1:20:35,  7.32s/it]                                                  {'loss': 1.1673, 'learning_rate': 9.98267017483713e-05, 'epoch': 0.39}
+  6%|▌         | 39/700 [05:15<1:20:35,  7.32s/it][2024-06-18 22:12:09,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1876.47 | bwd_microstep: 1725.32 | bwd_inner_microstep: 1720.59 | bwd_allreduce_microstep: 4.65 | step_microstep: 0.07
+[2024-06-18 22:12:12,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:12:12,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1860.66 | bwd_microstep: 1693.41 | bwd_inner_microstep: 1688.04 | bwd_allreduce_microstep: 5.21 | step_microstep: 60.83
+[2024-06-18 22:12:12,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3737.10 | bwd: 3418.76 | bwd_inner: 3408.72 | bwd_allreduce: 9.84 | step: 60.91
+  6%|▌         | 40/700 [05:22<1:20:16,  7.30s/it]                                                  {'loss': 0.195, 'learning_rate': 9.980692426198728e-05, 'epoch': 0.4}
+  6%|▌         | 40/700 [05:22<1:20:16,  7.30s/it][2024-06-18 22:12:16,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1415.43 | bwd_microstep: 1654.02 | bwd_inner_microstep: 1649.19 | bwd_allreduce_microstep: 4.66 | step_microstep: 0.08
+[2024-06-18 22:12:20,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:12:20,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.38 | bwd_microstep: 1936.24 | bwd_inner_microstep: 1930.97 | bwd_allreduce_microstep: 5.17 | step_microstep: 61.26
+[2024-06-18 22:12:20,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3386.78 | bwd: 3590.28 | bwd_inner: 3580.27 | bwd_allreduce: 9.84 | step: 61.36
+  6%|▌         | 41/700 [05:29<1:19:25,  7.23s/it]                                                  {'loss': 1.3146, 'learning_rate': 9.978608054990539e-05, 'epoch': 0.41}
+  6%|▌         | 41/700 [05:29<1:19:25,  7.23s/it][2024-06-18 22:12:23,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1908.57 | bwd_microstep: 1806.95 | bwd_inner_microstep: 1802.25 | bwd_allreduce_microstep: 4.63 | step_microstep: 0.07
+[2024-06-18 22:12:27,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:12:27,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1998.92 | bwd_microstep: 1980.68 | bwd_inner_microstep: 1975.35 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.24
+[2024-06-18 22:12:27,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3907.47 | bwd: 3787.65 | bwd_inner: 3777.64 | bwd_allreduce: 9.88 | step: 61.32
+  6%|▌         | 42/700 [05:37<1:21:09,  7.40s/it]                                                  {'loss': 0.4517, 'learning_rate': 9.97641710583307e-05, 'epoch': 0.42}
+  6%|▌         | 42/700 [05:37<1:21:09,  7.40s/it][2024-06-18 22:12:31,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1488.12 | bwd_microstep: 1800.87 | bwd_inner_microstep: 1796.15 | bwd_allreduce_microstep: 4.63 | step_microstep: 0.07
+[2024-06-18 22:12:35,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:12:35,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.91 | bwd_microstep: 1919.88 | bwd_inner_microstep: 1914.52 | bwd_allreduce_microstep: 5.19 | step_microstep: 60.87
+[2024-06-18 22:12:35,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3455.00 | bwd: 3720.77 | bwd_inner: 3710.77 | bwd_allreduce: 9.80 | step: 60.95
+  6%|▌         | 43/700 [05:44<1:20:37,  7.36s/it]                                                  {'loss': 0.7465, 'learning_rate': 9.974119625628361e-05, 'epoch': 0.43}
+  6%|▌         | 43/700 [05:44<1:20:37,  7.36s/it][2024-06-18 22:12:39,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.10 | bwd_microstep: 1898.38 | bwd_inner_microstep: 1893.51 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.08
+[2024-06-18 22:12:42,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:12:42,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1909.79 | bwd_microstep: 1813.55 | bwd_inner_microstep: 1808.13 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.19
+[2024-06-18 22:12:42,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3870.86 | bwd: 3711.95 | bwd_inner: 3701.78 | bwd_allreduce: 9.93 | step: 61.27
+  6%|▋         | 44/700 [05:52<1:21:33,  7.46s/it]                                                  {'loss': 0.4864, 'learning_rate': 9.971715663558979e-05, 'epoch': 0.44}
+  6%|▋         | 44/700 [05:52<1:21:33,  7.46s/it][2024-06-18 22:12:46,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1960.11 | bwd_microstep: 1896.00 | bwd_inner_microstep: 1890.96 | bwd_allreduce_microstep: 4.90 | step_microstep: 0.09
+[2024-06-18 22:12:50,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:12:50,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1674.46 | bwd_microstep: 1892.60 | bwd_inner_microstep: 1887.20 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.28
+[2024-06-18 22:12:50,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3634.54 | bwd: 3788.62 | bwd_inner: 3778.29 | bwd_allreduce: 10.12 | step: 61.38
+  6%|▋         | 45/700 [05:59<1:21:38,  7.48s/it]                                                  {'loss': 1.0293, 'learning_rate': 9.969205271086968e-05, 'epoch': 0.45}
+  6%|▋         | 45/700 [05:59<1:21:38,  7.48s/it][2024-06-18 22:12:53,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1852.94 | bwd_microstep: 1691.79 | bwd_inner_microstep: 1687.08 | bwd_allreduce_microstep: 4.62 | step_microstep: 0.08
+[2024-06-18 22:12:58,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:12:58,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2010.97 | bwd_microstep: 2000.56 | bwd_inner_microstep: 1995.23 | bwd_allreduce_microstep: 5.18 | step_microstep: 61.17
+[2024-06-18 22:12:58,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3863.88 | bwd: 3692.37 | bwd_inner: 3682.40 | bwd_allreduce: 9.81 | step: 61.25
+  7%|▋         | 46/700 [06:07<1:22:06,  7.53s/it]                                                  {'loss': 0.7191, 'learning_rate': 9.966588501952746e-05, 'epoch': 0.46}
+  7%|▋         | 46/700 [06:07<1:22:06,  7.53s/it][2024-06-18 22:13:01,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.38 | bwd_microstep: 1899.04 | bwd_inner_microstep: 1894.31 | bwd_allreduce_microstep: 4.65 | step_microstep: 0.07
+[2024-06-18 22:13:05,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:13:05,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1958.26 | bwd_microstep: 1893.35 | bwd_inner_microstep: 1887.97 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.11
+[2024-06-18 22:13:05,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3919.61 | bwd: 3792.41 | bwd_inner: 3782.33 | bwd_allreduce: 9.92 | step: 61.19
+  7%|▋         | 47/700 [06:15<1:22:53,  7.62s/it]                                                  {'loss': 0.9721, 'learning_rate': 9.963865412173957e-05, 'epoch': 0.47}
+  7%|▋         | 47/700 [06:15<1:22:53,  7.62s/it][2024-06-18 22:13:07,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 988.49 | bwd_microstep: 1071.97 | bwd_inner_microstep: 1067.24 | bwd_allreduce_microstep: 4.66 | step_microstep: 0.07
+[2024-06-18 22:13:11,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:13:11,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.48 | bwd_microstep: 1906.02 | bwd_inner_microstep: 1900.74 | bwd_allreduce_microstep: 5.20 | step_microstep: 61.04
+[2024-06-18 22:13:11,850] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2951.94 | bwd: 2978.01 | bwd_inner: 2968.01 | bwd_allreduce: 9.86 | step: 61.12
+  7%|▋         | 48/700 [06:21<1:17:35,  7.14s/it]                                                  {'loss': 1.1731, 'learning_rate': 9.961036060044268e-05, 'epoch': 0.48}
+  7%|▋         | 48/700 [06:21<1:17:35,  7.14s/it][2024-06-18 22:13:15,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1493.40 | bwd_microstep: 1814.64 | bwd_inner_microstep: 1809.69 | bwd_allreduce_microstep: 4.86 | step_microstep: 0.14
+[2024-06-18 22:13:18,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:13:18,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1392.04 | bwd_microstep: 1620.70 | bwd_inner_microstep: 1615.07 | bwd_allreduce_microstep: 5.48 | step_microstep: 62.06
+[2024-06-18 22:13:18,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2885.41 | bwd: 3435.36 | bwd_inner: 3424.84 | bwd_allreduce: 10.32 | step: 62.20
+  7%|▋         | 49/700 [06:27<1:15:08,  6.93s/it]                                                  {'loss': 1.0871, 'learning_rate': 9.958100506132127e-05, 'epoch': 0.49}
+  7%|▋         | 49/700 [06:27<1:15:08,  6.93s/it][2024-06-18 22:13:21,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1797.44 | bwd_microstep: 1849.16 | bwd_inner_microstep: 1844.33 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 22:13:25,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:13:25,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1994.43 | bwd_microstep: 1970.25 | bwd_inner_microstep: 1964.88 | bwd_allreduce_microstep: 5.28 | step_microstep: 62.14
+[2024-06-18 22:13:25,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3791.85 | bwd: 3819.43 | bwd_inner: 3809.26 | bwd_allreduce: 10.03 | step: 62.22
+  7%|▋         | 50/700 [06:35<1:17:34,  7.16s/it]                                                  {'loss': 0.8415, 'learning_rate': 9.955058813279455e-05, 'epoch': 0.5}
+  7%|▋         | 50/700 [06:35<1:17:34,  7.16s/it][2024-06-18 22:13:29,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1960.34 | bwd_microstep: 1898.17 | bwd_inner_microstep: 1893.28 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.09
+[2024-06-18 22:13:33,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:13:33,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1575.92 | bwd_microstep: 1685.64 | bwd_inner_microstep: 1677.85 | bwd_allreduce_microstep: 7.64 | step_microstep: 63.88
+[2024-06-18 22:13:33,209] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3536.23 | bwd: 3583.84 | bwd_inner: 3571.20 | bwd_allreduce: 12.44 | step: 63.97
+  7%|▋         | 51/700 [06:42<1:17:40,  7.18s/it]                                                  {'loss': 0.584, 'learning_rate': 9.951911046600313e-05, 'epoch': 0.51}
+  7%|▋         | 51/700 [06:42<1:17:40,  7.18s/it][2024-06-18 22:13:36,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1729.14 | bwd_microstep: 1712.42 | bwd_inner_microstep: 1707.53 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.08
+[2024-06-18 22:13:40,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:13:40,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.67 | bwd_microstep: 1937.29 | bwd_inner_microstep: 1931.80 | bwd_allreduce_microstep: 5.34 | step_microstep: 63.65
+[2024-06-18 22:13:40,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3702.79 | bwd: 3649.73 | bwd_inner: 3639.41 | bwd_allreduce: 10.16 | step: 63.73
+  7%|▋         | 52/700 [06:50<1:18:26,  7.26s/it]                                                  {'loss': 0.6379, 'learning_rate': 9.948657273479507e-05, 'epoch': 0.52}
+  7%|▋         | 52/700 [06:50<1:18:26,  7.26s/it][2024-06-18 22:13:44,320] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1886.81 | bwd_microstep: 1746.24 | bwd_inner_microstep: 1741.35 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.08
+[2024-06-18 22:13:48,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:13:48,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.71 | bwd_microstep: 1924.15 | bwd_inner_microstep: 1918.72 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.95
+[2024-06-18 22:13:48,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3856.49 | bwd: 3670.41 | bwd_inner: 3660.15 | bwd_allreduce: 10.12 | step: 62.03
+  8%|▊         | 53/700 [06:57<1:19:30,  7.37s/it]                                                  {'loss': 0.5592, 'learning_rate': 9.945297563571135e-05, 'epoch': 0.53}
+  8%|▊         | 53/700 [06:57<1:19:30,  7.37s/it][2024-06-18 22:13:52,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.99 | bwd_microstep: 1928.92 | bwd_inner_microstep: 1924.08 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.08
+[2024-06-18 22:13:56,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.99
+[2024-06-18 22:13:56,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.51 | bwd_microstep: 1915.82 | bwd_inner_microstep: 1908.22 | bwd_allreduce_microstep: 7.38 | step_microstep: 66.54
+[2024-06-18 22:13:56,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3932.47 | bwd: 3844.76 | bwd_inner: 3832.47 | bwd_allreduce: 12.06 | step: 66.62
+  8%|▊         | 54/700 [07:05<1:21:02,  7.53s/it]                                                  {'loss': 1.1618, 'learning_rate': 9.941831988797104e-05, 'epoch': 0.54}
+  8%|▊         | 54/700 [07:05<1:21:02,  7.53s/it][2024-06-18 22:14:00,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1993.68 | bwd_microstep: 1969.98 | bwd_inner_microstep: 1965.10 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.08
+[2024-06-18 22:14:04,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:14:04,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.27 | bwd_microstep: 1931.12 | bwd_inner_microstep: 1925.02 | bwd_allreduce_microstep: 6.00 | step_microstep: 62.22
+[2024-06-18 22:14:04,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3964.92 | bwd: 3901.13 | bwd_inner: 3890.23 | bwd_allreduce: 10.75 | step: 62.30
+  8%|▊         | 55/700 [07:13<1:22:20,  7.66s/it]                                                  {'loss': 1.3189, 'learning_rate': 9.938260623345591e-05, 'epoch': 0.55}
+  8%|▊         | 55/700 [07:13<1:22:20,  7.66s/it][2024-06-18 22:14:08,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1987.61 | bwd_microstep: 1961.54 | bwd_inner_microstep: 1956.67 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.08
+[2024-06-18 22:14:12,064] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:14:12,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.35 | bwd_microstep: 1899.83 | bwd_inner_microstep: 1894.54 | bwd_allreduce_microstep: 5.21 | step_microstep: 62.83
+[2024-06-18 22:14:12,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3949.92 | bwd: 3861.40 | bwd_inner: 3851.31 | bwd_allreduce: 9.95 | step: 62.91
+  8%|▊         | 56/700 [07:21<1:23:02,  7.74s/it]                                                  {'loss': 1.0713, 'learning_rate': 9.934583543669453e-05, 'epoch': 0.56}
+  8%|▊         | 56/700 [07:21<1:23:02,  7.74s/it][2024-06-18 22:14:15,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1856.20 | bwd_microstep: 1695.86 | bwd_inner_microstep: 1691.01 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 22:14:18,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.92
+[2024-06-18 22:14:18,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1394.54 | bwd_microstep: 1620.24 | bwd_inner_microstep: 1614.65 | bwd_allreduce_microstep: 5.46 | step_microstep: 65.70
+[2024-06-18 22:14:18,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3250.71 | bwd: 3316.12 | bwd_inner: 3305.80 | bwd_allreduce: 10.17 | step: 65.78
+  8%|▊         | 57/700 [07:28<1:19:29,  7.42s/it]                                                  {'loss': 0.6324, 'learning_rate': 9.930800828484592e-05, 'epoch': 0.57}
+  8%|▊         | 57/700 [07:28<1:19:29,  7.42s/it][2024-06-18 22:14:22,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1889.71 | bwd_microstep: 1748.26 | bwd_inner_microstep: 1743.38 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 22:14:26,483] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 2.02
+[2024-06-18 22:14:26,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2004.82 | bwd_microstep: 1993.37 | bwd_inner_microstep: 1987.69 | bwd_allreduce_microstep: 5.59 | step_microstep: 65.71
+[2024-06-18 22:14:26,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3894.49 | bwd: 3741.66 | bwd_inner: 3731.15 | bwd_allreduce: 10.36 | step: 65.80
+  8%|▊         | 58/700 [07:35<1:20:25,  7.52s/it]                                                  {'loss': 0.6729, 'learning_rate': 9.926912558768262e-05, 'epoch': 0.58}
+  8%|▊         | 58/700 [07:35<1:20:25,  7.52s/it][2024-06-18 22:14:30,423] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.18 | bwd_microstep: 1939.94 | bwd_inner_microstep: 1934.92 | bwd_allreduce_microstep: 4.93 | step_microstep: 0.08
+[2024-06-18 22:14:34,476] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:14:34,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1998.35 | bwd_microstep: 1973.26 | bwd_inner_microstep: 1967.88 | bwd_allreduce_microstep: 5.27 | step_microstep: 62.65
+[2024-06-18 22:14:34,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3972.50 | bwd: 3913.22 | bwd_inner: 3902.87 | bwd_allreduce: 10.21 | step: 62.74
+  8%|▊         | 59/700 [07:43<1:21:49,  7.66s/it]                                                  {'loss': 0.7628, 'learning_rate': 9.922918817757345e-05, 'epoch': 0.59}
+  8%|▊         | 59/700 [07:43<1:21:49,  7.66s/it][2024-06-18 22:14:37,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1388.90 | bwd_microstep: 1619.56 | bwd_inner_microstep: 1614.52 | bwd_allreduce_microstep: 4.95 | step_microstep: 0.10
+[2024-06-18 22:14:41,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:14:41,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1997.14 | bwd_microstep: 1970.13 | bwd_inner_microstep: 1964.77 | bwd_allreduce_microstep: 5.27 | step_microstep: 62.17
+[2024-06-18 22:14:41,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3386.00 | bwd: 3589.71 | bwd_inner: 3579.35 | bwd_allreduce: 10.22 | step: 62.27
+  9%|▊         | 60/700 [07:51<1:19:50,  7.49s/it]                                                  {'loss': 0.641, 'learning_rate': 9.918819690946567e-05, 'epoch': 0.6}
+  9%|▊         | 60/700 [07:51<1:19:50,  7.49s/it][2024-06-18 22:14:45,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1823.38 | bwd_microstep: 1642.25 | bwd_inner_microstep: 1637.25 | bwd_allreduce_microstep: 4.91 | step_microstep: 0.08
+[2024-06-18 22:14:48,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.93
+[2024-06-18 22:14:48,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1959.24 | bwd_microstep: 1895.95 | bwd_inner_microstep: 1890.41 | bwd_allreduce_microstep: 5.44 | step_microstep: 65.86
+[2024-06-18 22:14:48,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3782.59 | bwd: 3538.22 | bwd_inner: 3527.71 | bwd_allreduce: 10.36 | step: 65.95
+  9%|▊         | 61/700 [07:58<1:19:32,  7.47s/it]                                                  {'loss': 0.5939, 'learning_rate': 9.914615266086668e-05, 'epoch': 0.61}
+  9%|▊         | 61/700 [07:58<1:19:32,  7.47s/it][2024-06-18 22:14:52,978] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1997.07 | bwd_microstep: 1970.65 | bwd_inner_microstep: 1965.78 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.08
+[2024-06-18 22:14:56,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.92
+[2024-06-18 22:14:56,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.02 | bwd_microstep: 1948.07 | bwd_inner_microstep: 1942.54 | bwd_allreduce_microstep: 5.44 | step_microstep: 65.52
+[2024-06-18 22:14:56,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3973.06 | bwd: 3918.75 | bwd_inner: 3908.37 | bwd_allreduce: 10.23 | step: 65.61
+  9%|▉         | 62/700 [08:06<1:21:06,  7.63s/it]                                                  {'loss': 1.1748, 'learning_rate': 9.910305633182518e-05, 'epoch': 0.62}
+  9%|▉         | 62/700 [08:06<1:21:06,  7.63s/it][2024-06-18 22:15:00,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1443.45 | bwd_microstep: 1730.03 | bwd_inner_microstep: 1724.49 | bwd_allreduce_microstep: 5.44 | step_microstep: 0.08
+[2024-06-18 22:15:03,638] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 2.59
+[2024-06-18 22:15:03,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1507.48 | bwd_microstep: 1850.14 | bwd_inner_microstep: 1843.25 | bwd_allreduce_microstep: 6.76 | step_microstep: 79.11
+[2024-06-18 22:15:03,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2950.90 | bwd: 3580.20 | bwd_inner: 3567.81 | bwd_allreduce: 12.22 | step: 79.20
+  9%|▉         | 63/700 [08:13<1:17:53,  7.34s/it]                                                  {'loss': 1.1561, 'learning_rate': 9.905890884491195e-05, 'epoch': 0.63}
+  9%|▉         | 63/700 [08:13<1:17:53,  7.34s/it][2024-06-18 22:15:07,547] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.90 | bwd_microstep: 1912.81 | bwd_inner_microstep: 1908.02 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 22:15:11,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:15:11,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.40 | bwd_microstep: 1914.81 | bwd_inner_microstep: 1909.47 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.67
+[2024-06-18 22:15:11,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3931.26 | bwd: 3827.64 | bwd_inner: 3817.54 | bwd_allreduce: 9.97 | step: 61.75
+  9%|▉         | 64/700 [08:20<1:19:26,  7.49s/it]                                                  {'loss': 1.1403, 'learning_rate': 9.901371114520012e-05, 'epoch': 0.64}
+  9%|▉         | 64/700 [08:20<1:19:26,  7.49s/it][2024-06-18 22:15:15,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.85 | bwd_microstep: 1919.48 | bwd_inner_microstep: 1912.56 | bwd_allreduce_microstep: 6.76 | step_microstep: 0.08
+[2024-06-18 22:15:19,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:15:19,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1907.94 | bwd_microstep: 1806.60 | bwd_inner_microstep: 1801.31 | bwd_allreduce_microstep: 5.20 | step_microstep: 61.17
+[2024-06-18 22:15:19,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3871.75 | bwd: 3726.11 | bwd_inner: 3713.97 | bwd_allreduce: 11.95 | step: 61.26
+  9%|▉         | 65/700 [08:28<1:19:58,  7.56s/it]                                                  {'loss': 0.6795, 'learning_rate': 9.89674642002449e-05, 'epoch': 0.65}
+  9%|▉         | 65/700 [08:28<1:19:58,  7.56s/it][2024-06-18 22:15:23,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1960.84 | bwd_microstep: 1895.18 | bwd_inner_microstep: 1890.35 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.08
+[2024-06-18 22:15:27,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.95
+[2024-06-18 22:15:27,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.39 | bwd_microstep: 1925.29 | bwd_inner_microstep: 1919.76 | bwd_allreduce_microstep: 5.43 | step_microstep: 63.21
+[2024-06-18 22:15:27,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3929.20 | bwd: 3820.49 | bwd_inner: 3810.16 | bwd_allreduce: 10.18 | step: 63.30
+  9%|▉         | 66/700 [08:36<1:20:47,  7.65s/it]                                                  {'loss': 1.1345, 'learning_rate': 9.892016900006284e-05, 'epoch': 0.66}
+  9%|▉         | 66/700 [08:36<1:20:47,  7.65s/it][2024-06-18 22:15:30,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.98 | bwd_microstep: 1899.84 | bwd_inner_microstep: 1894.95 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.08
+[2024-06-18 22:15:34,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:15:34,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.76 | bwd_microstep: 1906.56 | bwd_inner_microstep: 1901.21 | bwd_allreduce_microstep: 5.20 | step_microstep: 61.62
+[2024-06-18 22:15:34,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3930.72 | bwd: 3806.42 | bwd_inner: 3796.31 | bwd_allreduce: 9.93 | step: 61.71
+ 10%|▉         | 67/700 [08:44<1:21:16,  7.70s/it]                                                  {'loss': 1.1063, 'learning_rate': 9.887182655711077e-05, 'epoch': 0.67}
+ 10%|▉         | 67/700 [08:44<1:21:16,  7.70s/it][2024-06-18 22:15:37,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1391.40 | bwd_microstep: 1610.42 | bwd_inner_microstep: 1605.58 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 22:15:41,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:15:41,720] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1912.30 | bwd_microstep: 1806.61 | bwd_inner_microstep: 1801.09 | bwd_allreduce_microstep: 5.37 | step_microstep: 61.64
+[2024-06-18 22:15:41,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3303.67 | bwd: 3417.05 | bwd_inner: 3406.74 | bwd_allreduce: 10.11 | step: 61.72
+ 10%|▉         | 68/700 [08:51<1:18:21,  7.44s/it]                                                  {'loss': 0.6593, 'learning_rate': 9.882243790626393e-05, 'epoch': 0.68}
+ 10%|▉         | 68/700 [08:51<1:18:21,  7.44s/it][2024-06-18 22:15:45,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1489.99 | bwd_microstep: 1806.19 | bwd_inner_microstep: 1801.35 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 22:15:48,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:15:48,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1959.53 | bwd_microstep: 1890.08 | bwd_inner_microstep: 1884.78 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.68
+[2024-06-18 22:15:48,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3449.49 | bwd: 3696.30 | bwd_inner: 3686.21 | bwd_allreduce: 9.95 | step: 61.76
+ 10%|▉         | 69/700 [08:58<1:17:37,  7.38s/it]                                                  {'loss': 0.9359, 'learning_rate': 9.877200410479399e-05, 'epoch': 0.69}
+ 10%|▉         | 69/700 [08:58<1:17:37,  7.38s/it][2024-06-18 22:15:52,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.76 | bwd_microstep: 1920.17 | bwd_inner_microstep: 1915.36 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.08
+[2024-06-18 22:15:56,807] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:15:56,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1959.13 | bwd_microstep: 1892.78 | bwd_inner_microstep: 1887.43 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.62
+[2024-06-18 22:15:56,808] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3925.86 | bwd: 3812.98 | bwd_inner: 3802.83 | bwd_allreduce: 10.01 | step: 61.70
+ 10%|█         | 70/700 [09:06<1:18:56,  7.52s/it]                                                  {'loss': 1.0167, 'learning_rate': 9.872052623234632e-05, 'epoch': 0.7}
+ 10%|█         | 70/700 [09:06<1:18:56,  7.52s/it][2024-06-18 22:16:00,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1993.96 | bwd_microstep: 1969.21 | bwd_inner_microstep: 1964.36 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.07
+[2024-06-18 22:16:04,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:16:04,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.66 | bwd_microstep: 1897.07 | bwd_inner_microstep: 1891.72 | bwd_allreduce_microstep: 5.19 | step_microstep: 61.95
+[2024-06-18 22:16:04,733] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3955.59 | bwd: 3866.31 | bwd_inner: 3856.22 | bwd_allreduce: 9.89 | step: 62.03
+ 10%|█         | 71/700 [09:14<1:20:05,  7.64s/it]                                                  {'loss': 0.8953, 'learning_rate': 9.866800539091688e-05, 'epoch': 0.71}
+ 10%|█         | 71/700 [09:14<1:20:05,  7.64s/it][2024-06-18 22:16:08,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1957.02 | bwd_microstep: 1887.50 | bwd_inner_microstep: 1882.39 | bwd_allreduce_microstep: 5.01 | step_microstep: 0.10
+[2024-06-18 22:16:12,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 22:16:12,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.61 | bwd_microstep: 1934.49 | bwd_inner_microstep: 1929.19 | bwd_allreduce_microstep: 5.21 | step_microstep: 62.22
+[2024-06-18 22:16:12,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3928.59 | bwd: 3822.01 | bwd_inner: 3811.65 | bwd_allreduce: 10.22 | step: 62.32
+ 10%|█         | 72/700 [09:22<1:20:38,  7.70s/it]                                                  {'loss': 0.8634, 'learning_rate': 9.861444270482868e-05, 'epoch': 0.72}
+ 10%|█         | 72/700 [09:22<1:20:38,  7.70s/it][2024-06-18 22:16:14,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 733.77 | bwd_microstep: 844.49 | bwd_inner_microstep: 839.70 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.08
+[2024-06-18 22:16:18,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:16:18,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.62 | bwd_microstep: 1932.08 | bwd_inner_microstep: 1926.77 | bwd_allreduce_microstep: 5.20 | step_microstep: 61.73
+[2024-06-18 22:16:18,169] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2704.36 | bwd: 2776.59 | bwd_inner: 2766.56 | bwd_allreduce: 9.90 | step: 61.81
+ 10%|█         | 73/700 [09:27<1:13:51,  7.07s/it]                                                  {'loss': 0.6286, 'learning_rate': 9.85598393207077e-05, 'epoch': 0.73}
+ 10%|█         | 73/700 [09:27<1:13:51,  7.07s/it][2024-06-18 22:16:22,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.99 | bwd_microstep: 1922.04 | bwd_inner_microstep: 1917.29 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.08
+[2024-06-18 22:16:25,417] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:16:25,418] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1572.65 | bwd_microstep: 1685.11 | bwd_inner_microstep: 1679.72 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.78
+[2024-06-18 22:16:25,418] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3540.61 | bwd: 3607.18 | bwd_inner: 3597.09 | bwd_allreduce: 9.91 | step: 61.87
+ 11%|█         | 74/700 [09:34<1:14:18,  7.12s/it]                                                  {'loss': 1.0384, 'learning_rate': 9.850419640745831e-05, 'epoch': 0.74}
+ 11%|█         | 74/700 [09:34<1:14:18,  7.12s/it][2024-06-18 22:16:29,338] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.53 | bwd_microstep: 1927.92 | bwd_inner_microstep: 1923.23 | bwd_allreduce_microstep: 4.64 | step_microstep: 0.07
+[2024-06-18 22:16:33,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:16:33,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.68 | bwd_microstep: 1900.82 | bwd_inner_microstep: 1895.26 | bwd_allreduce_microstep: 5.48 | step_microstep: 62.51
+[2024-06-18 22:16:33,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3932.19 | bwd: 3828.77 | bwd_inner: 3818.52 | bwd_allreduce: 10.13 | step: 62.59
+ 11%|█         | 75/700 [09:42<1:16:30,  7.34s/it]                                                  {'loss': 1.2193, 'learning_rate': 9.844751515623824e-05, 'epoch': 0.75}
+ 11%|█         | 75/700 [09:42<1:16:30,  7.34s/it][2024-06-18 22:16:36,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1884.26 | bwd_microstep: 1738.40 | bwd_inner_microstep: 1733.68 | bwd_allreduce_microstep: 4.67 | step_microstep: 0.07
+[2024-06-18 22:16:40,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:16:40,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.78 | bwd_microstep: 1899.56 | bwd_inner_microstep: 1894.18 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.86
+[2024-06-18 22:16:40,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3846.01 | bwd: 3637.99 | bwd_inner: 3627.89 | bwd_allreduce: 9.98 | step: 61.94
+ 11%|█         | 76/700 [09:50<1:17:08,  7.42s/it]                                                  {'loss': 0.5009, 'learning_rate': 9.838979678043315e-05, 'epoch': 0.76}
+ 11%|█         | 76/700 [09:50<1:17:08,  7.42s/it][2024-06-18 22:16:44,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1990.59 | bwd_microstep: 1963.32 | bwd_inner_microstep: 1958.56 | bwd_allreduce_microstep: 4.67 | step_microstep: 0.07
+[2024-06-18 22:16:47,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:16:47,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1391.79 | bwd_microstep: 1615.35 | bwd_inner_microstep: 1610.04 | bwd_allreduce_microstep: 5.21 | step_microstep: 62.04
+[2024-06-18 22:16:47,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3382.35 | bwd: 3578.69 | bwd_inner: 3568.66 | bwd_allreduce: 9.90 | step: 62.12
+ 11%|█         | 77/700 [09:57<1:15:54,  7.31s/it]                                                  {'loss': 0.9311, 'learning_rate': 9.833104251563056e-05, 'epoch': 0.77}
+ 11%|█         | 77/700 [09:57<1:15:54,  7.31s/it][2024-06-18 22:16:51,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1884.16 | bwd_microstep: 1739.57 | bwd_inner_microstep: 1734.65 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.08
+[2024-06-18 22:16:55,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:16:55,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.98 | bwd_microstep: 1926.88 | bwd_inner_microstep: 1921.61 | bwd_allreduce_microstep: 5.17 | step_microstep: 61.54
+[2024-06-18 22:16:55,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3854.11 | bwd: 3666.48 | bwd_inner: 3656.33 | bwd_allreduce: 10.01 | step: 61.62
+ 11%|█         | 78/700 [10:05<1:16:45,  7.40s/it]                                                  {'loss': 0.787, 'learning_rate': 9.827125361959353e-05, 'epoch': 0.78}
+ 11%|█         | 78/700 [10:05<1:16:45,  7.40s/it][2024-06-18 22:16:58,748] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1542.32 | bwd_microstep: 1631.08 | bwd_inner_microstep: 1626.34 | bwd_allreduce_microstep: 4.65 | step_microstep: 0.07
+[2024-06-18 22:17:02,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:17:02,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1912.64 | bwd_microstep: 1809.27 | bwd_inner_microstep: 1803.94 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.71
+[2024-06-18 22:17:02,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3454.93 | bwd: 3440.37 | bwd_inner: 3430.34 | bwd_allreduce: 9.90 | step: 61.79
+ 11%|█▏        | 79/700 [10:12<1:15:22,  7.28s/it]                                                  {'loss': 0.0479, 'learning_rate': 9.821043137223356e-05, 'epoch': 0.79}
+ 11%|█▏        | 79/700 [10:12<1:15:22,  7.28s/it][2024-06-18 22:17:06,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1730.98 | bwd_microstep: 1716.30 | bwd_inner_microstep: 1711.51 | bwd_allreduce_microstep: 4.67 | step_microstep: 0.07
+[2024-06-18 22:17:10,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:17:10,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1996.29 | bwd_microstep: 1971.37 | bwd_inner_microstep: 1966.01 | bwd_allreduce_microstep: 5.19 | step_microstep: 61.43
+[2024-06-18 22:17:10,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3727.24 | bwd: 3687.69 | bwd_inner: 3677.65 | bwd_allreduce: 9.87 | step: 61.52
+ 11%|█▏        | 80/700 [10:19<1:15:58,  7.35s/it]                                                  {'loss': 1.2287, 'learning_rate': 9.814857707558335e-05, 'epoch': 0.8}
+ 11%|█▏        | 80/700 [10:19<1:15:58,  7.35s/it][2024-06-18 22:17:13,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.78 | bwd_microstep: 1927.34 | bwd_inner_microstep: 1922.29 | bwd_allreduce_microstep: 4.95 | step_microstep: 0.10
+[2024-06-18 22:17:17,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 22:17:17,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.99 | bwd_microstep: 1931.59 | bwd_inner_microstep: 1926.37 | bwd_allreduce_microstep: 5.17 | step_microstep: 61.88
+[2024-06-18 22:17:17,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3941.74 | bwd: 3858.96 | bwd_inner: 3848.69 | bwd_allreduce: 10.13 | step: 61.98
+ 12%|█▏        | 81/700 [10:27<1:17:33,  7.52s/it]                                                  {'loss': 1.2224, 'learning_rate': 9.808569205376884e-05, 'epoch': 0.81}
+ 12%|█▏        | 81/700 [10:27<1:17:33,  7.52s/it][2024-06-18 22:17:21,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1910.18 | bwd_microstep: 1807.32 | bwd_inner_microstep: 1802.47 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.08
+[2024-06-18 22:17:25,642] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:17:25,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1959.84 | bwd_microstep: 1893.19 | bwd_inner_microstep: 1887.87 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.44
+[2024-06-18 22:17:25,643] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3869.99 | bwd: 3700.53 | bwd_inner: 3690.43 | bwd_allreduce: 9.94 | step: 61.52
+ 12%|█▏        | 82/700 [10:35<1:17:54,  7.56s/it]                                                  {'loss': 0.7389, 'learning_rate': 9.802177765298091e-05, 'epoch': 0.82}
+ 12%|█▏        | 82/700 [10:35<1:17:54,  7.56s/it][2024-06-18 22:17:29,545] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.72 | bwd_microstep: 1915.28 | bwd_inner_microstep: 1910.49 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:17:33,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:17:33,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1644.27 | bwd_microstep: 1834.25 | bwd_inner_microstep: 1828.94 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.37
+[2024-06-18 22:17:33,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3608.96 | bwd: 3749.55 | bwd_inner: 3739.48 | bwd_allreduce: 9.93 | step: 61.45
+ 12%|█▏        | 83/700 [10:42<1:17:27,  7.53s/it]                                                  {'loss': 0.8626, 'learning_rate': 9.795683524144649e-05, 'epoch': 0.83}
+ 12%|█▏        | 83/700 [10:42<1:17:27,  7.53s/it][2024-06-18 22:17:36,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.06 | bwd_microstep: 1899.63 | bwd_inner_microstep: 1894.75 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 22:17:40,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:17:40,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1936.42 | bwd_microstep: 1848.54 | bwd_inner_microstep: 1842.98 | bwd_allreduce_microstep: 5.49 | step_microstep: 62.55
+[2024-06-18 22:17:40,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3897.44 | bwd: 3748.19 | bwd_inner: 3737.82 | bwd_allreduce: 10.22 | step: 62.63
+ 12%|█▏        | 84/700 [10:50<1:18:00,  7.60s/it]                                                  {'loss': 1.0958, 'learning_rate': 9.789086620939936e-05, 'epoch': 0.84}
+ 12%|█▏        | 84/700 [10:50<1:18:00,  7.60s/it][2024-06-18 22:17:44,501] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1885.72 | bwd_microstep: 1741.46 | bwd_inner_microstep: 1736.57 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:17:48,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:17:48,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1958.14 | bwd_microstep: 1887.20 | bwd_inner_microstep: 1881.86 | bwd_allreduce_microstep: 5.17 | step_microstep: 61.57
+[2024-06-18 22:17:48,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3843.83 | bwd: 3628.68 | bwd_inner: 3618.58 | bwd_allreduce: 9.88 | step: 61.66
+ 12%|█▏        | 85/700 [10:57<1:17:48,  7.59s/it]                                                  {'loss': 0.7304, 'learning_rate': 9.782387196905034e-05, 'epoch': 0.85}
+ 12%|█▏        | 85/700 [10:57<1:17:48,  7.59s/it][2024-06-18 22:17:52,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1990.22 | bwd_microstep: 1962.14 | bwd_inner_microstep: 1957.27 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.09
+[2024-06-18 22:17:56,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:17:56,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.72 | bwd_microstep: 1928.57 | bwd_inner_microstep: 1923.27 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.55
+[2024-06-18 22:17:56,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3960.90 | bwd: 3890.74 | bwd_inner: 3880.56 | bwd_allreduce: 10.04 | step: 61.64
+ 12%|█▏        | 86/700 [11:05<1:18:47,  7.70s/it]                                                  {'loss': 1.0366, 'learning_rate': 9.775585395455708e-05, 'epoch': 0.86}
+ 12%|█▏        | 86/700 [11:05<1:18:47,  7.70s/it][2024-06-18 22:17:59,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1541.32 | bwd_microstep: 1631.88 | bwd_inner_microstep: 1627.08 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 22:18:03,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:18:03,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.89 | bwd_microstep: 1940.68 | bwd_inner_microstep: 1935.37 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.76
+[2024-06-18 22:18:03,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3515.18 | bwd: 3572.58 | bwd_inner: 3562.50 | bwd_allreduce: 9.94 | step: 61.84
+ 12%|█▏        | 87/700 [11:13<1:17:06,  7.55s/it]                                                  {'loss': 0.5899, 'learning_rate': 9.76868136219933e-05, 'epoch': 0.87}
+ 12%|█▏        | 87/700 [11:13<1:17:06,  7.55s/it][2024-06-18 22:18:07,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1829.69 | bwd_microstep: 1640.67 | bwd_inner_microstep: 1635.86 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.08
+[2024-06-18 22:18:11,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:18:11,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.85 | bwd_microstep: 1926.04 | bwd_inner_microstep: 1920.71 | bwd_allreduce_microstep: 5.20 | step_microstep: 61.75
+[2024-06-18 22:18:11,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3797.51 | bwd: 3566.73 | bwd_inner: 3556.65 | bwd_allreduce: 9.95 | step: 61.83
+ 13%|█▎        | 88/700 [11:20<1:16:43,  7.52s/it]                                                  {'loss': 0.645, 'learning_rate': 9.761675244931772e-05, 'epoch': 0.88}
+ 13%|█▎        | 88/700 [11:20<1:16:43,  7.52s/it][2024-06-18 22:18:14,786] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1915.31 | bwd_microstep: 1812.63 | bwd_inner_microstep: 1807.78 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 22:18:18,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.93
+[2024-06-18 22:18:18,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1523.95 | bwd_microstep: 1880.32 | bwd_inner_microstep: 1874.61 | bwd_allreduce_microstep: 5.61 | step_microstep: 63.99
+[2024-06-18 22:18:18,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3439.23 | bwd: 3692.97 | bwd_inner: 3682.50 | bwd_allreduce: 10.28 | step: 64.07
+ 13%|█▎        | 89/700 [11:27<1:15:43,  7.44s/it]                                                  {'loss': 0.5779, 'learning_rate': 9.754567193634232e-05, 'epoch': 0.89}
+ 13%|█▎        | 89/700 [11:27<1:15:43,  7.44s/it][2024-06-18 22:18:22,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2147.13 | bwd_microstep: 1963.00 | bwd_inner_microstep: 1958.12 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 22:18:26,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.80
+[2024-06-18 22:18:26,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1878.95 | bwd_microstep: 1727.67 | bwd_inner_microstep: 1722.35 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.39
+[2024-06-18 22:18:26,093] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 4026.04 | bwd: 3690.69 | bwd_inner: 3680.54 | bwd_allreduce: 10.01 | step: 61.47
+ 13%|█▎        | 90/700 [11:35<1:16:46,  7.55s/it]                                                  {'loss': 0.6998, 'learning_rate': 9.747357360470033e-05, 'epoch': 0.9}
+ 13%|█▎        | 90/700 [11:35<1:16:46,  7.55s/it][2024-06-18 22:18:29,959] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1956.44 | bwd_microstep: 1887.93 | bwd_inner_microstep: 1883.12 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.08
+[2024-06-18 22:18:33,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:18:33,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.35 | bwd_microstep: 1938.35 | bwd_inner_microstep: 1933.06 | bwd_allreduce_microstep: 5.24 | step_microstep: 62.01
+[2024-06-18 22:18:33,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3930.75 | bwd: 3826.30 | bwd_inner: 3816.21 | bwd_allreduce: 9.97 | step: 62.10
+ 13%|█▎        | 91/700 [11:43<1:17:35,  7.64s/it]                                                  {'loss': 1.1801, 'learning_rate': 9.740045899781352e-05, 'epoch': 0.91}
+ 13%|█▎        | 91/700 [11:43<1:17:35,  7.64s/it][2024-06-18 22:18:37,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1960.51 | bwd_microstep: 1894.72 | bwd_inner_microstep: 1889.78 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.08
+[2024-06-18 22:18:41,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:18:41,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2001.31 | bwd_microstep: 1987.53 | bwd_inner_microstep: 1982.17 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.63
+[2024-06-18 22:18:41,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3961.79 | bwd: 3882.28 | bwd_inner: 3872.04 | bwd_allreduce: 10.05 | step: 61.71
+ 13%|█▎        | 92/700 [11:51<1:18:22,  7.73s/it]                                                  {'loss': 1.2694, 'learning_rate': 9.732632968085936e-05, 'epoch': 0.92}
+ 13%|█▎        | 92/700 [11:51<1:18:22,  7.73s/it][2024-06-18 22:18:43,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 726.86 | bwd_microstep: 821.88 | bwd_inner_microstep: 816.98 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.14
+[2024-06-18 22:18:46,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:18:46,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1527.07 | bwd_microstep: 1880.80 | bwd_inner_microstep: 1875.34 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.94
+[2024-06-18 22:18:46,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2253.90 | bwd: 2702.70 | bwd_inner: 2692.39 | bwd_allreduce: 10.12 | step: 62.09
+ 13%|█▎        | 93/700 [11:56<1:10:07,  6.93s/it]                                                  {'loss': 0.6501, 'learning_rate': 9.725118724073731e-05, 'epoch': 0.93}
+ 13%|█▎        | 93/700 [11:56<1:10:07,  6.93s/it][2024-06-18 22:18:50,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1957.54 | bwd_microstep: 1889.64 | bwd_inner_microstep: 1884.67 | bwd_allreduce_microstep: 4.84 | step_microstep: 0.07
+[2024-06-18 22:18:54,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:18:54,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1568.84 | bwd_microstep: 1675.80 | bwd_inner_microstep: 1670.44 | bwd_allreduce_microstep: 5.20 | step_microstep: 61.85
+[2024-06-18 22:18:54,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3526.35 | bwd: 3565.46 | bwd_inner: 3555.25 | bwd_allreduce: 10.04 | step: 61.93
+ 13%|█▎        | 94/700 [12:03<1:10:48,  7.01s/it]                                                  {'loss': 0.697, 'learning_rate': 9.717503328603498e-05, 'epoch': 0.94}
+ 13%|█▎        | 94/700 [12:03<1:10:48,  7.01s/it][2024-06-18 22:18:57,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1912.19 | bwd_microstep: 1804.68 | bwd_inner_microstep: 1799.63 | bwd_allreduce_microstep: 4.90 | step_microstep: 0.08
+[2024-06-18 22:19:01,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:19:01,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.29 | bwd_microstep: 1927.52 | bwd_inner_microstep: 1922.03 | bwd_allreduce_microstep: 5.33 | step_microstep: 61.71
+[2024-06-18 22:19:01,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3879.45 | bwd: 3732.22 | bwd_inner: 3721.78 | bwd_allreduce: 10.19 | step: 61.80
+ 14%|█▎        | 95/700 [12:11<1:12:48,  7.22s/it]                                                  {'loss': 0.5169, 'learning_rate': 9.709786944699364e-05, 'epoch': 0.95}
+ 14%|█▎        | 95/700 [12:11<1:12:48,  7.22s/it][2024-06-18 22:19:05,770] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1965.22 | bwd_microstep: 1919.03 | bwd_inner_microstep: 1914.04 | bwd_allreduce_microstep: 4.89 | step_microstep: 0.07
+[2024-06-18 22:19:09,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:19:09,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.69 | bwd_microstep: 1921.17 | bwd_inner_microstep: 1915.72 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.77
+[2024-06-18 22:19:09,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3933.89 | bwd: 3840.22 | bwd_inner: 3829.84 | bwd_allreduce: 10.18 | step: 61.85
+ 14%|█▎        | 96/700 [12:19<1:14:40,  7.42s/it]                                                  {'loss': 0.7606, 'learning_rate': 9.701969737547331e-05, 'epoch': 0.96}
+ 14%|█▎        | 96/700 [12:19<1:14:40,  7.42s/it][2024-06-18 22:19:13,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.96 | bwd_microstep: 1908.76 | bwd_inner_microstep: 1903.82 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.07
+[2024-06-18 22:19:17,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:19:17,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.67 | bwd_microstep: 1932.89 | bwd_inner_microstep: 1927.46 | bwd_allreduce_microstep: 5.29 | step_microstep: 61.47
+[2024-06-18 22:19:17,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3932.60 | bwd: 3841.68 | bwd_inner: 3831.38 | bwd_allreduce: 10.05 | step: 61.55
+ 14%|█▍        | 97/700 [12:27<1:15:55,  7.55s/it]                                                  {'loss': 1.0151, 'learning_rate': 9.694051874491748e-05, 'epoch': 0.97}
+ 14%|█▍        | 97/700 [12:27<1:15:55,  7.55s/it][2024-06-18 22:19:21,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1833.34 | bwd_microstep: 1926.46 | bwd_inner_microstep: 1921.26 | bwd_allreduce_microstep: 5.06 | step_microstep: 0.09
+[2024-06-18 22:19:25,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:19:25,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1912.11 | bwd_microstep: 1808.58 | bwd_inner_microstep: 1803.25 | bwd_allreduce_microstep: 5.25 | step_microstep: 62.05
+[2024-06-18 22:19:25,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3745.42 | bwd: 3735.07 | bwd_inner: 3724.58 | bwd_allreduce: 10.29 | step: 62.15
+ 14%|█▍        | 98/700 [12:34<1:15:53,  7.56s/it]                                                  {'loss': 0.8041, 'learning_rate': 9.686033525031719e-05, 'epoch': 0.98}
+ 14%|█▍        | 98/700 [12:34<1:15:53,  7.56s/it][2024-06-18 22:19:28,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1881.22 | bwd_microstep: 1730.62 | bwd_inner_microstep: 1725.68 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.07
+[2024-06-18 22:19:32,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:19:32,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1651.81 | bwd_microstep: 1841.36 | bwd_inner_microstep: 1835.91 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.50
+[2024-06-18 22:19:32,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3533.00 | bwd: 3572.00 | bwd_inner: 3561.69 | bwd_allreduce: 10.07 | step: 61.59
+ 14%|█▍        | 99/700 [12:41<1:14:41,  7.46s/it]                                                  {'loss': 0.8203, 'learning_rate': 9.677914860817476e-05, 'epoch': 0.99}
+ 14%|█▍        | 99/700 [12:41<1:14:41,  7.46s/it][2024-06-18 22:19:36,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.12 | bwd_microstep: 1924.25 | bwd_inner_microstep: 1919.41 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.08
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+[2024-06-18 22:19:41,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:19:41,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.78 | bwd_microstep: 1916.43 | bwd_inner_microstep: 1910.94 | bwd_allreduce_microstep: 5.35 | step_microstep: 62.26
+[2024-06-18 22:19:41,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3930.84 | bwd: 3840.71 | bwd_inner: 3830.45 | bwd_allreduce: 10.03 | step: 62.34
+ 14%|█▍        | 100/700 [12:50<1:18:10,  7.82s/it]                                                   {'loss': 1.0445, 'learning_rate': 9.669696055646713e-05, 'epoch': 1.0}
+ 14%|█▍        | 100/700 [12:50<1:18:10,  7.82s/it][2024-06-18 22:19:43,756] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:19:49,529] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:19:55,336] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:20:01,093] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:20:08,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1991.80 | bwd_microstep: 1944.86 | bwd_inner_microstep: 1939.88 | bwd_allreduce_microstep: 4.85 | step_microstep: 0.08
+[2024-06-18 22:20:11,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 22:20:11,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1408.10 | bwd_microstep: 1645.99 | bwd_inner_microstep: 1640.54 | bwd_allreduce_microstep: 5.30 | step_microstep: 62.24
+[2024-06-18 22:20:11,586] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3399.84 | bwd: 3590.88 | bwd_inner: 3580.54 | bwd_allreduce: 10.14 | step: 62.32
+ 14%|█▍        | 101/700 [13:21<2:26:02, 14.63s/it]                                                   {'loss': 1.0269, 'learning_rate': 9.661377285460855e-05, 'epoch': 1.01}
+ 14%|█▍        | 101/700 [13:21<2:26:02, 14.63s/it][2024-06-18 22:20:15,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2044.33 | bwd_microstep: 2094.22 | bwd_inner_microstep: 2089.31 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 22:20:19,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:20:19,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1956.89 | bwd_microstep: 1888.36 | bwd_inner_microstep: 1882.89 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.49
+[2024-06-18 22:20:19,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 4001.19 | bwd: 3982.60 | bwd_inner: 3972.31 | bwd_allreduce: 10.02 | step: 61.57
+ 15%|█▍        | 102/700 [13:29<2:06:14, 12.67s/it]                                                   {'loss': 1.1789, 'learning_rate': 9.652958728341296e-05, 'epoch': 1.02}
+ 15%|█▍        | 102/700 [13:29<2:06:14, 12.67s/it][2024-06-18 22:20:23,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1900.86 | bwd_microstep: 1801.79 | bwd_inner_microstep: 1796.91 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 22:20:27,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:20:27,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1999.08 | bwd_microstep: 1983.91 | bwd_inner_microstep: 1978.58 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.75
+[2024-06-18 22:20:27,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3899.91 | bwd: 3785.73 | bwd_inner: 3775.56 | bwd_allreduce: 9.98 | step: 61.84
+ 15%|█▍        | 103/700 [13:36<1:51:27, 11.20s/it]                                                   {'loss': 0.5468, 'learning_rate': 9.644440564505588e-05, 'epoch': 1.03}
+ 15%|█▍        | 103/700 [13:36<1:51:27, 11.20s/it][2024-06-18 22:20:31,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.20 | bwd_microstep: 1917.21 | bwd_inner_microstep: 1912.27 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.08
+[2024-06-18 22:20:34,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:20:34,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1855.09 | bwd_microstep: 1694.52 | bwd_inner_microstep: 1689.10 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.84
+[2024-06-18 22:20:34,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3821.26 | bwd: 3611.76 | bwd_inner: 3601.48 | bwd_allreduce: 10.04 | step: 61.93
+ 15%|█▍        | 104/700 [13:44<1:40:20, 10.10s/it]                                                   {'loss': 0.5322, 'learning_rate': 9.635822976303581e-05, 'epoch': 1.04}
+ 15%|█▍        | 104/700 [13:44<1:40:20, 10.10s/it][2024-06-18 22:20:38,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1908.11 | bwd_microstep: 1804.68 | bwd_inner_microstep: 1799.74 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.07
+[2024-06-18 22:20:42,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:20:42,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.97 | bwd_microstep: 1914.06 | bwd_inner_microstep: 1908.66 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.91
+[2024-06-18 22:20:42,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3872.05 | bwd: 3718.76 | bwd_inner: 3708.47 | bwd_allreduce: 10.10 | step: 62.00
+ 15%|█▌        | 105/700 [13:52<1:33:00,  9.38s/it]                                                   {'loss': 0.4772, 'learning_rate': 9.627106148213522e-05, 'epoch': 1.05}
+ 15%|█▌        | 105/700 [13:52<1:33:00,  9.38s/it][2024-06-18 22:20:45,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1541.55 | bwd_microstep: 1634.95 | bwd_inner_microstep: 1630.06 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.08
+[2024-06-18 22:20:49,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:20:49,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.28 | bwd_microstep: 1932.23 | bwd_inner_microstep: 1926.89 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.50
+[2024-06-18 22:20:49,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3511.80 | bwd: 3567.20 | bwd_inner: 3557.00 | bwd_allreduce: 10.04 | step: 61.58
+ 15%|█▌        | 106/700 [13:59<1:26:19,  8.72s/it]                                                   {'loss': 0.5595, 'learning_rate': 9.6182902668381e-05, 'epoch': 1.06}
+ 15%|█▌        | 106/700 [13:59<1:26:19,  8.72s/it][2024-06-18 22:20:52,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1393.35 | bwd_microstep: 1611.28 | bwd_inner_microstep: 1606.20 | bwd_allreduce_microstep: 4.99 | step_microstep: 0.09
+[2024-06-18 22:20:56,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:20:56,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1751.94 | bwd_microstep: 1772.08 | bwd_inner_microstep: 1766.78 | bwd_allreduce_microstep: 5.21 | step_microstep: 62.08
+[2024-06-18 22:20:56,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3145.26 | bwd: 3383.38 | bwd_inner: 3373.03 | bwd_allreduce: 10.21 | step: 62.18
+ 15%|█▌        | 107/700 [14:05<1:19:59,  8.09s/it]                                                   {'loss': 0.593, 'learning_rate': 9.609375520900459e-05, 'epoch': 1.07}
+ 15%|█▌        | 107/700 [14:05<1:19:59,  8.09s/it][2024-06-18 22:20:59,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1419.49 | bwd_microstep: 1659.22 | bwd_inner_microstep: 1654.40 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 22:21:03,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:21:03,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1887.16 | bwd_microstep: 1741.78 | bwd_inner_microstep: 1736.33 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.71
+[2024-06-18 22:21:03,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3306.62 | bwd: 3401.02 | bwd_inner: 3390.83 | bwd_allreduce: 10.03 | step: 61.79
+ 15%|█▌        | 108/700 [14:12<1:16:02,  7.71s/it]                                                   {'loss': 0.5759, 'learning_rate': 9.600362101240152e-05, 'epoch': 1.08}
+ 15%|█▌        | 108/700 [14:12<1:16:02,  7.71s/it][2024-06-18 22:21:06,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1887.43 | bwd_microstep: 1741.50 | bwd_inner_microstep: 1736.60 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.08
+[2024-06-18 22:21:10,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:21:10,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1958.43 | bwd_microstep: 1887.65 | bwd_inner_microstep: 1882.35 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.61
+[2024-06-18 22:21:10,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3845.83 | bwd: 3629.18 | bwd_inner: 3619.00 | bwd_allreduce: 10.02 | step: 61.69
+ 16%|█▌        | 109/700 [14:20<1:15:32,  7.67s/it]                                                   {'loss': 0.5372, 'learning_rate': 9.59125020080906e-05, 'epoch': 1.09}
+ 16%|█▌        | 109/700 [14:20<1:15:32,  7.67s/it][2024-06-18 22:21:14,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.20 | bwd_microstep: 1905.89 | bwd_inner_microstep: 1900.92 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.08
+[2024-06-18 22:21:18,851] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 22:21:18,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2006.76 | bwd_microstep: 1989.09 | bwd_inner_microstep: 1983.46 | bwd_allreduce_microstep: 5.48 | step_microstep: 62.43
+[2024-06-18 22:21:18,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3969.93 | bwd: 3895.00 | bwd_inner: 3884.50 | bwd_allreduce: 10.26 | step: 62.51
+ 16%|█▌        | 110/700 [14:28<1:16:17,  7.76s/it]                                                   {'loss': 1.1394, 'learning_rate': 9.582040014667258e-05, 'epoch': 1.1}
+ 16%|█▌        | 110/700 [14:28<1:16:17,  7.76s/it][2024-06-18 22:21:22,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.58 | bwd_microstep: 1923.10 | bwd_inner_microstep: 1918.26 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 22:21:26,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:21:26,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.82 | bwd_microstep: 1906.38 | bwd_inner_microstep: 1900.74 | bwd_allreduce_microstep: 5.49 | step_microstep: 62.18
+[2024-06-18 22:21:26,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3931.38 | bwd: 3829.50 | bwd_inner: 3819.07 | bwd_allreduce: 10.23 | step: 62.26
+ 16%|█▌        | 111/700 [14:36<1:16:28,  7.79s/it]                                                   {'loss': 0.8768, 'learning_rate': 9.572731739978839e-05, 'epoch': 1.11}
+ 16%|█▌        | 111/700 [14:36<1:16:28,  7.79s/it][2024-06-18 22:21:30,605] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.54 | bwd_microstep: 1904.77 | bwd_inner_microstep: 1899.78 | bwd_allreduce_microstep: 4.91 | step_microstep: 0.09
+[2024-06-18 22:21:34,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:21:34,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.63 | bwd_microstep: 1926.29 | bwd_inner_microstep: 1920.75 | bwd_allreduce_microstep: 5.39 | step_microstep: 62.19
+[2024-06-18 22:21:34,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3936.14 | bwd: 3831.08 | bwd_inner: 3820.60 | bwd_allreduce: 10.29 | step: 62.29
+ 16%|█▌        | 112/700 [14:44<1:16:34,  7.81s/it]                                                   {'loss': 0.9546, 'learning_rate': 9.563325576007701e-05, 'epoch': 1.12}
+ 16%|█▌        | 112/700 [14:44<1:16:34,  7.81s/it][2024-06-18 22:21:38,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1912.37 | bwd_microstep: 1811.06 | bwd_inner_microstep: 1806.12 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.07
+[2024-06-18 22:21:41,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:21:41,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1596.05 | bwd_microstep: 1738.80 | bwd_inner_microstep: 1733.44 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.62
+[2024-06-18 22:21:41,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3508.39 | bwd: 3549.89 | bwd_inner: 3539.62 | bwd_allreduce: 10.06 | step: 61.70
+ 16%|█▌        | 113/700 [14:51<1:14:31,  7.62s/it]                                                   {'loss': 0.0215, 'learning_rate': 9.553821724113268e-05, 'epoch': 1.13}
+ 16%|█▌        | 113/700 [14:51<1:14:31,  7.62s/it][2024-06-18 22:21:45,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1856.10 | bwd_microstep: 1693.79 | bwd_inner_microstep: 1688.92 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.08
+[2024-06-18 22:21:48,880] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:21:48,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1647.15 | bwd_microstep: 1836.28 | bwd_inner_microstep: 1830.94 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.77
+[2024-06-18 22:21:48,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3503.22 | bwd: 3530.10 | bwd_inner: 3519.88 | bwd_allreduce: 10.10 | step: 61.86
+ 16%|█▋        | 114/700 [14:58<1:12:59,  7.47s/it]                                                   {'loss': 0.4819, 'learning_rate': 9.544220387746192e-05, 'epoch': 1.14}
+ 16%|█▋        | 114/700 [14:58<1:12:59,  7.47s/it][2024-06-18 22:21:52,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1698.30 | bwd_microstep: 1663.65 | bwd_inner_microstep: 1658.69 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.07
+[2024-06-18 22:21:56,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.93
+[2024-06-18 22:21:56,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.91 | bwd_microstep: 1941.01 | bwd_inner_microstep: 1935.35 | bwd_allreduce_microstep: 5.57 | step_microstep: 64.82
+[2024-06-18 22:21:56,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3678.18 | bwd: 3604.68 | bwd_inner: 3594.10 | bwd_allreduce: 10.39 | step: 64.91
+ 16%|█▋        | 115/700 [15:05<1:12:36,  7.45s/it]                                                   {'loss': 0.4828, 'learning_rate': 9.534521772443988e-05, 'epoch': 1.15}
+ 16%|█▋        | 115/700 [15:05<1:12:36,  7.45s/it][2024-06-18 22:21:59,482] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1546.47 | bwd_microstep: 1644.13 | bwd_inner_microstep: 1639.34 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:22:03,461] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:22:03,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.29 | bwd_microstep: 1925.80 | bwd_inner_microstep: 1920.40 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.70
+[2024-06-18 22:22:03,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3519.73 | bwd: 3569.96 | bwd_inner: 3559.84 | bwd_allreduce: 9.95 | step: 61.78
+ 17%|█▋        | 116/700 [15:12<1:11:44,  7.37s/it]                                                   {'loss': 1.0581, 'learning_rate': 9.524726085826644e-05, 'epoch': 1.16}
+ 17%|█▋        | 116/700 [15:12<1:11:44,  7.37s/it][2024-06-18 22:22:07,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.28 | bwd_microstep: 1894.47 | bwd_inner_microstep: 1889.54 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.07
+[2024-06-18 22:22:10,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:22:10,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1530.31 | bwd_microstep: 1881.45 | bwd_inner_microstep: 1875.99 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.77
+[2024-06-18 22:22:10,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3492.56 | bwd: 3775.95 | bwd_inner: 3765.64 | bwd_allreduce: 10.06 | step: 61.85
+ 17%|█▋        | 117/700 [15:20<1:11:37,  7.37s/it]                                                   {'loss': 0.9637, 'learning_rate': 9.514833537592166e-05, 'epoch': 1.17}
+ 17%|█▋        | 117/700 [15:20<1:11:37,  7.37s/it][2024-06-18 22:22:14,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1440.40 | bwd_microstep: 1718.95 | bwd_inner_microstep: 1714.16 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:22:18,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:22:18,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2007.01 | bwd_microstep: 1982.65 | bwd_inner_microstep: 1977.33 | bwd_allreduce_microstep: 5.19 | step_microstep: 61.65
+[2024-06-18 22:22:18,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3447.38 | bwd: 3701.62 | bwd_inner: 3691.58 | bwd_allreduce: 9.90 | step: 61.73
+ 17%|█▋        | 118/700 [15:27<1:11:08,  7.33s/it]                                                   {'loss': 0.5552, 'learning_rate': 9.504844339512095e-05, 'epoch': 1.18}
+ 17%|█▋        | 118/700 [15:27<1:11:08,  7.33s/it][2024-06-18 22:22:21,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1911.21 | bwd_microstep: 1808.52 | bwd_inner_microstep: 1803.57 | bwd_allreduce_microstep: 4.86 | step_microstep: 0.14
+[2024-06-18 22:22:25,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:22:25,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1707.62 | bwd_microstep: 1683.27 | bwd_inner_microstep: 1677.92 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.77
+[2024-06-18 22:22:25,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3618.81 | bwd: 3491.81 | bwd_inner: 3481.53 | bwd_allreduce: 10.13 | step: 61.92
+ 17%|█▋        | 119/700 [15:34<1:10:40,  7.30s/it]                                                   {'loss': 0.5657, 'learning_rate': 9.494758705426978e-05, 'epoch': 1.19}
+ 17%|█▋        | 119/700 [15:34<1:10:40,  7.30s/it][2024-06-18 22:22:29,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.83 | bwd_microstep: 1928.05 | bwd_inner_microstep: 1923.10 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.07
+[2024-06-18 22:22:33,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:22:33,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.58 | bwd_microstep: 1892.37 | bwd_inner_microstep: 1886.93 | bwd_allreduce_microstep: 5.29 | step_microstep: 61.75
+[2024-06-18 22:22:33,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3942.38 | bwd: 3820.44 | bwd_inner: 3810.14 | bwd_allreduce: 10.06 | step: 61.83
+ 17%|█▋        | 120/700 [15:42<1:12:11,  7.47s/it]                                                   {'loss': 1.1438, 'learning_rate': 9.484576851241773e-05, 'epoch': 1.2}
+ 17%|█▋        | 120/700 [15:42<1:12:11,  7.47s/it][2024-06-18 22:22:36,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1882.08 | bwd_microstep: 1730.92 | bwd_inner_microstep: 1726.02 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.09
+[2024-06-18 22:22:40,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:22:40,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2001.00 | bwd_microstep: 1968.46 | bwd_inner_microstep: 1963.00 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.77
+[2024-06-18 22:22:40,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3883.05 | bwd: 3699.40 | bwd_inner: 3689.09 | bwd_allreduce: 10.11 | step: 61.87
+ 17%|█▋        | 121/700 [15:50<1:12:41,  7.53s/it]                                                   {'loss': 0.5918, 'learning_rate': 9.474298994921251e-05, 'epoch': 1.21}
+ 17%|█▋        | 121/700 [15:50<1:12:41,  7.53s/it][2024-06-18 22:22:44,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1858.77 | bwd_microstep: 1695.24 | bwd_inner_microstep: 1690.35 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.07
+[2024-06-18 22:22:48,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:22:48,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1890.42 | bwd_microstep: 1740.84 | bwd_inner_microstep: 1735.35 | bwd_allreduce_microstep: 5.41 | step_microstep: 61.89
+[2024-06-18 22:22:48,133] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3749.15 | bwd: 3436.11 | bwd_inner: 3425.77 | bwd_allreduce: 10.15 | step: 61.98
+ 17%|█▋        | 122/700 [15:57<1:11:51,  7.46s/it]                                                   {'loss': 0.083, 'learning_rate': 9.463925356485313e-05, 'epoch': 1.22}
+ 17%|█▋        | 122/700 [15:57<1:11:51,  7.46s/it][2024-06-18 22:22:51,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1593.49 | bwd_microstep: 1737.25 | bwd_inner_microstep: 1732.40 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 22:22:55,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:22:55,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.23 | bwd_microstep: 1934.21 | bwd_inner_microstep: 1928.76 | bwd_allreduce_microstep: 5.37 | step_microstep: 61.80
+[2024-06-18 22:22:55,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3572.68 | bwd: 3671.48 | bwd_inner: 3661.20 | bwd_allreduce: 10.15 | step: 61.89
+ 18%|█▊        | 123/700 [16:04<1:11:24,  7.43s/it]                                                   {'loss': 0.5083, 'learning_rate': 9.45345615800428e-05, 'epoch': 1.23}
+ 18%|█▊        | 123/700 [16:04<1:11:24,  7.43s/it][2024-06-18 22:22:58,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1276.86 | bwd_microstep: 1379.01 | bwd_inner_microstep: 1374.21 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 22:23:01,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.93
+[2024-06-18 22:23:01,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1653.74 | bwd_microstep: 1850.90 | bwd_inner_microstep: 1845.25 | bwd_allreduce_microstep: 5.56 | step_microstep: 64.30
+[2024-06-18 22:23:01,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2930.57 | bwd: 3229.93 | bwd_inner: 3219.50 | bwd_allreduce: 10.28 | step: 64.38
+ 18%|█▊        | 124/700 [16:11<1:07:56,  7.08s/it]                                                   {'loss': 1.0781, 'learning_rate': 9.442891623594153e-05, 'epoch': 1.24}
+ 18%|█▊        | 124/700 [16:11<1:07:56,  7.08s/it][2024-06-18 22:23:05,396] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1890.33 | bwd_microstep: 1740.99 | bwd_inner_microstep: 1736.14 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.08
+[2024-06-18 22:23:09,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:23:09,460] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2008.06 | bwd_microstep: 1976.00 | bwd_inner_microstep: 1970.44 | bwd_allreduce_microstep: 5.41 | step_microstep: 61.90
+[2024-06-18 22:23:09,461] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3898.36 | bwd: 3717.01 | bwd_inner: 3706.65 | bwd_allreduce: 10.17 | step: 61.99
+ 18%|█▊        | 125/700 [16:18<1:09:39,  7.27s/it]                                                   {'loss': 0.6055, 'learning_rate': 9.432231979411798e-05, 'epoch': 1.25}
+ 18%|█▊        | 125/700 [16:18<1:09:39,  7.27s/it][2024-06-18 22:23:12,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1651.33 | bwd_microstep: 1859.03 | bwd_inner_microstep: 1854.18 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.08
+[2024-06-18 22:23:16,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:23:16,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.80 | bwd_microstep: 1911.57 | bwd_inner_microstep: 1906.15 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.83
+[2024-06-18 22:23:16,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3624.09 | bwd: 3770.62 | bwd_inner: 3760.40 | bwd_allreduce: 10.08 | step: 61.91
+ 18%|█▊        | 126/700 [16:26<1:10:11,  7.34s/it]                                                   {'loss': 1.1048, 'learning_rate': 9.421477453650118e-05, 'epoch': 1.26}
+ 18%|█▊        | 126/700 [16:26<1:10:11,  7.34s/it][2024-06-18 22:23:19,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1080.57 | bwd_microstep: 1273.33 | bwd_inner_microstep: 1268.47 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 22:23:22,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:23:22,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1496.64 | bwd_microstep: 1806.37 | bwd_inner_microstep: 1800.98 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.61
+[2024-06-18 22:23:22,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2577.16 | bwd: 3079.72 | bwd_inner: 3069.52 | bwd_allreduce: 10.05 | step: 61.69
+ 18%|█▊        | 127/700 [16:32<1:05:32,  6.86s/it]                                                   {'loss': 0.4045, 'learning_rate': 9.410628276533163e-05, 'epoch': 1.27}
+ 18%|█▊        | 127/700 [16:32<1:05:32,  6.86s/it][2024-06-18 22:23:26,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1957.58 | bwd_microstep: 1894.35 | bwd_inner_microstep: 1889.28 | bwd_allreduce_microstep: 4.94 | step_microstep: 0.14
+[2024-06-18 22:23:30,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:23:30,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.36 | bwd_microstep: 1927.06 | bwd_inner_microstep: 1921.54 | bwd_allreduce_microstep: 5.37 | step_microstep: 61.74
+[2024-06-18 22:23:30,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3932.91 | bwd: 3821.43 | bwd_inner: 3810.93 | bwd_allreduce: 10.29 | step: 61.89
+ 18%|█▊        | 128/700 [16:40<1:08:16,  7.16s/it]                                                   {'loss': 0.8653, 'learning_rate': 9.399684680311196e-05, 'epoch': 1.28}
+ 18%|█▊        | 128/700 [16:40<1:08:16,  7.16s/it][2024-06-18 22:23:34,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.33 | bwd_microstep: 1922.70 | bwd_inner_microstep: 1917.93 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.08
+[2024-06-18 22:23:38,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:23:38,299] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1918.88 | bwd_microstep: 1812.48 | bwd_inner_microstep: 1807.03 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.73
+[2024-06-18 22:23:38,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3890.18 | bwd: 3735.21 | bwd_inner: 3725.02 | bwd_allreduce: 10.06 | step: 61.81
+ 18%|█▊        | 129/700 [16:47<1:09:46,  7.33s/it]                                                   {'loss': 0.3902, 'learning_rate': 9.388646899255733e-05, 'epoch': 1.29}
+ 18%|█▊        | 129/700 [16:47<1:09:46,  7.33s/it][2024-06-18 22:23:42,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1978.37 | bwd_microstep: 1941.61 | bwd_inner_microstep: 1936.89 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.08
+[2024-06-18 22:23:46,276] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:23:46,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.70 | bwd_microstep: 1959.80 | bwd_inner_microstep: 1954.48 | bwd_allreduce_microstep: 5.23 | step_microstep: 62.16
+[2024-06-18 22:23:46,277] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3974.04 | bwd: 3901.44 | bwd_inner: 3891.40 | bwd_allreduce: 9.93 | step: 62.25
+ 19%|█▊        | 130/700 [16:55<1:11:29,  7.53s/it]                                                   {'loss': 1.0075, 'learning_rate': 9.377515169654518e-05, 'epoch': 1.3}
+ 19%|█▊        | 130/700 [16:55<1:11:29,  7.53s/it][2024-06-18 22:23:49,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1892.27 | bwd_microstep: 1747.30 | bwd_inner_microstep: 1742.51 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.08
+[2024-06-18 22:23:53,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:23:53,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1999.00 | bwd_microstep: 1956.97 | bwd_inner_microstep: 1951.55 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.37
+[2024-06-18 22:23:53,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3891.23 | bwd: 3704.30 | bwd_inner: 3694.13 | bwd_allreduce: 9.98 | step: 61.45
+ 19%|█▊        | 131/700 [17:03<1:11:51,  7.58s/it]                                                   {'loss': 0.5589, 'learning_rate': 9.366289729806468e-05, 'epoch': 1.31}
+ 19%|█▊        | 131/700 [17:03<1:11:51,  7.58s/it][2024-06-18 22:23:57,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.29 | bwd_microstep: 1919.62 | bwd_inner_microstep: 1914.61 | bwd_allreduce_microstep: 4.92 | step_microstep: 0.08
+[2024-06-18 22:24:01,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:24:01,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1890.42 | bwd_microstep: 1739.93 | bwd_inner_microstep: 1734.50 | bwd_allreduce_microstep: 5.34 | step_microstep: 61.93
+[2024-06-18 22:24:01,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3860.67 | bwd: 3659.57 | bwd_inner: 3649.15 | bwd_allreduce: 10.28 | step: 62.02
+ 19%|█▉        | 132/700 [17:11<1:11:51,  7.59s/it]                                                   {'loss': 0.2421, 'learning_rate': 9.354970820016576e-05, 'epoch': 1.32}
+ 19%|█▉        | 132/700 [17:11<1:11:51,  7.59s/it][2024-06-18 22:24:04,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1494.21 | bwd_microstep: 1809.22 | bwd_inner_microstep: 1804.12 | bwd_allreduce_microstep: 5.02 | step_microstep: 0.10
+[2024-06-18 22:24:08,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 22:24:08,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1896.46 | bwd_microstep: 1746.58 | bwd_inner_microstep: 1741.06 | bwd_allreduce_microstep: 5.37 | step_microstep: 61.97
+[2024-06-18 22:24:08,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3390.64 | bwd: 3555.82 | bwd_inner: 3545.25 | bwd_allreduce: 10.38 | step: 62.07
+ 19%|█▉        | 133/700 [17:18<1:10:11,  7.43s/it]                                                   {'loss': 0.6159, 'learning_rate': 9.343558682590756e-05, 'epoch': 1.33}
+ 19%|█▉        | 133/700 [17:18<1:10:11,  7.43s/it][2024-06-18 22:24:12,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1910.92 | bwd_microstep: 1808.23 | bwd_inner_microstep: 1803.32 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.08
+[2024-06-18 22:24:16,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:24:16,348] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.34 | bwd_microstep: 1912.22 | bwd_inner_microstep: 1906.84 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.80
+[2024-06-18 22:24:16,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3879.23 | bwd: 3720.48 | bwd_inner: 3710.25 | bwd_allreduce: 10.03 | step: 61.88
+ 19%|█▉        | 134/700 [17:25<1:10:50,  7.51s/it]                                                   {'loss': 0.5952, 'learning_rate': 9.332053561830669e-05, 'epoch': 1.34}
+ 19%|█▉        | 134/700 [17:25<1:10:50,  7.51s/it][2024-06-18 22:24:19,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1682.64 | bwd_microstep: 1908.02 | bwd_inner_microstep: 1903.09 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.07
+[2024-06-18 22:24:23,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:24:23,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1505.59 | bwd_microstep: 1841.03 | bwd_inner_microstep: 1835.44 | bwd_allreduce_microstep: 5.44 | step_microstep: 62.60
+[2024-06-18 22:24:23,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3188.19 | bwd: 3749.07 | bwd_inner: 3738.64 | bwd_allreduce: 10.18 | step: 62.68
+ 19%|█▉        | 135/700 [17:32<1:09:23,  7.37s/it]                                                   {'loss': 1.2189, 'learning_rate': 9.320455704028481e-05, 'epoch': 1.35}
+ 19%|█▉        | 135/700 [17:32<1:09:23,  7.37s/it][2024-06-18 22:24:27,135] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1915.31 | bwd_microstep: 1809.06 | bwd_inner_microstep: 1804.30 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 22:24:31,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.91
+[2024-06-18 22:24:31,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.43 | bwd_microstep: 1901.08 | bwd_inner_microstep: 1895.48 | bwd_allreduce_microstep: 5.45 | step_microstep: 62.52
+[2024-06-18 22:24:31,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3882.71 | bwd: 3710.16 | bwd_inner: 3699.84 | bwd_allreduce: 10.17 | step: 62.61
+ 19%|█▉        | 136/700 [17:40<1:10:11,  7.47s/it]                                                   {'loss': 0.3588, 'learning_rate': 9.308765357461604e-05, 'epoch': 1.36}
+ 19%|█▉        | 136/700 [17:40<1:10:11,  7.47s/it][2024-06-18 22:24:34,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.96 | bwd_microstep: 1915.18 | bwd_inner_microstep: 1910.38 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 22:24:38,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:24:38,704] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1892.43 | bwd_microstep: 1742.02 | bwd_inner_microstep: 1736.52 | bwd_allreduce_microstep: 5.42 | step_microstep: 62.64
+[2024-06-18 22:24:38,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3860.36 | bwd: 3657.23 | bwd_inner: 3646.94 | bwd_allreduce: 10.16 | step: 62.73
+ 20%|█▉        | 137/700 [17:48<1:10:29,  7.51s/it]                                                   {'loss': 0.37, 'learning_rate': 9.296982772387365e-05, 'epoch': 1.37}
+ 20%|█▉        | 137/700 [17:48<1:10:29,  7.51s/it][2024-06-18 22:24:42,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1536.07 | bwd_microstep: 1899.47 | bwd_inner_microstep: 1894.53 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.07
+[2024-06-18 22:24:46,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:24:46,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2012.72 | bwd_microstep: 1992.73 | bwd_inner_microstep: 1987.24 | bwd_allreduce_microstep: 5.35 | step_microstep: 62.15
+[2024-06-18 22:24:46,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3548.75 | bwd: 3892.22 | bwd_inner: 3881.88 | bwd_allreduce: 10.12 | step: 62.23
+ 20%|█▉        | 138/700 [17:55<1:10:27,  7.52s/it]                                                   {'loss': 1.0717, 'learning_rate': 9.285108201037662e-05, 'epoch': 1.38}
+ 20%|█▉        | 138/700 [17:55<1:10:27,  7.52s/it][2024-06-18 22:24:50,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.96 | bwd_microstep: 1905.42 | bwd_inner_microstep: 1900.48 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.08
+[2024-06-18 22:24:53,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:24:53,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1539.89 | bwd_microstep: 1901.74 | bwd_inner_microstep: 1896.26 | bwd_allreduce_microstep: 5.34 | step_microstep: 61.92
+[2024-06-18 22:24:53,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3510.82 | bwd: 3807.18 | bwd_inner: 3796.84 | bwd_allreduce: 10.10 | step: 62.00
+ 20%|█▉        | 139/700 [18:03<1:10:02,  7.49s/it]                                                   {'loss': 1.3066, 'learning_rate': 9.27314189761356e-05, 'epoch': 1.39}
+ 20%|█▉        | 139/700 [18:03<1:10:02,  7.49s/it][2024-06-18 22:24:57,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.73 | bwd_microstep: 1921.09 | bwd_inner_microstep: 1916.16 | bwd_allreduce_microstep: 4.85 | step_microstep: 0.08
+[2024-06-18 22:25:01,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:25:01,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1763.11 | bwd_microstep: 1792.36 | bwd_inner_microstep: 1786.96 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.51
+[2024-06-18 22:25:01,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3729.81 | bwd: 3713.47 | bwd_inner: 3703.16 | bwd_allreduce: 10.18 | step: 61.60
+ 20%|██        | 140/700 [18:10<1:10:04,  7.51s/it]                                                   {'loss': 1.1335, 'learning_rate': 9.261084118279847e-05, 'epoch': 1.4}
+ 20%|██        | 140/700 [18:10<1:10:04,  7.51s/it][2024-06-18 22:25:04,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1390.11 | bwd_microstep: 1609.52 | bwd_inner_microstep: 1604.66 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.07
+[2024-06-18 22:25:08,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:25:08,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.34 | bwd_microstep: 1918.83 | bwd_inner_microstep: 1913.37 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.44
+[2024-06-18 22:25:08,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3360.42 | bwd: 3528.37 | bwd_inner: 3518.11 | bwd_allreduce: 10.07 | step: 61.52
+ 20%|██        | 141/700 [18:17<1:08:29,  7.35s/it]                                                   {'loss': 0.5148, 'learning_rate': 9.24893512115955e-05, 'epoch': 1.41}
+ 20%|██        | 141/700 [18:17<1:08:29,  7.35s/it][2024-06-18 22:25:12,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1997.14 | bwd_microstep: 1961.75 | bwd_inner_microstep: 1956.56 | bwd_allreduce_microstep: 5.10 | step_microstep: 0.10
+[2024-06-18 22:25:16,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:25:16,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.58 | bwd_microstep: 1923.07 | bwd_inner_microstep: 1917.73 | bwd_allreduce_microstep: 5.27 | step_microstep: 62.35
+[2024-06-18 22:25:16,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3972.68 | bwd: 3884.84 | bwd_inner: 3874.33 | bwd_allreduce: 10.37 | step: 62.46
+ 20%|██        | 142/700 [18:25<1:10:04,  7.54s/it]                                                   {'loss': 1.0804, 'learning_rate': 9.236695166328419e-05, 'epoch': 1.42}
+ 20%|██        | 142/700 [18:25<1:10:04,  7.54s/it][2024-06-18 22:25:19,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1523.13 | bwd_microstep: 1873.77 | bwd_inner_microstep: 1868.99 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 22:25:23,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.94
+[2024-06-18 22:25:23,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1977.25 | bwd_microstep: 1927.83 | bwd_inner_microstep: 1922.28 | bwd_allreduce_microstep: 5.47 | step_microstep: 63.06
+[2024-06-18 22:25:23,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3500.36 | bwd: 3801.62 | bwd_inner: 3791.30 | bwd_allreduce: 10.19 | step: 63.14
+ 20%|██        | 143/700 [18:33<1:09:35,  7.50s/it]                                                   {'loss': 1.2198, 'learning_rate': 9.224364515809343e-05, 'epoch': 1.43}
+ 20%|██        | 143/700 [18:33<1:09:35,  7.50s/it][2024-06-18 22:25:27,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1956.15 | bwd_microstep: 1883.03 | bwd_inner_microstep: 1878.20 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.08
+[2024-06-18 22:25:30,862] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:25:30,862] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1505.88 | bwd_microstep: 1842.88 | bwd_inner_microstep: 1837.55 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.62
+[2024-06-18 22:25:30,863] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3462.00 | bwd: 3725.93 | bwd_inner: 3715.80 | bwd_allreduce: 9.99 | step: 61.71
+ 21%|██        | 144/700 [18:40<1:08:53,  7.43s/it]                                                   {'loss': 1.1472, 'learning_rate': 9.211943433566755e-05, 'epoch': 1.44}
+ 21%|██        | 144/700 [18:40<1:08:53,  7.43s/it][2024-06-18 22:25:33,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1423.90 | bwd_microstep: 1661.91 | bwd_inner_microstep: 1657.11 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.08
+[2024-06-18 22:25:37,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:25:37,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.28 | bwd_microstep: 1898.35 | bwd_inner_microstep: 1892.86 | bwd_allreduce_microstep: 5.41 | step_microstep: 62.53
+[2024-06-18 22:25:37,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3391.16 | bwd: 3560.28 | bwd_inner: 3550.00 | bwd_allreduce: 10.14 | step: 62.61
+ 21%|██        | 145/700 [18:47<1:07:42,  7.32s/it]                                                   {'loss': 0.9091, 'learning_rate': 9.199432185500973e-05, 'epoch': 1.45}
+ 21%|██        | 145/700 [18:47<1:07:42,  7.32s/it][2024-06-18 22:25:41,546] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1878.05 | bwd_microstep: 1730.30 | bwd_inner_microstep: 1725.45 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.07
+[2024-06-18 22:25:44,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:25:44,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1160.21 | bwd_microstep: 1289.04 | bwd_inner_microstep: 1283.58 | bwd_allreduce_microstep: 5.38 | step_microstep: 62.47
+[2024-06-18 22:25:44,076] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3038.24 | bwd: 3019.37 | bwd_inner: 3009.07 | bwd_allreduce: 10.16 | step: 62.55
+ 21%|██        | 146/700 [18:53<1:04:22,  6.97s/it]                                                   {'loss': 0.6102, 'learning_rate': 9.186831039442514e-05, 'epoch': 1.46}
+ 21%|██        | 146/700 [18:53<1:04:22,  6.97s/it][2024-06-18 22:25:47,170] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1417.86 | bwd_microstep: 1655.15 | bwd_inner_microstep: 1650.29 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.07
+[2024-06-18 22:25:50,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:25:50,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1650.28 | bwd_microstep: 1835.42 | bwd_inner_microstep: 1829.89 | bwd_allreduce_microstep: 5.39 | step_microstep: 61.88
+[2024-06-18 22:25:50,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3068.10 | bwd: 3490.59 | bwd_inner: 3480.25 | bwd_allreduce: 10.19 | step: 61.96
+ 21%|██        | 147/700 [19:00<1:03:23,  6.88s/it]                                                   {'loss': 0.4588, 'learning_rate': 9.174140265146356e-05, 'epoch': 1.47}
+ 21%|██        | 147/700 [19:00<1:03:23,  6.88s/it][2024-06-18 22:25:54,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.18 | bwd_microstep: 1894.25 | bwd_inner_microstep: 1889.33 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.07
+[2024-06-18 22:25:58,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:25:58,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1918.20 | bwd_microstep: 1809.01 | bwd_inner_microstep: 1803.67 | bwd_allreduce_microstep: 5.29 | step_microstep: 61.92
+[2024-06-18 22:25:58,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3881.35 | bwd: 3703.28 | bwd_inner: 3693.05 | bwd_allreduce: 10.06 | step: 62.00
+ 21%|██        | 148/700 [19:07<1:05:31,  7.12s/it]                                                   {'loss': 0.5156, 'learning_rate': 9.161360134286166e-05, 'epoch': 1.48}
+ 21%|██        | 148/700 [19:07<1:05:31,  7.12s/it][2024-06-18 22:26:01,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1500.98 | bwd_microstep: 1832.74 | bwd_inner_microstep: 1827.84 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.08
+[2024-06-18 22:26:05,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:26:05,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1509.20 | bwd_microstep: 1844.45 | bwd_inner_microstep: 1839.16 | bwd_allreduce_microstep: 5.21 | step_microstep: 61.65
+[2024-06-18 22:26:05,215] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3010.15 | bwd: 3677.21 | bwd_inner: 3667.04 | bwd_allreduce: 10.04 | step: 61.73
+ 21%|██▏       | 149/700 [19:14<1:04:29,  7.02s/it]                                                   {'loss': 1.0874, 'learning_rate': 9.148490920448477e-05, 'epoch': 1.49}
+ 21%|██▏       | 149/700 [19:14<1:04:29,  7.02s/it][2024-06-18 22:26:09,092] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.02 | bwd_microstep: 1894.35 | bwd_inner_microstep: 1889.49 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.08
+[2024-06-18 22:26:12,763] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:26:12,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1731.96 | bwd_microstep: 1860.72 | bwd_inner_microstep: 1855.33 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.66
+[2024-06-18 22:26:12,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3692.95 | bwd: 3755.10 | bwd_inner: 3744.86 | bwd_allreduce: 10.10 | step: 61.75
+ 21%|██▏       | 150/700 [19:22<1:05:49,  7.18s/it]                                                   {'loss': 0.9126, 'learning_rate': 9.135532899126844e-05, 'epoch': 1.5}
+ 21%|██▏       | 150/700 [19:22<1:05:49,  7.18s/it][2024-06-18 22:26:16,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1994.17 | bwd_microstep: 1960.46 | bwd_inner_microstep: 1955.36 | bwd_allreduce_microstep: 5.00 | step_microstep: 0.09
+[2024-06-18 22:26:20,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 22:26:20,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.33 | bwd_microstep: 1921.38 | bwd_inner_microstep: 1915.98 | bwd_allreduce_microstep: 5.35 | step_microstep: 62.34
+[2024-06-18 22:26:20,719] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3968.48 | bwd: 3881.86 | bwd_inner: 3871.36 | bwd_allreduce: 10.35 | step: 62.44
+ 22%|██▏       | 151/700 [19:30<1:07:49,  7.41s/it]                                                   {'loss': 0.7957, 'learning_rate': 9.122486347715937e-05, 'epoch': 1.51}
+ 22%|██▏       | 151/700 [19:30<1:07:49,  7.41s/it][2024-06-18 22:26:24,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1996.67 | bwd_microstep: 1964.95 | bwd_inner_microstep: 1960.10 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.08
+[2024-06-18 22:26:28,693] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:26:28,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.26 | bwd_microstep: 1935.06 | bwd_inner_microstep: 1929.57 | bwd_allreduce_microstep: 5.35 | step_microstep: 61.89
+[2024-06-18 22:26:28,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3972.90 | bwd: 3900.03 | bwd_inner: 3889.73 | bwd_allreduce: 10.11 | step: 61.98
+ 22%|██▏       | 152/700 [19:38<1:09:14,  7.58s/it]                                                   {'loss': 0.9366, 'learning_rate': 9.109351545505607e-05, 'epoch': 1.52}
+ 22%|██▏       | 152/700 [19:38<1:09:14,  7.58s/it][2024-06-18 22:26:32,679] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1997.07 | bwd_microstep: 1966.01 | bwd_inner_microstep: 1961.15 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 22:26:36,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:26:36,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.80 | bwd_microstep: 1926.35 | bwd_inner_microstep: 1920.88 | bwd_allreduce_microstep: 5.39 | step_microstep: 62.34
+[2024-06-18 22:26:36,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3970.84 | bwd: 3892.38 | bwd_inner: 3882.14 | bwd_allreduce: 10.07 | step: 62.42
+ 22%|██▏       | 153/700 [19:46<1:10:10,  7.70s/it]                                                   {'loss': 1.1504, 'learning_rate': 9.096128773674902e-05, 'epoch': 1.53}
+ 22%|██▏       | 153/700 [19:46<1:10:10,  7.70s/it][2024-06-18 22:26:40,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.60 | bwd_microstep: 1894.31 | bwd_inner_microstep: 1889.32 | bwd_allreduce_microstep: 4.85 | step_microstep: 0.08
+[2024-06-18 22:26:44,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:26:44,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.10 | bwd_microstep: 1923.79 | bwd_inner_microstep: 1918.46 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.81
+[2024-06-18 22:26:44,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3936.66 | bwd: 3818.12 | bwd_inner: 3807.85 | bwd_allreduce: 10.08 | step: 61.89
+ 22%|██▏       | 154/700 [19:54<1:10:28,  7.74s/it]                                                   {'loss': 0.9205, 'learning_rate': 9.082818315286055e-05, 'epoch': 1.54}
+ 22%|██▏       | 154/700 [19:54<1:10:28,  7.74s/it][2024-06-18 22:26:48,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1960.91 | bwd_microstep: 1895.20 | bwd_inner_microstep: 1890.35 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.07
+[2024-06-18 22:26:51,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:26:51,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1494.05 | bwd_microstep: 1805.49 | bwd_inner_microstep: 1800.14 | bwd_allreduce_microstep: 5.26 | step_microstep: 62.16
+[2024-06-18 22:26:51,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3454.93 | bwd: 3700.71 | bwd_inner: 3690.52 | bwd_allreduce: 10.06 | step: 62.24
+ 22%|██▏       | 155/700 [20:01<1:09:01,  7.60s/it]                                                   {'loss': 1.0, 'learning_rate': 9.069420455278419e-05, 'epoch': 1.55}
+ 22%|██▏       | 155/700 [20:01<1:09:01,  7.60s/it][2024-06-18 22:26:55,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2000.41 | bwd_microstep: 1973.42 | bwd_inner_microstep: 1968.57 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.08
+[2024-06-18 22:26:59,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.80
+[2024-06-18 22:26:59,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1994.72 | bwd_microstep: 1955.95 | bwd_inner_microstep: 1950.63 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.87
+[2024-06-18 22:26:59,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3995.10 | bwd: 3929.39 | bwd_inner: 3919.30 | bwd_allreduce: 9.93 | step: 61.95
+ 22%|██▏       | 156/700 [20:09<1:10:03,  7.73s/it]                                                   {'loss': 1.2941, 'learning_rate': 9.055935480462367e-05, 'epoch': 1.56}
+ 22%|██▏       | 156/700 [20:09<1:10:03,  7.73s/it][2024-06-18 22:27:03,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.29 | bwd_microstep: 1920.00 | bwd_inner_microstep: 1915.21 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.34
+[2024-06-18 22:27:07,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:27:07,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.00 | bwd_microstep: 1957.97 | bwd_inner_microstep: 1952.63 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.47
+[2024-06-18 22:27:07,746] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3964.27 | bwd: 3877.98 | bwd_inner: 3867.89 | bwd_allreduce: 9.96 | step: 61.81
+ 22%|██▏       | 157/700 [20:17<1:10:31,  7.79s/it]                                                   {'loss': 1.1631, 'learning_rate': 9.042363679513158e-05, 'epoch': 1.57}
+ 22%|██▏       | 157/700 [20:17<1:10:31,  7.79s/it][2024-06-18 22:27:11,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1999.56 | bwd_microstep: 1975.84 | bwd_inner_microstep: 1971.05 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 22:27:15,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:27:15,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1899.08 | bwd_microstep: 1741.27 | bwd_inner_microstep: 1735.92 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.56
+[2024-06-18 22:27:15,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3898.61 | bwd: 3717.13 | bwd_inner: 3707.01 | bwd_allreduce: 9.99 | step: 61.64
+ 23%|██▎       | 158/700 [20:24<1:10:11,  7.77s/it]                                                   {'loss': 0.6903, 'learning_rate': 9.028705342964753e-05, 'epoch': 1.58}
+ 23%|██▎       | 158/700 [20:24<1:10:11,  7.77s/it][2024-06-18 22:27:19,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.90 | bwd_microstep: 1926.14 | bwd_inner_microstep: 1921.10 | bwd_allreduce_microstep: 4.96 | step_microstep: 0.09
+[2024-06-18 22:27:23,011] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:27:23,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1760.57 | bwd_microstep: 1786.90 | bwd_inner_microstep: 1781.53 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.76
+[2024-06-18 22:27:23,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3732.44 | bwd: 3713.06 | bwd_inner: 3702.73 | bwd_allreduce: 10.18 | step: 61.86
+ 23%|██▎       | 159/700 [20:32<1:09:27,  7.70s/it]                                                   {'loss': 1.0193, 'learning_rate': 9.014960763203592e-05, 'epoch': 1.59}
+ 23%|██▎       | 159/700 [20:32<1:09:27,  7.70s/it][2024-06-18 22:27:26,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1882.56 | bwd_microstep: 1727.09 | bwd_inner_microstep: 1722.29 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:27:30,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:27:30,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1984.38 | bwd_microstep: 1941.03 | bwd_inner_microstep: 1935.65 | bwd_allreduce_microstep: 5.29 | step_microstep: 62.09
+[2024-06-18 22:27:30,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3866.90 | bwd: 3668.14 | bwd_inner: 3657.98 | bwd_allreduce: 10.01 | step: 62.17
+ 23%|██▎       | 160/700 [20:40<1:09:09,  7.68s/it]                                                   {'loss': 0.5605, 'learning_rate': 9.001130234462347e-05, 'epoch': 1.6}
+ 23%|██▎       | 160/700 [20:40<1:09:09,  7.68s/it][2024-06-18 22:27:34,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.38 | bwd_microstep: 1912.94 | bwd_inner_microstep: 1908.08 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.07
+[2024-06-18 22:27:38,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:27:38,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2002.83 | bwd_microstep: 1973.78 | bwd_inner_microstep: 1968.36 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.46
+[2024-06-18 22:27:38,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3972.17 | bwd: 3886.75 | bwd_inner: 3876.60 | bwd_allreduce: 9.96 | step: 61.54
+ 23%|██▎       | 161/700 [20:48<1:09:46,  7.77s/it]                                                   {'loss': 0.8445, 'learning_rate': 8.987214052813604e-05, 'epoch': 1.61}
+ 23%|██▎       | 161/700 [20:48<1:09:46,  7.77s/it][2024-06-18 22:27:41,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1502.54 | bwd_microstep: 1836.60 | bwd_inner_microstep: 1831.84 | bwd_allreduce_microstep: 4.67 | step_microstep: 0.08
+[2024-06-18 22:27:46,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 22:27:46,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2003.22 | bwd_microstep: 1976.25 | bwd_inner_microstep: 1970.76 | bwd_allreduce_microstep: 5.38 | step_microstep: 62.08
+[2024-06-18 22:27:46,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3505.74 | bwd: 3812.87 | bwd_inner: 3802.67 | bwd_allreduce: 10.06 | step: 62.17
+ 23%|██▎       | 162/700 [20:55<1:08:42,  7.66s/it]                                                   {'loss': 0.7994, 'learning_rate': 8.973212516163545e-05, 'epoch': 1.62}
+ 23%|██▎       | 162/700 [20:55<1:08:42,  7.66s/it][2024-06-18 22:27:50,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1991.76 | bwd_microstep: 1961.23 | bwd_inner_microstep: 1956.31 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 22:27:53,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:27:53,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1981.31 | bwd_microstep: 1929.80 | bwd_inner_microstep: 1924.39 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.56
+[2024-06-18 22:27:53,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3973.04 | bwd: 3891.06 | bwd_inner: 3880.86 | bwd_allreduce: 10.01 | step: 61.64
+ 23%|██▎       | 163/700 [21:03<1:09:24,  7.75s/it]                                                   {'loss': 1.0185, 'learning_rate': 8.959125924245559e-05, 'epoch': 1.63}
+ 23%|██▎       | 163/700 [21:03<1:09:24,  7.75s/it][2024-06-18 22:27:57,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1888.32 | bwd_microstep: 1740.19 | bwd_inner_microstep: 1735.32 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.09
+[2024-06-18 22:28:01,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:28:01,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.31 | bwd_microstep: 1889.17 | bwd_inner_microstep: 1883.75 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.54
+[2024-06-18 22:28:01,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3854.60 | bwd: 3629.39 | bwd_inner: 3619.18 | bwd_allreduce: 10.04 | step: 61.64
+ 23%|██▎       | 164/700 [21:11<1:08:49,  7.70s/it]                                                   {'loss': 0.4566, 'learning_rate': 8.944954578613827e-05, 'epoch': 1.64}
+ 23%|██▎       | 164/700 [21:11<1:08:49,  7.70s/it][2024-06-18 22:28:05,592] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2004.77 | bwd_microstep: 1980.16 | bwd_inner_microstep: 1975.38 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.07
+[2024-06-18 22:28:08,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:28:08,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1498.41 | bwd_microstep: 1814.26 | bwd_inner_microstep: 1808.87 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.85
+[2024-06-18 22:28:08,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3503.14 | bwd: 3794.44 | bwd_inner: 3784.30 | bwd_allreduce: 10.00 | step: 61.93
+ 24%|██▎       | 165/700 [21:18<1:07:52,  7.61s/it]                                                   {'loss': 1.1035, 'learning_rate': 8.930698782636867e-05, 'epoch': 1.65}
+ 24%|██▎       | 165/700 [21:18<1:07:52,  7.61s/it][2024-06-18 22:28:12,862] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1960.22 | bwd_microstep: 1895.66 | bwd_inner_microstep: 1890.79 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:28:16,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:28:16,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.75 | bwd_microstep: 1894.88 | bwd_inner_microstep: 1889.58 | bwd_allreduce_microstep: 5.20 | step_microstep: 61.52
+[2024-06-18 22:28:16,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3923.93 | bwd: 3790.56 | bwd_inner: 3780.50 | bwd_allreduce: 9.91 | step: 61.59
+ 24%|██▎       | 166/700 [21:26<1:08:17,  7.67s/it]                                                   {'loss': 1.0899, 'learning_rate': 8.916358841491046e-05, 'epoch': 1.66}
+ 24%|██▎       | 166/700 [21:26<1:08:17,  7.67s/it][2024-06-18 22:28:20,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1853.16 | bwd_microstep: 1675.76 | bwd_inner_microstep: 1670.96 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 22:28:24,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.94
+[2024-06-18 22:28:24,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.79 | bwd_microstep: 1892.05 | bwd_inner_microstep: 1886.38 | bwd_allreduce_microstep: 5.57 | step_microstep: 64.11
+[2024-06-18 22:28:24,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3817.92 | bwd: 3567.83 | bwd_inner: 3557.39 | bwd_allreduce: 10.29 | step: 64.20
+ 24%|██▍       | 167/700 [21:33<1:07:40,  7.62s/it]                                                   {'loss': 0.664, 'learning_rate': 8.901935062154034e-05, 'epoch': 1.67}
+ 24%|██▍       | 167/700 [21:33<1:07:40,  7.62s/it][2024-06-18 22:28:28,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2004.29 | bwd_microstep: 1977.93 | bwd_inner_microstep: 1973.06 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:28:32,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:28:32,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2004.67 | bwd_microstep: 1976.90 | bwd_inner_microstep: 1971.48 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.67
+[2024-06-18 22:28:32,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 4008.93 | bwd: 3954.85 | bwd_inner: 3944.69 | bwd_allreduce: 9.96 | step: 61.76
+ 24%|██▍       | 168/700 [21:41<1:08:44,  7.75s/it]                                                   {'loss': 0.7774, 'learning_rate': 8.887427753398248e-05, 'epoch': 1.68}
+ 24%|██▍       | 168/700 [21:41<1:08:44,  7.75s/it][2024-06-18 22:28:36,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1965.24 | bwd_microstep: 1892.87 | bwd_inner_microstep: 1888.03 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 22:28:39,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:28:39,665] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1506.74 | bwd_microstep: 1842.63 | bwd_inner_microstep: 1837.29 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.96
+[2024-06-18 22:28:39,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3471.95 | bwd: 3735.54 | bwd_inner: 3725.38 | bwd_allreduce: 10.01 | step: 62.05
+ 24%|██▍       | 169/700 [21:49<1:07:26,  7.62s/it]                                                   {'loss': 1.0792, 'learning_rate': 8.872837225784226e-05, 'epoch': 1.69}
+ 24%|██▍       | 169/700 [21:49<1:07:26,  7.62s/it][2024-06-18 22:28:43,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1882.07 | bwd_microstep: 1724.81 | bwd_inner_microstep: 1719.99 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 22:28:47,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:28:47,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1978.16 | bwd_microstep: 1934.03 | bwd_inner_microstep: 1928.60 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.59
+[2024-06-18 22:28:47,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3860.20 | bwd: 3658.86 | bwd_inner: 3648.69 | bwd_allreduce: 10.00 | step: 61.67
+ 24%|██▍       | 170/700 [21:56<1:07:18,  7.62s/it]                                                   {'loss': 0.3954, 'learning_rate': 8.858163791653994e-05, 'epoch': 1.7}
+ 24%|██▍       | 170/700 [21:56<1:07:18,  7.62s/it][2024-06-18 22:28:51,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1994.08 | bwd_microstep: 1961.11 | bwd_inner_microstep: 1956.14 | bwd_allreduce_microstep: 4.89 | step_microstep: 0.14
+[2024-06-18 22:28:55,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:28:55,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.47 | bwd_microstep: 1899.39 | bwd_inner_microstep: 1894.07 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.92
+[2024-06-18 22:28:55,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3963.51 | bwd: 3860.52 | bwd_inner: 3850.26 | bwd_allreduce: 10.13 | step: 62.07
+ 24%|██▍       | 171/700 [22:04<1:07:59,  7.71s/it]                                                   {'loss': 1.0453, 'learning_rate': 8.84340776512437e-05, 'epoch': 1.71}
+ 24%|██▍       | 171/700 [22:04<1:07:59,  7.71s/it][2024-06-18 22:28:58,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1916.41 | bwd_microstep: 1803.08 | bwd_inner_microstep: 1798.21 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:29:02,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:29:02,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1891.53 | bwd_microstep: 1745.09 | bwd_inner_microstep: 1739.65 | bwd_allreduce_microstep: 5.35 | step_microstep: 62.44
+[2024-06-18 22:29:02,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3807.91 | bwd: 3548.19 | bwd_inner: 3537.97 | bwd_allreduce: 10.05 | step: 62.52
+ 25%|██▍       | 172/700 [22:12<1:07:11,  7.64s/it]                                                   {'loss': 0.0937, 'learning_rate': 8.828569462080238e-05, 'epoch': 1.72}
+ 25%|██▍       | 172/700 [22:12<1:07:11,  7.64s/it][2024-06-18 22:29:06,550] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.05 | bwd_microstep: 1896.20 | bwd_inner_microstep: 1891.41 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:29:10,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:29:10,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.39 | bwd_microstep: 1904.76 | bwd_inner_microstep: 1899.41 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.81
+[2024-06-18 22:29:10,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3932.41 | bwd: 3800.98 | bwd_inner: 3790.86 | bwd_allreduce: 9.97 | step: 61.89
+ 25%|██▍       | 173/700 [22:19<1:07:35,  7.70s/it]                                                   {'loss': 1.0096, 'learning_rate': 8.813649200167799e-05, 'epoch': 1.73}
+ 25%|██▍       | 173/700 [22:19<1:07:35,  7.70s/it][2024-06-18 22:29:14,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1830.28 | bwd_microstep: 1640.65 | bwd_inner_microstep: 1635.85 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 22:29:17,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:29:17,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.30 | bwd_microstep: 1893.86 | bwd_inner_microstep: 1888.50 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.78
+[2024-06-18 22:29:17,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3799.56 | bwd: 3534.53 | bwd_inner: 3524.39 | bwd_allreduce: 10.01 | step: 61.86
+ 25%|██▍       | 174/700 [22:27<1:06:47,  7.62s/it]                                                   {'loss': 0.3115, 'learning_rate': 8.798647298787754e-05, 'epoch': 1.74}
+ 25%|██▍       | 174/700 [22:27<1:06:47,  7.62s/it][2024-06-18 22:29:21,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.78 | bwd_microstep: 1897.61 | bwd_inner_microstep: 1892.86 | bwd_allreduce_microstep: 4.66 | step_microstep: 0.07
+[2024-06-18 22:29:25,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:29:25,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.67 | bwd_microstep: 1916.34 | bwd_inner_microstep: 1910.93 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.79
+[2024-06-18 22:29:25,791] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3932.43 | bwd: 3813.97 | bwd_inner: 3803.89 | bwd_allreduce: 9.92 | step: 61.87
+ 25%|██▌       | 175/700 [22:35<1:07:15,  7.69s/it]                                                   {'loss': 0.919, 'learning_rate': 8.783564079088477e-05, 'epoch': 1.75}
+ 25%|██▌       | 175/700 [22:35<1:07:15,  7.69s/it][2024-06-18 22:29:28,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1423.34 | bwd_microstep: 1658.14 | bwd_inner_microstep: 1653.02 | bwd_allreduce_microstep: 5.02 | step_microstep: 0.10
+[2024-06-18 22:29:32,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.93
+[2024-06-18 22:29:32,467] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1653.10 | bwd_microstep: 1837.14 | bwd_inner_microstep: 1831.70 | bwd_allreduce_microstep: 5.28 | step_microstep: 62.69
+[2024-06-18 22:29:32,467] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3076.41 | bwd: 3495.31 | bwd_inner: 3484.83 | bwd_allreduce: 10.30 | step: 62.79
+ 25%|██▌       | 176/700 [22:41<1:04:29,  7.38s/it]                                                   {'loss': 0.4667, 'learning_rate': 8.76839986395914e-05, 'epoch': 1.76}
+ 25%|██▌       | 176/700 [22:41<1:04:29,  7.38s/it][2024-06-18 22:29:36,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1990.33 | bwd_microstep: 1956.33 | bwd_inner_microstep: 1951.44 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.08
+[2024-06-18 22:29:40,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:29:40,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.93 | bwd_microstep: 1894.78 | bwd_inner_microstep: 1889.38 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.97
+[2024-06-18 22:29:40,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3959.22 | bwd: 3851.13 | bwd_inner: 3840.92 | bwd_allreduce: 10.04 | step: 62.05
+ 25%|██▌       | 177/700 [22:49<1:05:44,  7.54s/it]                                                   {'loss': 0.8432, 'learning_rate': 8.753154978022795e-05, 'epoch': 1.77}
+ 25%|██▌       | 177/700 [22:49<1:05:44,  7.54s/it][2024-06-18 22:29:44,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1881.81 | bwd_microstep: 1724.71 | bwd_inner_microstep: 1719.86 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.08
+[2024-06-18 22:29:47,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:29:47,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1500.23 | bwd_microstep: 1812.52 | bwd_inner_microstep: 1807.12 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.85
+[2024-06-18 22:29:47,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3382.01 | bwd: 3537.26 | bwd_inner: 3527.13 | bwd_allreduce: 9.95 | step: 61.94
+ 25%|██▌       | 178/700 [22:56<1:04:15,  7.39s/it]                                                   {'loss': 0.8193, 'learning_rate': 8.737829747629432e-05, 'epoch': 1.78}
+ 25%|██▌       | 178/700 [22:56<1:04:15,  7.39s/it][2024-06-18 22:29:51,060] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1892.69 | bwd_microstep: 1744.50 | bwd_inner_microstep: 1739.63 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:29:55,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:29:55,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.56 | bwd_microstep: 1925.41 | bwd_inner_microstep: 1919.89 | bwd_allreduce_microstep: 5.43 | step_microstep: 62.47
+[2024-06-18 22:29:55,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3869.21 | bwd: 3669.93 | bwd_inner: 3659.60 | bwd_allreduce: 10.13 | step: 62.55
+ 26%|██▌       | 179/700 [23:04<1:04:48,  7.46s/it]                                                   {'loss': 0.4372, 'learning_rate': 8.722424500848987e-05, 'epoch': 1.79}
+ 26%|██▌       | 179/700 [23:04<1:04:48,  7.46s/it][2024-06-18 22:29:58,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1390.62 | bwd_microstep: 1605.29 | bwd_inner_microstep: 1600.48 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:30:01,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.80
+[2024-06-18 22:30:01,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1895.38 | bwd_microstep: 1747.93 | bwd_inner_microstep: 1742.55 | bwd_allreduce_microstep: 5.29 | step_microstep: 61.62
+[2024-06-18 22:30:01,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3285.96 | bwd: 3353.24 | bwd_inner: 3343.08 | bwd_allreduce: 10.01 | step: 61.70
+ 26%|██▌       | 180/700 [23:11<1:02:48,  7.25s/it]                                                   {'loss': 0.0457, 'learning_rate': 8.706939567464321e-05, 'epoch': 1.8}
+ 26%|██▌       | 180/700 [23:11<1:02:48,  7.25s/it][2024-06-18 22:30:05,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1890.66 | bwd_microstep: 1739.55 | bwd_inner_microstep: 1734.70 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.09
+[2024-06-18 22:30:09,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:30:09,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1891.97 | bwd_microstep: 1739.54 | bwd_inner_microstep: 1734.09 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.97
+[2024-06-18 22:30:09,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3782.60 | bwd: 3479.11 | bwd_inner: 3468.91 | bwd_allreduce: 10.02 | step: 62.06
+ 26%|██▌       | 181/700 [23:18<1:02:59,  7.28s/it]                                                   {'loss': 0.015, 'learning_rate': 8.691375278964162e-05, 'epoch': 1.81}
+ 26%|██▌       | 181/700 [23:18<1:02:59,  7.28s/it][2024-06-18 22:30:13,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.62 | bwd_microstep: 1889.79 | bwd_inner_microstep: 1884.95 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.08
+[2024-06-18 22:30:17,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:30:17,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.38 | bwd_microstep: 1926.21 | bwd_inner_microstep: 1920.85 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.52
+[2024-06-18 22:30:17,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3936.97 | bwd: 3816.02 | bwd_inner: 3805.88 | bwd_allreduce: 9.95 | step: 61.60
+ 26%|██▌       | 182/700 [23:26<1:04:20,  7.45s/it]                                                   {'loss': 1.0568, 'learning_rate': 8.675731968536002e-05, 'epoch': 1.82}
+ 26%|██▌       | 182/700 [23:26<1:04:20,  7.45s/it][2024-06-18 22:30:20,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1647.24 | bwd_microstep: 1844.47 | bwd_inner_microstep: 1839.69 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.07
+[2024-06-18 22:30:24,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:30:24,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1923.83 | bwd_microstep: 1810.05 | bwd_inner_microstep: 1804.67 | bwd_allreduce_microstep: 5.21 | step_microstep: 61.81
+[2024-06-18 22:30:24,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3571.04 | bwd: 3654.55 | bwd_inner: 3644.47 | bwd_allreduce: 9.91 | step: 61.89
+ 26%|██▌       | 183/700 [23:33<1:03:53,  7.42s/it]                                                   {'loss': 0.707, 'learning_rate': 8.660009971058978e-05, 'epoch': 1.83}
+ 26%|██▌       | 183/700 [23:33<1:03:53,  7.42s/it][2024-06-18 22:30:27,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1891.75 | bwd_microstep: 1747.55 | bwd_inner_microstep: 1742.73 | bwd_allreduce_microstep: 4.66 | step_microstep: 0.07
+[2024-06-18 22:30:31,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.93
+[2024-06-18 22:30:31,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1652.09 | bwd_microstep: 1838.67 | bwd_inner_microstep: 1833.06 | bwd_allreduce_microstep: 5.52 | step_microstep: 62.95
+[2024-06-18 22:30:31,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3543.80 | bwd: 3586.24 | bwd_inner: 3575.87 | bwd_allreduce: 10.17 | step: 63.03
+ 26%|██▋       | 184/700 [23:41<1:03:18,  7.36s/it]                                                   {'loss': 0.5917, 'learning_rate': 8.644209623096686e-05, 'epoch': 1.84}
+ 26%|██▋       | 184/700 [23:41<1:03:18,  7.36s/it][2024-06-18 22:30:34,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1394.96 | bwd_microstep: 1615.32 | bwd_inner_microstep: 1610.55 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.08
+[2024-06-18 22:30:38,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:30:38,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2009.94 | bwd_microstep: 1985.93 | bwd_inner_microstep: 1980.49 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.59
+[2024-06-18 22:30:38,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3404.87 | bwd: 3601.27 | bwd_inner: 3591.14 | bwd_allreduce: 9.95 | step: 61.67
+ 26%|██▋       | 185/700 [23:48<1:02:31,  7.28s/it]                                                   {'loss': 1.2096, 'learning_rate': 8.628331262889991e-05, 'epoch': 1.85}
+ 26%|██▋       | 185/700 [23:48<1:02:31,  7.28s/it][2024-06-18 22:30:42,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.93 | bwd_microstep: 1935.44 | bwd_inner_microstep: 1930.65 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 22:30:46,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:30:46,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.83 | bwd_microstep: 1905.38 | bwd_inner_microstep: 1900.08 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.76
+[2024-06-18 22:30:46,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3950.73 | bwd: 3840.85 | bwd_inner: 3830.77 | bwd_allreduce: 9.94 | step: 61.84
+ 27%|██▋       | 186/700 [23:56<1:03:58,  7.47s/it]                                                   {'loss': 1.1099, 'learning_rate': 8.612375230349778e-05, 'epoch': 1.86}
+ 27%|██▋       | 186/700 [23:56<1:03:58,  7.47s/it][2024-06-18 22:30:50,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1910.04 | bwd_microstep: 1804.70 | bwd_inner_microstep: 1799.90 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 22:30:54,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:30:54,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2002.49 | bwd_microstep: 1977.26 | bwd_inner_microstep: 1971.83 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.89
+[2024-06-18 22:30:54,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3912.50 | bwd: 3781.98 | bwd_inner: 3771.83 | bwd_allreduce: 9.99 | step: 61.98
+ 27%|██▋       | 187/700 [24:03<1:04:41,  7.57s/it]                                                   {'loss': 0.5546, 'learning_rate': 8.596341867049677e-05, 'epoch': 1.87}
+ 27%|██▋       | 187/700 [24:03<1:04:41,  7.57s/it][2024-06-18 22:30:58,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.66 | bwd_microstep: 1908.66 | bwd_inner_microstep: 1903.79 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.14
+[2024-06-18 22:31:02,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:31:02,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1957.58 | bwd_microstep: 1882.53 | bwd_inner_microstep: 1877.13 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.75
+[2024-06-18 22:31:02,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3927.22 | bwd: 3791.21 | bwd_inner: 3781.00 | bwd_allreduce: 10.02 | step: 61.90
+ 27%|██▋       | 188/700 [24:11<1:05:13,  7.64s/it]                                                   {'loss': 0.6804, 'learning_rate': 8.58023151621875e-05, 'epoch': 1.88}
+ 27%|██▋       | 188/700 [24:11<1:05:13,  7.64s/it][2024-06-18 22:31:05,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1494.66 | bwd_microstep: 1810.75 | bwd_inner_microstep: 1805.94 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.08
+[2024-06-18 22:31:09,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:31:09,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1982.56 | bwd_microstep: 1935.92 | bwd_inner_microstep: 1930.58 | bwd_allreduce_microstep: 5.20 | step_microstep: 61.85
+[2024-06-18 22:31:09,510] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3477.18 | bwd: 3746.69 | bwd_inner: 3736.62 | bwd_allreduce: 9.94 | step: 61.93
+ 27%|██▋       | 189/700 [24:19<1:04:16,  7.55s/it]                                                   {'loss': 1.1012, 'learning_rate': 8.564044522734147e-05, 'epoch': 1.89}
+ 27%|██▋       | 189/700 [24:19<1:04:16,  7.55s/it][2024-06-18 22:31:13,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.90 | bwd_microstep: 1891.72 | bwd_inner_microstep: 1886.89 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.09
+[2024-06-18 22:31:17,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:31:17,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1980.87 | bwd_microstep: 1943.27 | bwd_inner_microstep: 1937.84 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.97
+[2024-06-18 22:31:17,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3944.74 | bwd: 3835.02 | bwd_inner: 3824.84 | bwd_allreduce: 10.00 | step: 62.06
+ 27%|██▋       | 190/700 [24:26<1:05:00,  7.65s/it]                                                   {'loss': 0.9811, 'learning_rate': 8.54778123311372e-05, 'epoch': 1.9}
+ 27%|██▋       | 190/700 [24:26<1:05:00,  7.65s/it][2024-06-18 22:31:21,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1914.46 | bwd_microstep: 1808.29 | bwd_inner_microstep: 1803.51 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.07
+[2024-06-18 22:31:25,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:31:25,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1998.45 | bwd_microstep: 1976.23 | bwd_inner_microstep: 1970.84 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.98
+[2024-06-18 22:31:25,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3912.88 | bwd: 3784.54 | bwd_inner: 3774.45 | bwd_allreduce: 9.93 | step: 62.07
+ 27%|██▋       | 191/700 [24:34<1:05:15,  7.69s/it]                                                   {'loss': 0.3721, 'learning_rate': 8.531441995508609e-05, 'epoch': 1.91}
+ 27%|██▋       | 191/700 [24:34<1:05:15,  7.69s/it][2024-06-18 22:31:29,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.04 | bwd_microstep: 1914.29 | bwd_inner_microstep: 1909.49 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:31:32,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.80
+[2024-06-18 22:31:32,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1863.84 | bwd_microstep: 1692.62 | bwd_inner_microstep: 1687.29 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.74
+[2024-06-18 22:31:32,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3836.84 | bwd: 3606.93 | bwd_inner: 3596.82 | bwd_allreduce: 9.97 | step: 61.82
+ 27%|██▋       | 192/700 [24:42<1:04:45,  7.65s/it]                                                   {'loss': 0.3462, 'learning_rate': 8.515027159695781e-05, 'epoch': 1.92}
+ 27%|██▋       | 192/700 [24:42<1:04:45,  7.65s/it][2024-06-18 22:31:36,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1897.01 | bwd_microstep: 1743.81 | bwd_inner_microstep: 1738.94 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:31:40,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.94
+[2024-06-18 22:31:40,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1977.78 | bwd_microstep: 1905.53 | bwd_inner_microstep: 1899.96 | bwd_allreduce_microstep: 5.49 | step_microstep: 62.79
+[2024-06-18 22:31:40,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3874.76 | bwd: 3649.37 | bwd_inner: 3638.99 | bwd_allreduce: 10.20 | step: 62.87
+ 28%|██▊       | 193/700 [24:49<1:04:34,  7.64s/it]                                                   {'loss': 0.4875, 'learning_rate': 8.498537077070548e-05, 'epoch': 1.93}
+ 28%|██▊       | 193/700 [24:49<1:04:34,  7.64s/it][2024-06-18 22:31:44,237] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1960.19 | bwd_microstep: 1890.29 | bwd_inner_microstep: 1885.50 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 22:31:48,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:31:48,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.28 | bwd_microstep: 1909.98 | bwd_inner_microstep: 1904.56 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.61
+[2024-06-18 22:31:48,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3930.44 | bwd: 3800.30 | bwd_inner: 3790.15 | bwd_allreduce: 9.95 | step: 61.69
+ 28%|██▊       | 194/700 [24:57<1:04:56,  7.70s/it]                                                   {'loss': 0.9236, 'learning_rate': 8.481972100639049e-05, 'epoch': 1.94}
+ 28%|██▊       | 194/700 [24:57<1:04:56,  7.70s/it][2024-06-18 22:31:51,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1542.65 | bwd_microstep: 1637.86 | bwd_inner_microstep: 1633.00 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.08
+[2024-06-18 22:31:55,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:31:55,211] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1919.86 | bwd_microstep: 1811.63 | bwd_inner_microstep: 1806.20 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.70
+[2024-06-18 22:31:55,212] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3462.48 | bwd: 3449.52 | bwd_inner: 3439.34 | bwd_allreduce: 10.02 | step: 61.78
+ 28%|██▊       | 195/700 [25:04<1:03:04,  7.49s/it]                                                   {'loss': 0.036, 'learning_rate': 8.465332585010682e-05, 'epoch': 1.95}
+ 28%|██▊       | 195/700 [25:04<1:03:04,  7.49s/it][2024-06-18 22:31:59,141] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1977.40 | bwd_microstep: 1929.69 | bwd_inner_microstep: 1924.85 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:32:03,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:32:03,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.36 | bwd_microstep: 1931.82 | bwd_inner_microstep: 1926.51 | bwd_allreduce_microstep: 5.21 | step_microstep: 61.61
+[2024-06-18 22:32:03,129] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3953.73 | bwd: 3861.53 | bwd_inner: 3851.44 | bwd_allreduce: 9.90 | step: 61.69
+ 28%|██▊       | 196/700 [25:12<1:04:00,  7.62s/it]                                                   {'loss': 0.8321, 'learning_rate': 8.448618886390522e-05, 'epoch': 1.96}
+ 28%|██▊       | 196/700 [25:12<1:04:00,  7.62s/it][2024-06-18 22:32:06,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1548.01 | bwd_microstep: 1646.56 | bwd_inner_microstep: 1641.67 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.14
+[2024-06-18 22:32:10,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:32:10,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.34 | bwd_microstep: 1928.61 | bwd_inner_microstep: 1923.27 | bwd_allreduce_microstep: 5.25 | step_microstep: 62.08
+[2024-06-18 22:32:10,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3524.32 | bwd: 3575.19 | bwd_inner: 3564.99 | bwd_allreduce: 10.06 | step: 62.23
+ 28%|██▊       | 197/700 [25:19<1:02:49,  7.50s/it]                                                   {'loss': 1.1929, 'learning_rate': 8.431831362571691e-05, 'epoch': 1.97}
+ 28%|██▊       | 197/700 [25:19<1:02:49,  7.50s/it][2024-06-18 22:32:14,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1994.56 | bwd_microstep: 1958.32 | bwd_inner_microstep: 1953.49 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.08
+[2024-06-18 22:32:18,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:32:18,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.52 | bwd_microstep: 1907.22 | bwd_inner_microstep: 1901.82 | bwd_allreduce_microstep: 5.31 | step_microstep: 62.41
+[2024-06-18 22:32:18,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3967.05 | bwd: 3865.56 | bwd_inner: 3855.36 | bwd_allreduce: 10.07 | step: 62.50
+ 28%|██▊       | 198/700 [25:27<1:03:48,  7.63s/it]                                                   {'loss': 1.2032, 'learning_rate': 8.414970372927704e-05, 'epoch': 1.98}
+ 28%|██▊       | 198/700 [25:27<1:03:48,  7.63s/it][2024-06-18 22:32:21,540] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1574.68 | bwd_microstep: 1679.27 | bwd_inner_microstep: 1674.41 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 22:32:25,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:32:25,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1914.93 | bwd_microstep: 1810.33 | bwd_inner_microstep: 1804.90 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.75
+[2024-06-18 22:32:25,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3489.57 | bwd: 3489.62 | bwd_inner: 3479.44 | bwd_allreduce: 9.96 | step: 61.83
+ 28%|██▊       | 199/700 [25:34<1:02:18,  7.46s/it]                                                   {'loss': 0.0115, 'learning_rate': 8.398036278404767e-05, 'epoch': 1.99}
+ 28%|██▊       | 199/700 [25:34<1:02:18,  7.46s/it][2024-06-18 22:32:29,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1993.02 | bwd_microstep: 1955.08 | bwd_inner_microstep: 1950.32 | bwd_allreduce_microstep: 4.65 | step_microstep: 0.08
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+[2024-06-18 22:32:34,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:32:34,049] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.39 | bwd_microstep: 1932.24 | bwd_inner_microstep: 1926.88 | bwd_allreduce_microstep: 5.22 | step_microstep: 62.41
+[2024-06-18 22:32:34,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3964.35 | bwd: 3887.34 | bwd_inner: 3877.31 | bwd_allreduce: 9.88 | step: 62.50
+ 29%|██▊       | 200/700 [25:43<1:05:17,  7.84s/it]                                                   {'loss': 1.0193, 'learning_rate': 8.38102944151406e-05, 'epoch': 2.0}
+ 29%|██▊       | 200/700 [25:43<1:05:17,  7.84s/it][2024-06-18 22:32:36,740] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:32:42,516] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:32:48,321] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:32:54,146] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:33:01,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1905.00 | bwd_microstep: 1794.96 | bwd_inner_microstep: 1790.09 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:33:05,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:33:05,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1982.31 | bwd_microstep: 1956.24 | bwd_inner_microstep: 1950.78 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.77
+[2024-06-18 22:33:05,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3887.24 | bwd: 3751.23 | bwd_inner: 3741.02 | bwd_allreduce: 10.00 | step: 61.85
+ 29%|██▊       | 201/700 [26:14<2:03:45, 14.88s/it]                                                   {'loss': 0.6684, 'learning_rate': 8.363950226323963e-05, 'epoch': 2.01}
+ 29%|██▊       | 201/700 [26:14<2:03:45, 14.88s/it][2024-06-18 22:33:09,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1981.60 | bwd_microstep: 1952.81 | bwd_inner_microstep: 1947.73 | bwd_allreduce_microstep: 4.99 | step_microstep: 0.10
+[2024-06-18 22:33:13,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:33:13,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1902.80 | bwd_microstep: 1806.78 | bwd_inner_microstep: 1801.38 | bwd_allreduce_microstep: 5.30 | step_microstep: 62.02
+[2024-06-18 22:33:13,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3884.36 | bwd: 3759.62 | bwd_inner: 3749.17 | bwd_allreduce: 10.29 | step: 62.13
+ 29%|██▉       | 202/700 [26:22<1:45:45, 12.74s/it]                                                   {'loss': 0.6883, 'learning_rate': 8.346798998452282e-05, 'epoch': 2.02}
+ 29%|██▉       | 202/700 [26:22<1:45:45, 12.74s/it][2024-06-18 22:33:17,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1984.58 | bwd_microstep: 1953.61 | bwd_inner_microstep: 1948.79 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.07
+[2024-06-18 22:33:21,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.94
+[2024-06-18 22:33:21,085] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.31 | bwd_microstep: 1939.85 | bwd_inner_microstep: 1934.42 | bwd_allreduce_microstep: 5.34 | step_microstep: 63.79
+[2024-06-18 22:33:21,085] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3960.85 | bwd: 3893.48 | bwd_inner: 3883.25 | bwd_allreduce: 10.10 | step: 63.87
+ 29%|██▉       | 203/700 [26:30<1:33:40, 11.31s/it]                                                   {'loss': 0.8697, 'learning_rate': 8.329576125058406e-05, 'epoch': 2.03}
+ 29%|██▉       | 203/700 [26:30<1:33:40, 11.31s/it][2024-06-18 22:33:25,008] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.23 | bwd_microstep: 1929.76 | bwd_inner_microstep: 1921.20 | bwd_allreduce_microstep: 8.40 | step_microstep: 0.12
+[2024-06-18 22:33:29,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:33:29,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.91 | bwd_microstep: 1929.40 | bwd_inner_microstep: 1924.02 | bwd_allreduce_microstep: 5.21 | step_microstep: 62.28
+[2024-06-18 22:33:29,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3940.09 | bwd: 3859.19 | bwd_inner: 3845.35 | bwd_allreduce: 13.63 | step: 62.41
+ 29%|██▉       | 204/700 [26:38<1:25:04, 10.29s/it]                                                   {'loss': 0.7854, 'learning_rate': 8.312281974835452e-05, 'epoch': 2.04}
+ 29%|██▉       | 204/700 [26:38<1:25:04, 10.29s/it][2024-06-18 22:33:32,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1957.42 | bwd_microstep: 1887.92 | bwd_inner_microstep: 1883.06 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.08
+[2024-06-18 22:33:36,930] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.91
+[2024-06-18 22:33:36,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2000.03 | bwd_microstep: 1979.17 | bwd_inner_microstep: 1973.53 | bwd_allreduce_microstep: 5.54 | step_microstep: 62.66
+[2024-06-18 22:33:36,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3957.42 | bwd: 3867.11 | bwd_inner: 3856.65 | bwd_allreduce: 10.32 | step: 62.75
+ 29%|██▉       | 205/700 [26:46<1:19:03,  9.58s/it]                                                   {'loss': 0.8573, 'learning_rate': 8.294916918002376e-05, 'epoch': 2.05}
+ 29%|██▉       | 205/700 [26:46<1:19:03,  9.58s/it][2024-06-18 22:33:40,560] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1879.53 | bwd_microstep: 1727.25 | bwd_inner_microstep: 1722.38 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 22:33:44,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:33:44,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1914.33 | bwd_microstep: 1802.74 | bwd_inner_microstep: 1797.31 | bwd_allreduce_microstep: 5.26 | step_microstep: 62.04
+[2024-06-18 22:33:44,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3793.83 | bwd: 3530.02 | bwd_inner: 3519.85 | bwd_allreduce: 9.97 | step: 62.12
+ 29%|██▉       | 206/700 [26:53<1:13:34,  8.94s/it]                                                   {'loss': 0.1149, 'learning_rate': 8.277481326296038e-05, 'epoch': 2.06}
+ 29%|██▉       | 206/700 [26:53<1:13:34,  8.94s/it][2024-06-18 22:33:47,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1424.19 | bwd_microstep: 1663.11 | bwd_inner_microstep: 1658.26 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.09
+[2024-06-18 22:33:51,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:33:51,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.21 | bwd_microstep: 1928.31 | bwd_inner_microstep: 1922.96 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.60
+[2024-06-18 22:33:51,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3396.38 | bwd: 3591.44 | bwd_inner: 3581.27 | bwd_allreduce: 10.03 | step: 61.70
+ 30%|██▉       | 207/700 [27:00<1:08:52,  8.38s/it]                                                   {'loss': 0.9545, 'learning_rate': 8.259975572963257e-05, 'epoch': 2.07}
+ 30%|██▉       | 207/700 [27:00<1:08:52,  8.38s/it][2024-06-18 22:33:55,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.28 | bwd_microstep: 1972.91 | bwd_inner_microstep: 1968.03 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.08
+[2024-06-18 22:33:59,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:33:59,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.51 | bwd_microstep: 1898.30 | bwd_inner_microstep: 1892.96 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.44
+[2024-06-18 22:33:59,388] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3966.76 | bwd: 3871.24 | bwd_inner: 3861.09 | bwd_allreduce: 9.99 | step: 61.52
+ 30%|██▉       | 208/700 [27:08<1:07:38,  8.25s/it]                                                   {'loss': 0.7834, 'learning_rate': 8.242400032752813e-05, 'epoch': 2.08}
+ 30%|██▉       | 208/700 [27:08<1:07:38,  8.25s/it][2024-06-18 22:34:03,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.94 | bwd_microstep: 1896.78 | bwd_inner_microstep: 1892.01 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.08
+[2024-06-18 22:34:07,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:34:07,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.74 | bwd_microstep: 1894.21 | bwd_inner_microstep: 1888.87 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.60
+[2024-06-18 22:34:07,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3929.64 | bwd: 3791.01 | bwd_inner: 3780.93 | bwd_allreduce: 9.94 | step: 61.69
+ 30%|██▉       | 209/700 [27:16<1:06:27,  8.12s/it]                                                   {'loss': 0.7207, 'learning_rate': 8.224755081907427e-05, 'epoch': 2.09}
+ 30%|██▉       | 209/700 [27:16<1:06:27,  8.12s/it][2024-06-18 22:34:10,839] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1881.81 | bwd_microstep: 1724.80 | bwd_inner_microstep: 1719.99 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 22:34:14,841] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.95
+[2024-06-18 22:34:14,842] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.94 | bwd_microstep: 1940.26 | bwd_inner_microstep: 1934.61 | bwd_allreduce_microstep: 5.54 | step_microstep: 64.59
+[2024-06-18 22:34:14,842] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3861.72 | bwd: 3665.08 | bwd_inner: 3654.64 | bwd_allreduce: 10.28 | step: 64.67
+ 30%|███       | 210/700 [27:24<1:05:07,  7.97s/it]                                                   {'loss': 0.4601, 'learning_rate': 8.2070410981557e-05, 'epoch': 2.1}
+ 30%|███       | 210/700 [27:24<1:05:07,  7.97s/it][2024-06-18 22:34:18,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2007.12 | bwd_microstep: 1994.22 | bwd_inner_microstep: 1989.42 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 22:34:22,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:34:22,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2003.57 | bwd_microstep: 1974.35 | bwd_inner_microstep: 1969.02 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.83
+[2024-06-18 22:34:22,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 4010.66 | bwd: 3968.60 | bwd_inner: 3958.48 | bwd_allreduce: 9.97 | step: 61.91
+ 30%|███       | 211/700 [27:32<1:05:15,  8.01s/it]                                                   {'loss': 0.8886, 'learning_rate': 8.189258460704038e-05, 'epoch': 2.11}
+ 30%|███       | 211/700 [27:32<1:05:15,  8.01s/it][2024-06-18 22:34:26,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1916.62 | bwd_microstep: 1808.50 | bwd_inner_microstep: 1803.73 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.08
+[2024-06-18 22:34:29,922] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:34:29,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1443.88 | bwd_microstep: 1726.90 | bwd_inner_microstep: 1721.43 | bwd_allreduce_microstep: 5.30 | step_microstep: 62.53
+[2024-06-18 22:34:29,923] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3360.47 | bwd: 3535.43 | bwd_inner: 3525.26 | bwd_allreduce: 9.98 | step: 62.62
+ 30%|███       | 212/700 [27:39<1:02:39,  7.70s/it]                                                   {'loss': 0.4563, 'learning_rate': 8.171407550228532e-05, 'epoch': 2.12}
+ 30%|███       | 212/700 [27:39<1:02:39,  7.70s/it][2024-06-18 22:34:33,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2001.10 | bwd_microstep: 1971.79 | bwd_inner_microstep: 1966.98 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.08
+[2024-06-18 22:34:37,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:34:37,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1508.95 | bwd_microstep: 1849.23 | bwd_inner_microstep: 1843.87 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.89
+[2024-06-18 22:34:37,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3510.02 | bwd: 3821.04 | bwd_inner: 3810.89 | bwd_allreduce: 10.00 | step: 61.98
+ 30%|███       | 213/700 [27:46<1:01:52,  7.62s/it]                                                   {'loss': 1.0037, 'learning_rate': 8.153488748866796e-05, 'epoch': 2.13}
+ 30%|███       | 213/700 [27:46<1:01:52,  7.62s/it][2024-06-18 22:34:40,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1519.31 | bwd_microstep: 1868.66 | bwd_inner_microstep: 1863.69 | bwd_allreduce_microstep: 4.88 | step_microstep: 0.14
+[2024-06-18 22:34:43,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:34:43,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1425.28 | bwd_microstep: 1661.97 | bwd_inner_microstep: 1656.59 | bwd_allreduce_microstep: 5.29 | step_microstep: 61.76
+[2024-06-18 22:34:43,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2944.56 | bwd: 3530.65 | bwd_inner: 3520.34 | bwd_allreduce: 10.18 | step: 61.91
+ 31%|███       | 214/700 [27:53<59:12,  7.31s/it]                                                   {'loss': 0.8796, 'learning_rate': 8.135502440209804e-05, 'epoch': 2.14}
+ 31%|███       | 214/700 [27:53<59:12,  7.31s/it][2024-06-18 22:34:47,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1889.43 | bwd_microstep: 1741.82 | bwd_inner_microstep: 1737.02 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 22:34:51,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:34:51,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1895.92 | bwd_microstep: 1748.25 | bwd_inner_microstep: 1742.82 | bwd_allreduce_microstep: 5.34 | step_microstep: 62.74
+[2024-06-18 22:34:51,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3785.32 | bwd: 3490.09 | bwd_inner: 3479.90 | bwd_allreduce: 10.05 | step: 62.82
+ 31%|███       | 215/700 [28:00<59:14,  7.33s/it]                                                 {'loss': 0.0385, 'learning_rate': 8.117449009293668e-05, 'epoch': 2.15}
+ 31%|███       | 215/700 [28:00<59:14,  7.33s/it][2024-06-18 22:34:55,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1993.25 | bwd_microstep: 1959.97 | bwd_inner_microstep: 1955.18 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.08
+[2024-06-18 22:34:59,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:34:59,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.44 | bwd_microstep: 1897.79 | bwd_inner_microstep: 1892.32 | bwd_allreduce_microstep: 5.31 | step_microstep: 62.62
+[2024-06-18 22:34:59,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3960.66 | bwd: 3857.78 | bwd_inner: 3847.60 | bwd_allreduce: 10.01 | step: 62.71
+ 31%|███       | 216/700 [28:08<1:00:33,  7.51s/it]                                                   {'loss': 0.8037, 'learning_rate': 8.0993288425914e-05, 'epoch': 2.16}
+ 31%|███       | 216/700 [28:08<1:00:33,  7.51s/it][2024-06-18 22:35:03,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1960.21 | bwd_microstep: 1889.62 | bwd_inner_microstep: 1884.76 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.08
+[2024-06-18 22:35:07,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:35:07,044] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.00 | bwd_microstep: 1890.92 | bwd_inner_microstep: 1885.47 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.58
+[2024-06-18 22:35:07,045] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3930.17 | bwd: 3780.56 | bwd_inner: 3770.38 | bwd_allreduce: 9.99 | step: 61.67
+ 31%|███       | 217/700 [28:16<1:01:10,  7.60s/it]                                                   {'loss': 0.7149, 'learning_rate': 8.081142328004637e-05, 'epoch': 2.17}
+ 31%|███       | 217/700 [28:16<1:01:10,  7.60s/it][2024-06-18 22:35:10,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.26 | bwd_microstep: 1906.41 | bwd_inner_microstep: 1901.58 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.08
+[2024-06-18 22:35:14,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.93
+[2024-06-18 22:35:14,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.97 | bwd_microstep: 1899.97 | bwd_inner_microstep: 1894.38 | bwd_allreduce_microstep: 5.51 | step_microstep: 62.56
+[2024-06-18 22:35:14,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3942.19 | bwd: 3806.41 | bwd_inner: 3796.03 | bwd_allreduce: 10.24 | step: 62.64
+ 31%|███       | 218/700 [28:24<1:01:39,  7.67s/it]                                                   {'loss': 1.0, 'learning_rate': 8.062889854855333e-05, 'epoch': 2.18}
+ 31%|███       | 218/700 [28:24<1:01:39,  7.67s/it][2024-06-18 22:35:18,636] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1913.27 | bwd_microstep: 1804.04 | bwd_inner_microstep: 1799.18 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.08
+[2024-06-18 22:35:22,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:35:22,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2005.40 | bwd_microstep: 1984.60 | bwd_inner_microstep: 1979.26 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.77
+[2024-06-18 22:35:22,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3918.64 | bwd: 3788.66 | bwd_inner: 3778.54 | bwd_allreduce: 9.95 | step: 61.85
+ 31%|███▏      | 219/700 [28:32<1:01:51,  7.72s/it]                                                   {'loss': 0.4931, 'learning_rate': 8.044571813877431e-05, 'epoch': 2.19}
+ 31%|███▏      | 219/700 [28:32<1:01:51,  7.72s/it][2024-06-18 22:35:26,712] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2009.42 | bwd_microstep: 1974.89 | bwd_inner_microstep: 1970.10 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:35:30,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:35:30,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1508.52 | bwd_microstep: 1846.00 | bwd_inner_microstep: 1840.65 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.63
+[2024-06-18 22:35:30,146] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3517.91 | bwd: 3820.91 | bwd_inner: 3810.79 | bwd_allreduce: 9.97 | step: 61.72
+ 31%|███▏      | 220/700 [28:39<1:01:03,  7.63s/it]                                                   {'loss': 1.2229, 'learning_rate': 8.0261885972085e-05, 'epoch': 2.2}
+ 31%|███▏      | 220/700 [28:39<1:01:03,  7.63s/it][2024-06-18 22:35:34,124] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1996.65 | bwd_microstep: 1959.54 | bwd_inner_microstep: 1954.72 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 22:35:37,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:35:37,739] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1665.02 | bwd_microstep: 1869.82 | bwd_inner_microstep: 1864.49 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.85
+[2024-06-18 22:35:37,740] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3661.63 | bwd: 3829.39 | bwd_inner: 3819.27 | bwd_allreduce: 9.98 | step: 61.93
+ 32%|███▏      | 221/700 [28:47<1:00:50,  7.62s/it]                                                   {'loss': 1.0206, 'learning_rate': 8.00774059838133e-05, 'epoch': 2.21}
+ 32%|███▏      | 221/700 [28:47<1:00:50,  7.62s/it][2024-06-18 22:35:41,400] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1891.19 | bwd_microstep: 1747.13 | bwd_inner_microstep: 1742.13 | bwd_allreduce_microstep: 4.85 | step_microstep: 0.14
+[2024-06-18 22:35:45,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:35:45,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1761.24 | bwd_microstep: 1785.86 | bwd_inner_microstep: 1780.45 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.90
+[2024-06-18 22:35:45,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3652.39 | bwd: 3533.01 | bwd_inner: 3522.71 | bwd_allreduce: 10.08 | step: 62.05
+ 32%|███▏      | 222/700 [28:54<59:55,  7.52s/it]                                                   {'loss': 0.4281, 'learning_rate': 7.989228212315516e-05, 'epoch': 2.22}
+ 32%|███▏      | 222/700 [28:54<59:55,  7.52s/it][2024-06-18 22:35:49,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1997.61 | bwd_microstep: 1955.16 | bwd_inner_microstep: 1950.37 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:35:52,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:35:52,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1910.64 | bwd_microstep: 1805.57 | bwd_inner_microstep: 1800.27 | bwd_allreduce_microstep: 5.21 | step_microstep: 61.71
+[2024-06-18 22:35:52,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3908.22 | bwd: 3760.75 | bwd_inner: 3750.69 | bwd_allreduce: 9.93 | step: 61.79
+ 32%|███▏      | 223/700 [29:02<1:00:23,  7.60s/it]                                                   {'loss': 0.325, 'learning_rate': 7.970651835309009e-05, 'epoch': 2.23}
+ 32%|███▏      | 223/700 [29:02<1:00:23,  7.60s/it][2024-06-18 22:35:56,673] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.89 | bwd_microstep: 1888.78 | bwd_inner_microstep: 1883.94 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.09
+[2024-06-18 22:36:00,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:36:00,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1916.57 | bwd_microstep: 1804.20 | bwd_inner_microstep: 1798.86 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.86
+[2024-06-18 22:36:00,475] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3880.43 | bwd: 3693.00 | bwd_inner: 3682.84 | bwd_allreduce: 10.02 | step: 61.96
+ 32%|███▏      | 224/700 [29:09<1:00:27,  7.62s/it]                                                   {'loss': 0.4511, 'learning_rate': 7.952011865029614e-05, 'epoch': 2.24}
+ 32%|███▏      | 224/700 [29:09<1:00:27,  7.62s/it][2024-06-18 22:36:04,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2008.72 | bwd_microstep: 1992.51 | bwd_inner_microstep: 1987.70 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 22:36:08,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:36:08,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2007.73 | bwd_microstep: 1970.44 | bwd_inner_microstep: 1965.08 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.59
+[2024-06-18 22:36:08,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 4016.42 | bwd: 3962.97 | bwd_inner: 3952.88 | bwd_allreduce: 9.96 | step: 61.67
+ 32%|███▏      | 225/700 [29:18<1:01:25,  7.76s/it]                                                   {'loss': 1.0534, 'learning_rate': 7.933308700506497e-05, 'epoch': 2.25}
+ 32%|███▏      | 225/700 [29:18<1:01:25,  7.76s/it][2024-06-18 22:36:12,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1994.96 | bwd_microstep: 1961.58 | bwd_inner_microstep: 1956.78 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 22:36:16,520] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.93
+[2024-06-18 22:36:16,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1977.06 | bwd_microstep: 1927.89 | bwd_inner_microstep: 1922.51 | bwd_allreduce_microstep: 5.29 | step_microstep: 63.28
+[2024-06-18 22:36:16,521] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3971.98 | bwd: 3889.50 | bwd_inner: 3879.35 | bwd_allreduce: 10.01 | step: 63.36
+ 32%|███▏      | 226/700 [29:26<1:01:47,  7.82s/it]                                                   {'loss': 1.0809, 'learning_rate': 7.914542742121633e-05, 'epoch': 2.26}
+ 32%|███▏      | 226/700 [29:26<1:01:47,  7.82s/it][2024-06-18 22:36:18,101] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 728.27 | bwd_microstep: 829.10 | bwd_inner_microstep: 824.25 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.08
+[2024-06-18 22:36:22,042] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.95
+[2024-06-18 22:36:22,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1965.90 | bwd_microstep: 1893.28 | bwd_inner_microstep: 1887.62 | bwd_allreduce_microstep: 5.56 | step_microstep: 63.26
+[2024-06-18 22:36:22,043] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2694.12 | bwd: 2722.41 | bwd_inner: 2711.96 | bwd_allreduce: 10.25 | step: 63.34
+ 32%|███▏      | 227/700 [29:31<56:12,  7.13s/it]                                                   {'loss': 0.7914, 'learning_rate': 7.895714391601232e-05, 'epoch': 2.27}
+ 32%|███▏      | 227/700 [29:31<56:12,  7.13s/it][2024-06-18 22:36:25,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1892.75 | bwd_microstep: 1743.84 | bwd_inner_microstep: 1739.12 | bwd_allreduce_microstep: 4.67 | step_microstep: 0.08
+[2024-06-18 22:36:29,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:36:29,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.33 | bwd_microstep: 1924.59 | bwd_inner_microstep: 1919.25 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.88
+[2024-06-18 22:36:29,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3867.04 | bwd: 3668.45 | bwd_inner: 3658.40 | bwd_allreduce: 9.92 | step: 61.97
+ 33%|███▎      | 228/700 [29:39<57:18,  7.28s/it]                                                 {'loss': 0.4348, 'learning_rate': 7.876824052007149e-05, 'epoch': 2.28}
+ 33%|███▎      | 228/700 [29:39<57:18,  7.28s/it][2024-06-18 22:36:33,594] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.80 | bwd_microstep: 1917.62 | bwd_inner_microstep: 1912.89 | bwd_allreduce_microstep: 4.65 | step_microstep: 0.07
+[2024-06-18 22:36:37,553] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:36:37,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.46 | bwd_microstep: 1906.78 | bwd_inner_microstep: 1901.43 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.86
+[2024-06-18 22:36:37,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3939.23 | bwd: 3824.43 | bwd_inner: 3814.37 | bwd_allreduce: 9.92 | step: 61.94
+ 33%|███▎      | 229/700 [29:47<58:33,  7.46s/it]                                                 {'loss': 0.9297, 'learning_rate': 7.857872127728248e-05, 'epoch': 2.29}
+ 33%|███▎      | 229/700 [29:47<58:33,  7.46s/it][2024-06-18 22:36:41,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1914.81 | bwd_microstep: 1804.87 | bwd_inner_microstep: 1800.08 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:36:45,099] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:36:45,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1917.69 | bwd_microstep: 1806.54 | bwd_inner_microstep: 1801.22 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.28
+[2024-06-18 22:36:45,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3832.47 | bwd: 3611.43 | bwd_inner: 3601.35 | bwd_allreduce: 9.95 | step: 61.36
+ 33%|███▎      | 230/700 [29:54<58:38,  7.49s/it]                                                 {'loss': 0.0606, 'learning_rate': 7.838859024471748e-05, 'epoch': 2.3}
+ 33%|███▎      | 230/700 [29:54<58:38,  7.49s/it][2024-06-18 22:36:48,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1704.36 | bwd_microstep: 1670.45 | bwd_inner_microstep: 1665.65 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:36:51,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:36:51,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1233.05 | bwd_microstep: 1295.02 | bwd_inner_microstep: 1289.60 | bwd_allreduce_microstep: 5.33 | step_microstep: 62.19
+[2024-06-18 22:36:51,104] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2937.38 | bwd: 2965.49 | bwd_inner: 2955.30 | bwd_allreduce: 10.05 | step: 62.27
+ 33%|███▎      | 231/700 [30:00<55:02,  7.04s/it]                                                 {'loss': 0.4665, 'learning_rate': 7.819785149254532e-05, 'epoch': 2.31}
+ 33%|███▎      | 231/700 [30:00<55:02,  7.04s/it][2024-06-18 22:36:54,758] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1889.86 | bwd_microstep: 1742.12 | bwd_inner_microstep: 1737.35 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.08
+[2024-06-18 22:36:58,722] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:36:58,722] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.77 | bwd_microstep: 1910.53 | bwd_inner_microstep: 1905.12 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.84
+[2024-06-18 22:36:58,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3863.60 | bwd: 3652.68 | bwd_inner: 3642.57 | bwd_allreduce: 9.95 | step: 61.92
+ 33%|███▎      | 232/700 [30:08<56:16,  7.21s/it]                                                 {'loss': 0.4669, 'learning_rate': 7.800650910394449e-05, 'epoch': 2.32}
+ 33%|███▎      | 232/700 [30:08<56:16,  7.21s/it][2024-06-18 22:37:01,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1422.93 | bwd_microstep: 1661.42 | bwd_inner_microstep: 1656.64 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:37:05,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.80
+[2024-06-18 22:37:05,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.05 | bwd_microstep: 1909.88 | bwd_inner_microstep: 1904.50 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.79
+[2024-06-18 22:37:05,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3393.95 | bwd: 3571.33 | bwd_inner: 3561.24 | bwd_allreduce: 9.93 | step: 61.87
+ 33%|███▎      | 233/700 [30:15<55:48,  7.17s/it]                                                 {'loss': 0.9979, 'learning_rate': 7.781456717501557e-05, 'epoch': 2.33}
+ 33%|███▎      | 233/700 [30:15<55:48,  7.17s/it][2024-06-18 22:37:09,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1892.54 | bwd_microstep: 1742.59 | bwd_inner_microstep: 1737.80 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 22:37:13,433] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:37:13,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1983.79 | bwd_microstep: 1924.37 | bwd_inner_microstep: 1918.95 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.45
+[2024-06-18 22:37:13,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3876.30 | bwd: 3666.99 | bwd_inner: 3656.86 | bwd_allreduce: 9.96 | step: 61.54
+ 33%|███▎      | 234/700 [30:22<56:47,  7.31s/it]                                                 {'loss': 0.376, 'learning_rate': 7.762202981469357e-05, 'epoch': 2.34}
+ 33%|███▎      | 234/700 [30:22<56:47,  7.31s/it][2024-06-18 22:37:17,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2003.32 | bwd_microstep: 1975.36 | bwd_inner_microstep: 1970.47 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.08
+[2024-06-18 22:37:21,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:37:21,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.97 | bwd_microstep: 1933.02 | bwd_inner_microstep: 1927.63 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.67
+[2024-06-18 22:37:21,426] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3980.26 | bwd: 3908.40 | bwd_inner: 3898.25 | bwd_allreduce: 9.97 | step: 61.75
+ 34%|███▎      | 235/700 [30:30<58:15,  7.52s/it]                                                 {'loss': 1.0115, 'learning_rate': 7.74289011446601e-05, 'epoch': 2.35}
+ 34%|███▎      | 235/700 [30:30<58:15,  7.52s/it][2024-06-18 22:37:25,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1913.77 | bwd_microstep: 1811.78 | bwd_inner_microstep: 1807.05 | bwd_allreduce_microstep: 4.64 | step_microstep: 0.07
+[2024-06-18 22:37:29,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.97
+[2024-06-18 22:37:29,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.21 | bwd_microstep: 1908.97 | bwd_inner_microstep: 1903.30 | bwd_allreduce_microstep: 5.57 | step_microstep: 65.07
+[2024-06-18 22:37:29,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3887.95 | bwd: 3720.77 | bwd_inner: 3710.40 | bwd_allreduce: 10.21 | step: 65.15
+ 34%|███▎      | 236/700 [30:38<58:35,  7.58s/it]                                                 {'loss': 0.3465, 'learning_rate': 7.7235185299255e-05, 'epoch': 2.36}
+ 34%|███▎      | 236/700 [30:38<58:35,  7.58s/it][2024-06-18 22:37:33,016] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.75 | bwd_microstep: 1891.33 | bwd_inner_microstep: 1886.57 | bwd_allreduce_microstep: 4.67 | step_microstep: 0.08
+[2024-06-18 22:37:36,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:37:36,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.00 | bwd_microstep: 1899.92 | bwd_inner_microstep: 1894.50 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.65
+[2024-06-18 22:37:36,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3933.72 | bwd: 3791.28 | bwd_inner: 3781.17 | bwd_allreduce: 9.93 | step: 61.73
+ 34%|███▍      | 237/700 [30:46<59:02,  7.65s/it]                                                 {'loss': 0.6312, 'learning_rate': 7.704088642538782e-05, 'epoch': 2.37}
+ 34%|███▍      | 237/700 [30:46<59:02,  7.65s/it][2024-06-18 22:37:40,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1426.14 | bwd_microstep: 1661.28 | bwd_inner_microstep: 1656.47 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 22:37:42,731] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:37:42,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1162.04 | bwd_microstep: 1413.17 | bwd_inner_microstep: 1407.88 | bwd_allreduce_microstep: 5.20 | step_microstep: 61.64
+[2024-06-18 22:37:42,732] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2588.15 | bwd: 3074.47 | bwd_inner: 3064.39 | bwd_allreduce: 9.94 | step: 61.73
+ 34%|███▍      | 238/700 [30:52<54:33,  7.09s/it]                                                 {'loss': 1.0788, 'learning_rate': 7.68460086824492e-05, 'epoch': 2.38}
+ 34%|███▍      | 238/700 [30:52<54:33,  7.09s/it][2024-06-18 22:37:46,618] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.78 | bwd_microstep: 1893.90 | bwd_inner_microstep: 1888.88 | bwd_allreduce_microstep: 4.93 | step_microstep: 0.09
+[2024-06-18 22:37:50,231] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:37:50,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1664.05 | bwd_microstep: 1868.13 | bwd_inner_microstep: 1862.76 | bwd_allreduce_microstep: 5.20 | step_microstep: 61.56
+[2024-06-18 22:37:50,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3633.80 | bwd: 3762.05 | bwd_inner: 3751.74 | bwd_allreduce: 10.14 | step: 61.66
+ 34%|███▍      | 239/700 [30:59<55:23,  7.21s/it]                                                 {'loss': 0.8243, 'learning_rate': 7.665055624222166e-05, 'epoch': 2.39}
+ 34%|███▍      | 239/700 [30:59<55:23,  7.21s/it][2024-06-18 22:37:53,429] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1542.14 | bwd_microstep: 1633.46 | bwd_inner_microstep: 1628.66 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 22:37:57,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:37:57,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1997.25 | bwd_microstep: 1955.80 | bwd_inner_microstep: 1950.23 | bwd_allreduce_microstep: 5.47 | step_microstep: 62.53
+[2024-06-18 22:37:57,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3539.36 | bwd: 3589.28 | bwd_inner: 3578.95 | bwd_allreduce: 10.18 | step: 62.61
+ 34%|███▍      | 240/700 [31:06<55:19,  7.22s/it]                                                 {'loss': 0.6284, 'learning_rate': 7.645453328879042e-05, 'epoch': 2.4}
+ 34%|███▍      | 240/700 [31:06<55:19,  7.22s/it][2024-06-18 22:38:01,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.61 | bwd_microstep: 1912.05 | bwd_inner_microstep: 1907.23 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.09
+[2024-06-18 22:38:05,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:38:05,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.23 | bwd_microstep: 1939.00 | bwd_inner_microstep: 1933.51 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.75
+[2024-06-18 22:38:05,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3947.81 | bwd: 3851.07 | bwd_inner: 3840.84 | bwd_allreduce: 10.06 | step: 61.84
+ 34%|███▍      | 241/700 [31:14<56:46,  7.42s/it]                                                 {'loss': 0.5828, 'learning_rate': 7.625794401845377e-05, 'epoch': 2.41}
+ 34%|███▍      | 241/700 [31:14<56:46,  7.42s/it][2024-06-18 22:38:08,566] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1548.39 | bwd_microstep: 1631.74 | bwd_inner_microstep: 1626.88 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.09
+[2024-06-18 22:38:11,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:38:11,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1492.96 | bwd_microstep: 1800.28 | bwd_inner_microstep: 1794.85 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.72
+[2024-06-18 22:38:11,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3041.31 | bwd: 3432.04 | bwd_inner: 3421.83 | bwd_allreduce: 10.05 | step: 61.82
+ 35%|███▍      | 242/700 [31:21<54:42,  7.17s/it]                                                 {'loss': 0.331, 'learning_rate': 7.606079263963317e-05, 'epoch': 2.42}
+ 35%|███▍      | 242/700 [31:21<54:42,  7.17s/it][2024-06-18 22:38:15,823] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.35 | bwd_microstep: 1895.30 | bwd_inner_microstep: 1890.54 | bwd_allreduce_microstep: 4.67 | step_microstep: 0.08
+[2024-06-18 22:38:19,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:38:19,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2008.75 | bwd_microstep: 1984.19 | bwd_inner_microstep: 1978.80 | bwd_allreduce_microstep: 5.30 | step_microstep: 62.02
+[2024-06-18 22:38:19,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3976.07 | bwd: 3879.51 | bwd_inner: 3869.39 | bwd_allreduce: 9.98 | step: 62.10
+ 35%|███▍      | 243/700 [31:29<56:23,  7.40s/it]                                                 {'loss': 0.8812, 'learning_rate': 7.586308337278336e-05, 'epoch': 2.43}
+ 35%|███▍      | 243/700 [31:29<56:23,  7.40s/it][2024-06-18 22:38:22,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1396.30 | bwd_microstep: 1613.89 | bwd_inner_microstep: 1609.10 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.08
+[2024-06-18 22:38:26,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:38:26,573] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1869.29 | bwd_microstep: 1695.80 | bwd_inner_microstep: 1690.43 | bwd_allreduce_microstep: 5.21 | step_microstep: 61.58
+[2024-06-18 22:38:26,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3265.56 | bwd: 3309.72 | bwd_inner: 3299.61 | bwd_allreduce: 9.93 | step: 61.66
+ 35%|███▍      | 244/700 [31:36<54:36,  7.19s/it]                                                 {'loss': 0.4332, 'learning_rate': 7.566482045030179e-05, 'epoch': 2.44}
+ 35%|███▍      | 244/700 [31:36<54:36,  7.19s/it][2024-06-18 22:38:29,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1567.74 | bwd_microstep: 1666.21 | bwd_inner_microstep: 1661.39 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 22:38:33,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.95
+[2024-06-18 22:38:33,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1896.91 | bwd_microstep: 1749.04 | bwd_inner_microstep: 1743.38 | bwd_allreduce_microstep: 5.56 | step_microstep: 64.82
+[2024-06-18 22:38:33,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3464.62 | bwd: 3415.27 | bwd_inner: 3404.83 | bwd_allreduce: 10.28 | step: 64.91
+ 35%|███▌      | 245/700 [31:43<54:02,  7.13s/it]                                                 {'loss': 0.0149, 'learning_rate': 7.546600811643816e-05, 'epoch': 2.45}
+ 35%|███▌      | 245/700 [31:43<54:02,  7.13s/it][2024-06-18 22:38:37,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.44 | bwd_microstep: 1941.90 | bwd_inner_microstep: 1937.10 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:38:41,138] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:38:41,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1862.75 | bwd_microstep: 1694.61 | bwd_inner_microstep: 1689.20 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.82
+[2024-06-18 22:38:41,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3842.16 | bwd: 3636.53 | bwd_inner: 3626.40 | bwd_allreduce: 9.95 | step: 61.90
+ 35%|███▌      | 246/700 [31:50<54:57,  7.26s/it]                                                 {'loss': 0.4849, 'learning_rate': 7.52666506272035e-05, 'epoch': 2.46}
+ 35%|███▌      | 246/700 [31:50<54:57,  7.26s/it][2024-06-18 22:38:44,803] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1894.27 | bwd_microstep: 1747.54 | bwd_inner_microstep: 1742.73 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 22:38:48,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:38:48,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2005.81 | bwd_microstep: 1979.71 | bwd_inner_microstep: 1974.33 | bwd_allreduce_microstep: 5.28 | step_microstep: 62.00
+[2024-06-18 22:38:48,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3900.05 | bwd: 3727.27 | bwd_inner: 3717.11 | bwd_allreduce: 10.02 | step: 62.08
+ 35%|███▌      | 247/700 [31:58<55:53,  7.40s/it]                                                 {'loss': 0.5015, 'learning_rate': 7.50667522502791e-05, 'epoch': 2.47}
+ 35%|███▌      | 247/700 [31:58<55:53,  7.40s/it][2024-06-18 22:38:52,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.33 | bwd_microstep: 1909.69 | bwd_inner_microstep: 1904.87 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.08
+[2024-06-18 22:38:56,798] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:38:56,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1996.93 | bwd_microstep: 1954.30 | bwd_inner_microstep: 1948.88 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.80
+[2024-06-18 22:38:56,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3964.23 | bwd: 3864.01 | bwd_inner: 3853.88 | bwd_allreduce: 9.94 | step: 61.88
+ 35%|███▌      | 248/700 [32:06<56:57,  7.56s/it]                                                 {'loss': 0.8306, 'learning_rate': 7.486631726492512e-05, 'epoch': 2.48}
+ 35%|███▌      | 248/700 [32:06<56:57,  7.56s/it][2024-06-18 22:38:58,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 727.38 | bwd_microstep: 822.77 | bwd_inner_microstep: 818.01 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.07
+[2024-06-18 22:39:02,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.91
+[2024-06-18 22:39:02,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.03 | bwd_microstep: 1896.48 | bwd_inner_microstep: 1891.02 | bwd_allreduce_microstep: 5.38 | step_microstep: 62.65
+[2024-06-18 22:39:02,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2690.38 | bwd: 2719.28 | bwd_inner: 2709.08 | bwd_allreduce: 10.07 | step: 62.73
+ 36%|███▌      | 249/700 [32:11<52:12,  6.95s/it]                                                 {'loss': 0.3214, 'learning_rate': 7.466534996188897e-05, 'epoch': 2.49}
+ 36%|███▌      | 249/700 [32:11<52:12,  6.95s/it][2024-06-18 22:39:06,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.13 | bwd_microstep: 1892.47 | bwd_inner_microstep: 1887.67 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 22:39:10,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:39:10,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2009.69 | bwd_microstep: 1985.81 | bwd_inner_microstep: 1980.35 | bwd_allreduce_microstep: 5.37 | step_microstep: 62.48
+[2024-06-18 22:39:10,266] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3973.79 | bwd: 3878.30 | bwd_inner: 3868.06 | bwd_allreduce: 10.10 | step: 62.56
+ 36%|███▌      | 250/700 [32:19<54:22,  7.25s/it]                                                 {'loss': 0.8702, 'learning_rate': 7.446385464331348e-05, 'epoch': 2.5}
+ 36%|███▌      | 250/700 [32:19<54:22,  7.25s/it][2024-06-18 22:39:14,147] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.50 | bwd_microstep: 1893.94 | bwd_inner_microstep: 1889.06 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.09
+[2024-06-18 22:39:18,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:39:18,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2015.68 | bwd_microstep: 2006.16 | bwd_inner_microstep: 2000.76 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.63
+[2024-06-18 22:39:18,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3980.14 | bwd: 3900.12 | bwd_inner: 3889.91 | bwd_allreduce: 10.04 | step: 61.72
+ 36%|███▌      | 251/700 [32:27<55:53,  7.47s/it]                                                 {'loss': 1.0255, 'learning_rate': 7.426183562264488e-05, 'epoch': 2.51}
+ 36%|███▌      | 251/700 [32:27<55:53,  7.47s/it][2024-06-18 22:39:21,745] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1832.38 | bwd_microstep: 1640.67 | bwd_inner_microstep: 1635.86 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 22:39:25,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:39:25,096] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1583.22 | bwd_microstep: 1688.38 | bwd_inner_microstep: 1682.88 | bwd_allreduce_microstep: 5.42 | step_microstep: 61.95
+[2024-06-18 22:39:25,097] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3415.57 | bwd: 3329.07 | bwd_inner: 3318.77 | bwd_allreduce: 10.15 | step: 62.03
+ 36%|███▌      | 252/700 [32:34<54:22,  7.28s/it]                                                 {'loss': 0.3734, 'learning_rate': 7.405929722454026e-05, 'epoch': 2.52}
+ 36%|███▌      | 252/700 [32:34<54:22,  7.28s/it][2024-06-18 22:39:26,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 536.95 | bwd_microstep: 510.75 | bwd_inner_microstep: 505.94 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 22:39:30,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:39:30,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.29 | bwd_microstep: 1935.97 | bwd_inner_microstep: 1930.60 | bwd_allreduce_microstep: 5.29 | step_microstep: 61.69
+[2024-06-18 22:39:30,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2510.21 | bwd: 2446.74 | bwd_inner: 2436.60 | bwd_allreduce: 10.00 | step: 61.78
+ 36%|███▌      | 253/700 [32:39<49:17,  6.62s/it]                                                 {'loss': 0.4999, 'learning_rate': 7.385624378477521e-05, 'epoch': 2.53}
+ 36%|███▌      | 253/700 [32:39<49:17,  6.62s/it][2024-06-18 22:39:33,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1858.79 | bwd_microstep: 1694.77 | bwd_inner_microstep: 1689.71 | bwd_allreduce_microstep: 4.98 | step_microstep: 0.10
+[2024-06-18 22:39:37,536] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 22:39:37,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1916.16 | bwd_microstep: 1809.39 | bwd_inner_microstep: 1804.02 | bwd_allreduce_microstep: 5.30 | step_microstep: 62.03
+[2024-06-18 22:39:37,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3774.92 | bwd: 3504.19 | bwd_inner: 3493.76 | bwd_allreduce: 10.29 | step: 62.14
+ 36%|███▋      | 254/700 [32:47<50:52,  6.85s/it]                                                 {'loss': 0.0178, 'learning_rate': 7.365267965015086e-05, 'epoch': 2.54}
+ 36%|███▋      | 254/700 [32:47<50:52,  6.85s/it][2024-06-18 22:39:41,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1891.89 | bwd_microstep: 1747.40 | bwd_inner_microstep: 1742.49 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 22:39:45,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:39:45,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1982.32 | bwd_microstep: 1931.80 | bwd_inner_microstep: 1926.43 | bwd_allreduce_microstep: 5.32 | step_microstep: 62.05
+[2024-06-18 22:39:45,194] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3874.18 | bwd: 3679.22 | bwd_inner: 3668.98 | bwd_allreduce: 10.08 | step: 62.14
+ 36%|███▋      | 255/700 [32:54<52:34,  7.09s/it]                                                 {'loss': 0.4426, 'learning_rate': 7.344860917840091e-05, 'epoch': 2.55}
+ 36%|███▋      | 255/700 [32:54<52:34,  7.09s/it][2024-06-18 22:39:49,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1999.87 | bwd_microstep: 1971.99 | bwd_inner_microstep: 1967.10 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 22:39:52,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:39:52,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1656.77 | bwd_microstep: 1863.86 | bwd_inner_microstep: 1858.37 | bwd_allreduce_microstep: 5.41 | step_microstep: 62.58
+[2024-06-18 22:39:52,789] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3656.60 | bwd: 3835.87 | bwd_inner: 3825.54 | bwd_allreduce: 10.18 | step: 62.66
+ 37%|███▋      | 256/700 [33:02<53:34,  7.24s/it]                                                 {'loss': 0.8835, 'learning_rate': 7.324403673809831e-05, 'epoch': 2.56}
+ 37%|███▋      | 256/700 [33:02<53:34,  7.24s/it][2024-06-18 22:39:56,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.13 | bwd_microstep: 1906.02 | bwd_inner_microstep: 1901.17 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.07
+[2024-06-18 22:40:00,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:40:00,755] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2005.67 | bwd_microstep: 1981.01 | bwd_inner_microstep: 1975.70 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.75
+[2024-06-18 22:40:00,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3977.77 | bwd: 3887.06 | bwd_inner: 3876.95 | bwd_allreduce: 9.92 | step: 61.83
+ 37%|███▋      | 257/700 [33:10<55:04,  7.46s/it]                                                 {'loss': 1.0596, 'learning_rate': 7.303896670856167e-05, 'epoch': 2.57}
+ 37%|███▋      | 257/700 [33:10<55:04,  7.46s/it][2024-06-18 22:40:04,512] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1914.42 | bwd_microstep: 1819.98 | bwd_inner_microstep: 1815.18 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:40:08,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:40:08,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1965.21 | bwd_microstep: 1892.99 | bwd_inner_microstep: 1887.69 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.58
+[2024-06-18 22:40:08,450] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3879.60 | bwd: 3712.98 | bwd_inner: 3702.90 | bwd_allreduce: 9.95 | step: 61.66
+ 37%|███▋      | 258/700 [33:17<55:27,  7.53s/it]                                                 {'loss': 0.3448, 'learning_rate': 7.283340347976166e-05, 'epoch': 2.58}
+ 37%|███▋      | 258/700 [33:17<55:27,  7.53s/it][2024-06-18 22:40:12,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.66 | bwd_microstep: 1921.98 | bwd_inner_microstep: 1917.17 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 22:40:15,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:40:15,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1569.41 | bwd_microstep: 1671.21 | bwd_inner_microstep: 1665.85 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.43
+[2024-06-18 22:40:15,687] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3543.04 | bwd: 3593.21 | bwd_inner: 3583.07 | bwd_allreduce: 10.01 | step: 61.51
+ 37%|███▋      | 259/700 [33:25<54:41,  7.44s/it]                                                 {'loss': 0.7635, 'learning_rate': 7.262735145222696e-05, 'epoch': 2.59}
+ 37%|███▋      | 259/700 [33:25<54:41,  7.44s/it][2024-06-18 22:40:19,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1644.99 | bwd_microstep: 1829.91 | bwd_inner_microstep: 1825.12 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:40:22,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:40:22,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1423.77 | bwd_microstep: 1656.63 | bwd_inner_microstep: 1651.27 | bwd_allreduce_microstep: 5.19 | step_microstep: 61.41
+[2024-06-18 22:40:22,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3068.72 | bwd: 3486.56 | bwd_inner: 3476.50 | bwd_allreduce: 9.89 | step: 61.49
+ 37%|███▋      | 260/700 [33:31<52:50,  7.21s/it]                                                 {'loss': 0.5643, 'learning_rate': 7.242081503694995e-05, 'epoch': 2.6}
+ 37%|███▋      | 260/700 [33:31<52:50,  7.21s/it][2024-06-18 22:40:26,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1917.45 | bwd_microstep: 1807.66 | bwd_inner_microstep: 1802.85 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 22:40:29,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.91
+[2024-06-18 22:40:29,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1396.10 | bwd_microstep: 1619.17 | bwd_inner_microstep: 1613.62 | bwd_allreduce_microstep: 5.45 | step_microstep: 62.87
+[2024-06-18 22:40:29,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3313.52 | bwd: 3426.85 | bwd_inner: 3416.52 | bwd_allreduce: 10.19 | step: 62.95
+ 37%|███▋      | 261/700 [33:38<51:55,  7.10s/it]                                                 {'loss': 0.4274, 'learning_rate': 7.22137986552925e-05, 'epoch': 2.61}
+ 37%|███▋      | 261/700 [33:38<51:55,  7.10s/it][2024-06-18 22:40:32,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1446.25 | bwd_microstep: 1728.09 | bwd_inner_microstep: 1723.27 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.07
+[2024-06-18 22:40:36,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:40:36,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.41 | bwd_microstep: 1926.09 | bwd_inner_microstep: 1920.74 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.64
+[2024-06-18 22:40:36,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3422.64 | bwd: 3654.21 | bwd_inner: 3644.03 | bwd_allreduce: 10.06 | step: 61.72
+ 37%|███▋      | 262/700 [33:45<51:59,  7.12s/it]                                                 {'loss': 0.891, 'learning_rate': 7.200630673889117e-05, 'epoch': 2.62}
+ 37%|███▋      | 262/700 [33:45<51:59,  7.12s/it][2024-06-18 22:40:40,357] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1996.23 | bwd_microstep: 1973.32 | bwd_inner_microstep: 1968.53 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 22:40:43,634] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.92
+[2024-06-18 22:40:43,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1548.74 | bwd_microstep: 1648.70 | bwd_inner_microstep: 1643.28 | bwd_allreduce_microstep: 5.34 | step_microstep: 62.23
+[2024-06-18 22:40:43,635] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3544.93 | bwd: 3622.04 | bwd_inner: 3611.84 | bwd_allreduce: 10.06 | step: 62.31
+ 38%|███▊      | 263/700 [33:53<52:11,  7.17s/it]                                                 {'loss': 0.6576, 'learning_rate': 7.179834372956236e-05, 'epoch': 2.63}
+ 38%|███▊      | 263/700 [33:53<52:11,  7.17s/it][2024-06-18 22:40:47,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.20 | bwd_microstep: 1888.25 | bwd_inner_microstep: 1883.50 | bwd_allreduce_microstep: 4.66 | step_microstep: 0.07
+[2024-06-18 22:40:51,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:40:51,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1983.13 | bwd_microstep: 1936.64 | bwd_inner_microstep: 1931.21 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.70
+[2024-06-18 22:40:51,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3945.30 | bwd: 3824.91 | bwd_inner: 3814.80 | bwd_allreduce: 9.93 | step: 61.78
+ 38%|███▊      | 264/700 [34:00<53:36,  7.38s/it]                                                 {'loss': 0.9658, 'learning_rate': 7.15899140792072e-05, 'epoch': 2.64}
+ 38%|███▊      | 264/700 [34:01<53:36,  7.38s/it][2024-06-18 22:40:54,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1389.75 | bwd_microstep: 1604.19 | bwd_inner_microstep: 1599.44 | bwd_allreduce_microstep: 4.67 | step_microstep: 0.07
+[2024-06-18 22:40:58,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 22:40:58,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2063.30 | bwd_microstep: 2112.10 | bwd_inner_microstep: 2106.49 | bwd_allreduce_microstep: 5.46 | step_microstep: 62.35
+[2024-06-18 22:40:58,779] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3453.02 | bwd: 3716.31 | bwd_inner: 3706.00 | bwd_allreduce: 10.11 | step: 62.43
+ 38%|███▊      | 265/700 [34:08<53:15,  7.35s/it]                                                 {'loss': 0.6069, 'learning_rate': 7.13810222497164e-05, 'epoch': 2.65}
+ 38%|███▊      | 265/700 [34:08<53:15,  7.35s/it][2024-06-18 22:41:02,764] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1996.53 | bwd_microstep: 1966.73 | bwd_inner_microstep: 1961.84 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.08
+[2024-06-18 22:41:06,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:41:06,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.52 | bwd_microstep: 1908.58 | bwd_inner_microstep: 1903.26 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.95
+[2024-06-18 22:41:06,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3967.02 | bwd: 3875.33 | bwd_inner: 3865.19 | bwd_allreduce: 9.99 | step: 62.03
+ 38%|███▊      | 266/700 [34:16<54:26,  7.53s/it]                                                 {'loss': 0.6643, 'learning_rate': 7.117167271287453e-05, 'epoch': 2.66}
+ 38%|███▊      | 266/700 [34:16<54:26,  7.53s/it][2024-06-18 22:41:10,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1912.33 | bwd_microstep: 1804.81 | bwd_inner_microstep: 1799.94 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.09
+[2024-06-18 22:41:13,542] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:41:13,542] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1390.99 | bwd_microstep: 1608.69 | bwd_inner_microstep: 1603.28 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.71
+[2024-06-18 22:41:13,543] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3303.29 | bwd: 3413.53 | bwd_inner: 3403.34 | bwd_allreduce: 9.98 | step: 61.80
+ 38%|███▊      | 267/700 [34:23<52:46,  7.31s/it]                                                 {'loss': 0.0542, 'learning_rate': 7.096186995026439e-05, 'epoch': 2.67}
+ 38%|███▊      | 267/700 [34:23<52:46,  7.31s/it][2024-06-18 22:41:17,454] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.81 | bwd_microstep: 1918.63 | bwd_inner_microstep: 1913.57 | bwd_allreduce_microstep: 4.98 | step_microstep: 0.08
+[2024-06-18 22:41:21,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:41:21,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.25 | bwd_microstep: 1900.66 | bwd_inner_microstep: 1895.26 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.77
+[2024-06-18 22:41:21,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3937.03 | bwd: 3819.32 | bwd_inner: 3808.88 | bwd_allreduce: 10.30 | step: 61.86
+ 38%|███▊      | 268/700 [34:30<53:50,  7.48s/it]                                                 {'loss': 0.7741, 'learning_rate': 7.07516184531711e-05, 'epoch': 2.68}
+ 38%|███▊      | 268/700 [34:30<53:50,  7.48s/it][2024-06-18 22:41:25,291] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.90 | bwd_microstep: 1899.51 | bwd_inner_microstep: 1894.67 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 22:41:29,099] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:41:29,099] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1917.52 | bwd_microstep: 1810.37 | bwd_inner_microstep: 1805.13 | bwd_allreduce_microstep: 5.16 | step_microstep: 61.52
+[2024-06-18 22:41:29,100] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3885.39 | bwd: 3709.91 | bwd_inner: 3699.85 | bwd_allreduce: 9.93 | step: 61.60
+ 38%|███▊      | 269/700 [34:38<54:11,  7.54s/it]                                                 {'loss': 0.3555, 'learning_rate': 7.054092272248589e-05, 'epoch': 2.69}
+ 38%|███▊      | 269/700 [34:38<54:11,  7.54s/it][2024-06-18 22:41:32,461] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1501.26 | bwd_microstep: 1839.37 | bwd_inner_microstep: 1834.58 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:41:36,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 22:41:36,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1998.74 | bwd_microstep: 1955.04 | bwd_inner_microstep: 1949.44 | bwd_allreduce_microstep: 5.51 | step_microstep: 63.92
+[2024-06-18 22:41:36,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3499.97 | bwd: 3794.43 | bwd_inner: 3784.06 | bwd_allreduce: 10.22 | step: 64.00
+ 39%|███▊      | 270/700 [34:45<53:44,  7.50s/it]                                                 {'loss': 0.9497, 'learning_rate': 7.032978726860981e-05, 'epoch': 2.7}
+ 39%|███▊      | 270/700 [34:45<53:44,  7.50s/it][2024-06-18 22:41:40,130] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1883.95 | bwd_microstep: 1726.53 | bwd_inner_microstep: 1721.73 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 22:41:44,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:41:44,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.40 | bwd_microstep: 1901.84 | bwd_inner_microstep: 1896.47 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.50
+[2024-06-18 22:41:44,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3855.32 | bwd: 3628.39 | bwd_inner: 3618.27 | bwd_allreduce: 9.97 | step: 61.58
+ 39%|███▊      | 271/700 [34:53<53:48,  7.53s/it]                                                 {'loss': 0.5577, 'learning_rate': 7.011821661135713e-05, 'epoch': 2.71}
+ 39%|███▊      | 271/700 [34:53<53:48,  7.53s/it][2024-06-18 22:41:47,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1913.48 | bwd_microstep: 1810.83 | bwd_inner_microstep: 1805.96 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.08
+[2024-06-18 22:41:51,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:41:51,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1896.00 | bwd_microstep: 1748.38 | bwd_inner_microstep: 1742.99 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.70
+[2024-06-18 22:41:51,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3809.45 | bwd: 3559.23 | bwd_inner: 3549.06 | bwd_allreduce: 10.02 | step: 61.79
+ 39%|███▉      | 272/700 [35:01<53:34,  7.51s/it]                                                 {'loss': 0.0196, 'learning_rate': 6.990621527985856e-05, 'epoch': 2.72}
+ 39%|███▉      | 272/700 [35:01<53:34,  7.51s/it][2024-06-18 22:41:55,590] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2013.46 | bwd_microstep: 2000.08 | bwd_inner_microstep: 1995.22 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.08
+[2024-06-18 22:41:59,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:41:59,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1529.06 | bwd_microstep: 1878.32 | bwd_inner_microstep: 1872.93 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.94
+[2024-06-18 22:41:59,078] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3542.48 | bwd: 3878.42 | bwd_inner: 3868.26 | bwd_allreduce: 9.94 | step: 62.03
+ 39%|███▉      | 273/700 [35:08<53:28,  7.51s/it]                                                 {'loss': 1.0745, 'learning_rate': 6.969378781246436e-05, 'epoch': 2.73}
+ 39%|███▉      | 273/700 [35:08<53:28,  7.51s/it][2024-06-18 22:42:02,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1882.26 | bwd_microstep: 1726.32 | bwd_inner_microstep: 1721.35 | bwd_allreduce_microstep: 4.89 | step_microstep: 0.14
+[2024-06-18 22:42:06,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:42:06,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1651.27 | bwd_microstep: 1823.69 | bwd_inner_microstep: 1818.39 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.98
+[2024-06-18 22:42:06,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3533.50 | bwd: 3550.03 | bwd_inner: 3539.78 | bwd_allreduce: 10.11 | step: 62.13
+ 39%|███▉      | 274/700 [35:15<52:38,  7.42s/it]                                                 {'loss': 0.3507, 'learning_rate': 6.948093875664718e-05, 'epoch': 2.74}
+ 39%|███▉      | 274/700 [35:15<52:38,  7.42s/it][2024-06-18 22:42:09,585] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1494.30 | bwd_microstep: 1805.90 | bwd_inner_microstep: 1801.06 | bwd_allreduce_microstep: 4.67 | step_microstep: 0.07
+[2024-06-18 22:42:13,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:42:13,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1656.33 | bwd_microstep: 1857.16 | bwd_inner_microstep: 1851.77 | bwd_allreduce_microstep: 5.31 | step_microstep: 62.27
+[2024-06-18 22:42:13,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3150.59 | bwd: 3663.08 | bwd_inner: 3652.92 | bwd_allreduce: 10.00 | step: 62.35
+ 39%|███▉      | 275/700 [35:22<51:27,  7.27s/it]                                                 {'loss': 0.9456, 'learning_rate': 6.926767266890465e-05, 'epoch': 2.75}
+ 39%|███▉      | 275/700 [35:22<51:27,  7.27s/it][2024-06-18 22:42:16,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1809.69 | bwd_microstep: 1878.89 | bwd_inner_microstep: 1873.98 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.08
+[2024-06-18 22:42:20,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:42:20,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1836.53 | bwd_microstep: 1641.06 | bwd_inner_microstep: 1635.59 | bwd_allreduce_microstep: 5.33 | step_microstep: 61.82
+[2024-06-18 22:42:20,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3646.19 | bwd: 3519.97 | bwd_inner: 3509.64 | bwd_allreduce: 10.15 | step: 61.90
+ 39%|███▉      | 276/700 [35:29<51:20,  7.27s/it]                                                 {'loss': 0.4501, 'learning_rate': 6.905399411466189e-05, 'epoch': 2.76}
+ 39%|███▉      | 276/700 [35:29<51:20,  7.27s/it][2024-06-18 22:42:24,386] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1980.70 | bwd_microstep: 1934.80 | bwd_inner_microstep: 1929.99 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 22:42:28,418] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:42:28,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1998.79 | bwd_microstep: 1954.63 | bwd_inner_microstep: 1949.31 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.47
+[2024-06-18 22:42:28,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3979.46 | bwd: 3889.45 | bwd_inner: 3879.34 | bwd_allreduce: 9.97 | step: 61.55
+ 40%|███▉      | 277/700 [35:37<52:43,  7.48s/it]                                                 {'loss': 0.9141, 'learning_rate': 6.883990766817377e-05, 'epoch': 2.77}
+ 40%|███▉      | 277/700 [35:37<52:43,  7.48s/it][2024-06-18 22:42:31,736] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1492.56 | bwd_microstep: 1803.37 | bwd_inner_microstep: 1798.48 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.09
+[2024-06-18 22:42:35,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:42:35,737] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1980.90 | bwd_microstep: 1939.48 | bwd_inner_microstep: 1934.14 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.77
+[2024-06-18 22:42:35,738] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3473.43 | bwd: 3742.87 | bwd_inner: 3732.72 | bwd_allreduce: 9.97 | step: 61.87
+ 40%|███▉      | 278/700 [35:45<52:15,  7.43s/it]                                                 {'loss': 0.7734, 'learning_rate': 6.862541791242698e-05, 'epoch': 2.78}
+ 40%|███▉      | 278/700 [35:45<52:15,  7.43s/it][2024-06-18 22:42:39,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1998.33 | bwd_microstep: 1968.23 | bwd_inner_microstep: 1963.19 | bwd_allreduce_microstep: 4.95 | step_microstep: 0.09
+[2024-06-18 22:42:43,723] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:42:43,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1981.99 | bwd_microstep: 1934.08 | bwd_inner_microstep: 1928.68 | bwd_allreduce_microstep: 5.28 | step_microstep: 62.09
+[2024-06-18 22:42:43,724] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3980.29 | bwd: 3902.34 | bwd_inner: 3891.95 | bwd_allreduce: 10.23 | step: 62.19
+ 40%|███▉      | 279/700 [35:53<53:18,  7.60s/it]                                                 {'loss': 0.7568, 'learning_rate': 6.84105294390419e-05, 'epoch': 2.79}
+ 40%|███▉      | 279/700 [35:53<53:18,  7.60s/it][2024-06-18 22:42:47,647] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.84 | bwd_microstep: 1928.04 | bwd_inner_microstep: 1923.22 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 22:42:51,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.91
+[2024-06-18 22:42:51,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1984.57 | bwd_microstep: 1937.06 | bwd_inner_microstep: 1931.68 | bwd_allreduce_microstep: 5.29 | step_microstep: 62.05
+[2024-06-18 22:42:51,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3958.38 | bwd: 3865.13 | bwd_inner: 3854.98 | bwd_allreduce: 9.97 | step: 62.13
+ 40%|████      | 280/700 [36:01<53:52,  7.70s/it]                                                 {'loss': 0.909, 'learning_rate': 6.819524684817438e-05, 'epoch': 2.8}
+ 40%|████      | 280/700 [36:01<53:52,  7.70s/it][2024-06-18 22:42:55,631] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.71 | bwd_microstep: 1962.88 | bwd_inner_microstep: 1957.95 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 22:42:59,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:42:59,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1977.05 | bwd_microstep: 1931.30 | bwd_inner_microstep: 1925.95 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.70
+[2024-06-18 22:42:59,620] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3972.71 | bwd: 3894.20 | bwd_inner: 3884.00 | bwd_allreduce: 10.02 | step: 61.79
+ 40%|████      | 281/700 [36:09<54:18,  7.78s/it]                                                 {'loss': 0.7132, 'learning_rate': 6.797957474841716e-05, 'epoch': 2.81}
+ 40%|████      | 281/700 [36:09<54:18,  7.78s/it][2024-06-18 22:43:03,530] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.92 | bwd_microstep: 1915.64 | bwd_inner_microstep: 1910.68 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.14
+[2024-06-18 22:43:07,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:43:07,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1821.53 | bwd_microstep: 1903.37 | bwd_inner_microstep: 1897.94 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.75
+[2024-06-18 22:43:07,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3794.42 | bwd: 3819.03 | bwd_inner: 3808.74 | bwd_allreduce: 10.11 | step: 61.89
+ 40%|████      | 282/700 [36:16<54:03,  7.76s/it]                                                 {'loss': 0.9626, 'learning_rate': 6.776351775670129e-05, 'epoch': 2.82}
+ 40%|████      | 282/700 [36:16<54:03,  7.76s/it][2024-06-18 22:43:11,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.33 | bwd_microstep: 1909.05 | bwd_inner_microstep: 1904.15 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.08
+[2024-06-18 22:43:15,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:43:15,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2010.95 | bwd_microstep: 1986.11 | bwd_inner_microstep: 1980.75 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.77
+[2024-06-18 22:43:15,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3981.24 | bwd: 3895.18 | bwd_inner: 3884.98 | bwd_allreduce: 10.01 | step: 61.85
+ 40%|████      | 283/700 [36:24<54:23,  7.83s/it]                                                 {'loss': 0.9361, 'learning_rate': 6.754708049819728e-05, 'epoch': 2.83}
+ 40%|████      | 283/700 [36:24<54:23,  7.83s/it][2024-06-18 22:43:19,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1997.76 | bwd_microstep: 1961.23 | bwd_inner_microstep: 1956.34 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.07
+[2024-06-18 22:43:23,003] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 22:43:23,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1742.25 | bwd_microstep: 1885.45 | bwd_inner_microstep: 1880.05 | bwd_allreduce_microstep: 5.32 | step_microstep: 62.86
+[2024-06-18 22:43:23,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3739.98 | bwd: 3846.70 | bwd_inner: 3836.46 | bwd_allreduce: 10.08 | step: 62.94
+ 41%|████      | 284/700 [36:32<53:58,  7.78s/it]                                                 {'loss': 0.9124, 'learning_rate': 6.733026760621607e-05, 'epoch': 2.84}
+ 41%|████      | 284/700 [36:32<53:58,  7.78s/it][2024-06-18 22:43:26,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.14 | bwd_microstep: 1958.76 | bwd_inner_microstep: 1953.86 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 22:43:30,321] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:43:30,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1582.18 | bwd_microstep: 1680.58 | bwd_inner_microstep: 1675.21 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.55
+[2024-06-18 22:43:30,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3577.29 | bwd: 3639.37 | bwd_inner: 3629.20 | bwd_allreduce: 9.97 | step: 61.63
+ 41%|████      | 285/700 [36:39<52:52,  7.64s/it]                                                 {'loss': 0.4468, 'learning_rate': 6.711308372210983e-05, 'epoch': 2.85}
+ 41%|████      | 285/700 [36:39<52:52,  7.64s/it][2024-06-18 22:43:34,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.88 | bwd_microstep: 1907.92 | bwd_inner_microstep: 1903.01 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.08
+[2024-06-18 22:43:37,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:43:37,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1894.06 | bwd_microstep: 1740.08 | bwd_inner_microstep: 1734.73 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.54
+[2024-06-18 22:43:37,934] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3861.91 | bwd: 3648.03 | bwd_inner: 3637.79 | bwd_allreduce: 10.10 | step: 61.62
+ 41%|████      | 286/700 [36:47<52:40,  7.63s/it]                                                 {'loss': 0.4842, 'learning_rate': 6.689553349517268e-05, 'epoch': 2.86}
+ 41%|████      | 286/700 [36:47<52:40,  7.63s/it][2024-06-18 22:43:41,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1656.61 | bwd_microstep: 1705.58 | bwd_inner_microstep: 1700.80 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:43:45,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 22:43:45,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1894.76 | bwd_microstep: 1747.89 | bwd_inner_microstep: 1742.31 | bwd_allreduce_microstep: 5.45 | step_microstep: 62.53
+[2024-06-18 22:43:45,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3551.33 | bwd: 3453.50 | bwd_inner: 3443.20 | bwd_allreduce: 10.15 | step: 62.62
+ 41%|████      | 287/700 [36:54<51:27,  7.48s/it]                                                 {'loss': 0.663, 'learning_rate': 6.667762158254104e-05, 'epoch': 2.87}
+ 41%|████      | 287/700 [36:54<51:27,  7.48s/it][2024-06-18 22:43:48,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.01 | bwd_microstep: 1895.36 | bwd_inner_microstep: 1890.54 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.08
+[2024-06-18 22:43:52,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:43:52,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.14 | bwd_microstep: 1898.90 | bwd_inner_microstep: 1893.47 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.51
+[2024-06-18 22:43:52,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3939.11 | bwd: 3794.29 | bwd_inner: 3784.09 | bwd_allreduce: 10.02 | step: 61.59
+ 41%|████      | 288/700 [37:02<52:04,  7.58s/it]                                                 {'loss': 0.9251, 'learning_rate': 6.645935264909404e-05, 'epoch': 2.88}
+ 41%|████      | 288/700 [37:02<52:04,  7.58s/it][2024-06-18 22:43:56,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.80 | bwd_microstep: 1902.72 | bwd_inner_microstep: 1897.83 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 22:44:00,768] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:44:00,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1980.87 | bwd_microstep: 1937.19 | bwd_inner_microstep: 1931.83 | bwd_allreduce_microstep: 5.28 | step_microstep: 62.08
+[2024-06-18 22:44:00,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3949.64 | bwd: 3839.93 | bwd_inner: 3829.74 | bwd_allreduce: 10.05 | step: 62.17
+ 41%|████▏     | 289/700 [37:10<52:35,  7.68s/it]                                                 {'loss': 0.7312, 'learning_rate': 6.624073136735363e-05, 'epoch': 2.89}
+ 41%|████▏     | 289/700 [37:10<52:35,  7.68s/it][2024-06-18 22:44:04,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.30 | bwd_microstep: 1897.99 | bwd_inner_microstep: 1893.22 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.07
+[2024-06-18 22:44:08,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:44:08,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.87 | bwd_microstep: 1937.26 | bwd_inner_microstep: 1931.88 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.56
+[2024-06-18 22:44:08,649] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3943.14 | bwd: 3835.27 | bwd_inner: 3825.19 | bwd_allreduce: 9.91 | step: 61.64
+ 41%|████▏     | 290/700 [37:18<52:52,  7.74s/it]                                                 {'loss': 0.9433, 'learning_rate': 6.602176241738449e-05, 'epoch': 2.9}
+ 41%|████▏     | 290/700 [37:18<52:52,  7.74s/it][2024-06-18 22:44:12,145] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1833.01 | bwd_microstep: 1641.08 | bwd_inner_microstep: 1636.14 | bwd_allreduce_microstep: 4.85 | step_microstep: 0.14
+[2024-06-18 22:44:15,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:44:15,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1395.80 | bwd_microstep: 1611.79 | bwd_inner_microstep: 1606.41 | bwd_allreduce_microstep: 5.29 | step_microstep: 61.75
+[2024-06-18 22:44:15,232] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3228.78 | bwd: 3252.89 | bwd_inner: 3242.60 | bwd_allreduce: 10.15 | step: 61.89
+ 42%|████▏     | 291/700 [37:24<50:23,  7.39s/it]                                                 {'loss': 0.0984, 'learning_rate': 6.580245048669395e-05, 'epoch': 2.91}
+ 42%|████▏     | 291/700 [37:24<50:23,  7.39s/it][2024-06-18 22:44:18,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1650.63 | bwd_microstep: 1836.55 | bwd_inner_microstep: 1831.75 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.08
+[2024-06-18 22:44:22,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:44:22,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1980.18 | bwd_microstep: 1937.83 | bwd_inner_microstep: 1932.35 | bwd_allreduce_microstep: 5.40 | step_microstep: 62.31
+[2024-06-18 22:44:22,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3630.78 | bwd: 3774.40 | bwd_inner: 3764.14 | bwd_allreduce: 10.13 | step: 62.39
+ 42%|████▏     | 292/700 [37:32<50:30,  7.43s/it]                                                 {'loss': 1.0314, 'learning_rate': 6.558280027013154e-05, 'epoch': 2.92}
+ 42%|████▏     | 292/700 [37:32<50:30,  7.43s/it][2024-06-18 22:44:26,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.66 | bwd_microstep: 1912.78 | bwd_inner_microstep: 1907.89 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.07
+[2024-06-18 22:44:30,576] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:44:30,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.82 | bwd_microstep: 1889.40 | bwd_inner_microstep: 1883.91 | bwd_allreduce_microstep: 5.34 | step_microstep: 61.65
+[2024-06-18 22:44:30,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3931.45 | bwd: 3802.20 | bwd_inner: 3791.91 | bwd_allreduce: 10.09 | step: 61.73
+ 42%|████▏     | 293/700 [37:40<51:12,  7.55s/it]                                                 {'loss': 0.8026, 'learning_rate': 6.536281646978862e-05, 'epoch': 2.93}
+ 42%|████▏     | 293/700 [37:40<51:12,  7.55s/it][2024-06-18 22:44:34,241] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1897.84 | bwd_microstep: 1744.59 | bwd_inner_microstep: 1739.66 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.08
+[2024-06-18 22:44:38,222] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:44:38,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.98 | bwd_microstep: 1926.61 | bwd_inner_microstep: 1921.20 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.62
+[2024-06-18 22:44:38,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3871.78 | bwd: 3671.21 | bwd_inner: 3660.96 | bwd_allreduce: 10.08 | step: 61.70
+ 42%|████▏     | 294/700 [37:47<51:16,  7.58s/it]                                                 {'loss': 0.1717, 'learning_rate': 6.514250379489753e-05, 'epoch': 2.94}
+ 42%|████▏     | 294/700 [37:47<51:16,  7.58s/it][2024-06-18 22:44:41,325] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1421.61 | bwd_microstep: 1659.35 | bwd_inner_microstep: 1654.41 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.07
+[2024-06-18 22:44:45,019] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:44:45,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1887.30 | bwd_microstep: 1726.63 | bwd_inner_microstep: 1721.28 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.66
+[2024-06-18 22:44:45,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3308.88 | bwd: 3386.01 | bwd_inner: 3375.77 | bwd_allreduce: 10.05 | step: 61.74
+ 42%|████▏     | 295/700 [37:54<49:34,  7.34s/it]                                                 {'loss': 0.054, 'learning_rate': 6.492186696173097e-05, 'epoch': 2.95}
+ 42%|████▏     | 295/700 [37:54<49:34,  7.34s/it][2024-06-18 22:44:48,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1910.65 | bwd_microstep: 1812.14 | bwd_inner_microstep: 1807.09 | bwd_allreduce_microstep: 4.96 | step_microstep: 0.09
+[2024-06-18 22:44:52,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:44:52,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1861.01 | bwd_microstep: 1695.27 | bwd_inner_microstep: 1689.84 | bwd_allreduce_microstep: 5.29 | step_microstep: 62.01
+[2024-06-18 22:44:52,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3771.63 | bwd: 3507.44 | bwd_inner: 3497.00 | bwd_allreduce: 10.24 | step: 62.10
+ 42%|████▏     | 296/700 [38:01<49:31,  7.36s/it]                                                 {'loss': 0.0319, 'learning_rate': 6.47009106935009e-05, 'epoch': 2.96}
+ 42%|████▏     | 296/700 [38:01<49:31,  7.36s/it][2024-06-18 22:44:56,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.66 | bwd_microstep: 1899.11 | bwd_inner_microstep: 1894.29 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 22:45:00,035] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:45:00,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1805.21 | bwd_microstep: 1855.25 | bwd_inner_microstep: 1849.84 | bwd_allreduce_microstep: 5.29 | step_microstep: 61.92
+[2024-06-18 22:45:00,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3776.85 | bwd: 3754.38 | bwd_inner: 3744.20 | bwd_allreduce: 10.04 | step: 62.00
+ 42%|████▏     | 297/700 [38:09<49:57,  7.44s/it]                                                 {'loss': 0.7367, 'learning_rate': 6.447963972025751e-05, 'epoch': 2.97}
+ 42%|████▏     | 297/700 [38:09<49:57,  7.44s/it][2024-06-18 22:45:03,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1912.87 | bwd_microstep: 1804.16 | bwd_inner_microstep: 1799.31 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.08
+[2024-06-18 22:45:07,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:45:07,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1980.48 | bwd_microstep: 1926.33 | bwd_inner_microstep: 1921.00 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.81
+[2024-06-18 22:45:07,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3893.32 | bwd: 3730.52 | bwd_inner: 3720.34 | bwd_allreduce: 10.05 | step: 61.90
+ 43%|████▎     | 298/700 [38:17<50:25,  7.52s/it]                                                 {'loss': 0.4738, 'learning_rate': 6.425805877878793e-05, 'epoch': 2.98}
+ 43%|████▎     | 298/700 [38:17<50:25,  7.52s/it][2024-06-18 22:45:11,646] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.24 | bwd_microstep: 1892.45 | bwd_inner_microstep: 1887.53 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.08
+[2024-06-18 22:45:15,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:45:15,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.98 | bwd_microstep: 1928.11 | bwd_inner_microstep: 1922.44 | bwd_allreduce_microstep: 5.53 | step_microstep: 62.66
+[2024-06-18 22:45:15,629] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3944.20 | bwd: 3820.59 | bwd_inner: 3810.07 | bwd_allreduce: 10.30 | step: 62.74
+ 43%|████▎     | 299/700 [38:25<50:58,  7.63s/it]                                                 {'loss': 0.6486, 'learning_rate': 6.403617261251484e-05, 'epoch': 2.99}
+ 43%|████▎     | 299/700 [38:25<50:58,  7.63s/it][2024-06-18 22:45:19,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.63 | bwd_microstep: 1916.57 | bwd_inner_microstep: 1911.60 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.07
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+[2024-06-18 22:45:23,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:45:23,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1490.39 | bwd_microstep: 1802.82 | bwd_inner_microstep: 1797.43 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.86
+[2024-06-18 22:45:23,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3460.95 | bwd: 3719.41 | bwd_inner: 3709.10 | bwd_allreduce: 10.12 | step: 61.95
+ 43%|████▎     | 300/700 [38:33<51:46,  7.77s/it]                                                 {'loss': 0.8341, 'learning_rate': 6.381398597139492e-05, 'epoch': 3.0}
+ 43%|████▎     | 300/700 [38:33<51:46,  7.77s/it][2024-06-18 22:45:26,408] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:45:32,224] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:45:37,969] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:45:43,737] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:45:51,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1997.55 | bwd_microstep: 1954.93 | bwd_inner_microstep: 1950.22 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 22:45:55,219] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:45:55,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1984.85 | bwd_microstep: 1968.23 | bwd_inner_microstep: 1962.81 | bwd_allreduce_microstep: 5.34 | step_microstep: 61.96
+[2024-06-18 22:45:55,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3982.34 | bwd: 3923.19 | bwd_inner: 3913.04 | bwd_allreduce: 10.04 | step: 62.04
+ 43%|████▎     | 301/700 [39:04<1:38:59, 14.89s/it]                                                   {'loss': 0.6808, 'learning_rate': 6.359150361181715e-05, 'epoch': 3.01}
+ 43%|████▎     | 301/700 [39:04<1:38:59, 14.89s/it][2024-06-18 22:45:59,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.82 | bwd_microstep: 1919.76 | bwd_inner_microstep: 1914.91 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.07
+[2024-06-18 22:46:02,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:46:02,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1490.15 | bwd_microstep: 1807.66 | bwd_inner_microstep: 1802.27 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.77
+[2024-06-18 22:46:02,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3454.94 | bwd: 3727.44 | bwd_inner: 3717.23 | bwd_allreduce: 10.07 | step: 61.85
+ 43%|████▎     | 302/700 [39:11<1:23:37, 12.61s/it]                                                   {'loss': 0.7224, 'learning_rate': 6.336873029650104e-05, 'epoch': 3.02}
+ 43%|████▎     | 302/700 [39:11<1:23:37, 12.61s/it][2024-06-18 22:46:05,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1638.87 | bwd_microstep: 1827.99 | bwd_inner_microstep: 1823.04 | bwd_allreduce_microstep: 4.86 | step_microstep: 0.08
+[2024-06-18 22:46:09,698] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:46:09,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1885.10 | bwd_microstep: 1739.36 | bwd_inner_microstep: 1734.00 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.96
+[2024-06-18 22:46:09,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3523.94 | bwd: 3567.37 | bwd_inner: 3557.07 | bwd_allreduce: 10.17 | step: 62.04
+ 43%|████▎     | 303/700 [39:19<1:12:40, 10.98s/it]                                                   {'loss': 0.4141, 'learning_rate': 6.314567079439459e-05, 'epoch': 3.03}
+ 43%|████▎     | 303/700 [39:19<1:12:40, 10.98s/it][2024-06-18 22:46:13,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1956.01 | bwd_microstep: 1886.57 | bwd_inner_microstep: 1881.65 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.08
+[2024-06-18 22:46:17,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:46:17,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1958.33 | bwd_microstep: 1893.14 | bwd_inner_microstep: 1887.51 | bwd_allreduce_microstep: 5.53 | step_microstep: 64.49
+[2024-06-18 22:46:17,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3914.30 | bwd: 3779.73 | bwd_inner: 3769.23 | bwd_allreduce: 10.30 | step: 64.57
+ 43%|████▎     | 304/700 [39:26<1:06:10, 10.03s/it]                                                   {'loss': 0.549, 'learning_rate': 6.292232988057235e-05, 'epoch': 3.04}
+ 43%|████▎     | 304/700 [39:26<1:06:10, 10.03s/it][2024-06-18 22:46:21,074] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1860.56 | bwd_microstep: 1692.96 | bwd_inner_microstep: 1688.03 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.08
+[2024-06-18 22:46:24,207] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:46:24,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1410.54 | bwd_microstep: 1642.22 | bwd_inner_microstep: 1636.84 | bwd_allreduce_microstep: 5.30 | step_microstep: 62.58
+[2024-06-18 22:46:24,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3271.07 | bwd: 3335.20 | bwd_inner: 3324.94 | bwd_allreduce: 10.07 | step: 62.66
+ 44%|████▎     | 305/700 [39:33<59:27,  9.03s/it]                                                   {'loss': 0.0144, 'learning_rate': 6.269871233613301e-05, 'epoch': 3.05}
+ 44%|████▎     | 305/700 [39:33<59:27,  9.03s/it][2024-06-18 22:46:28,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1994.82 | bwd_microstep: 1963.50 | bwd_inner_microstep: 1958.78 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 22:46:32,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.92
+[2024-06-18 22:46:32,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1941.17 | bwd_microstep: 1870.28 | bwd_inner_microstep: 1864.77 | bwd_allreduce_microstep: 5.41 | step_microstep: 62.24
+[2024-06-18 22:46:32,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3935.96 | bwd: 3833.81 | bwd_inner: 3823.59 | bwd_allreduce: 10.11 | step: 62.33
+ 44%|████▎     | 306/700 [39:41<57:01,  8.68s/it]                                                 {'loss': 0.8906, 'learning_rate': 6.247482294809712e-05, 'epoch': 3.06}
+ 44%|████▎     | 306/700 [39:41<57:01,  8.68s/it][2024-06-18 22:46:35,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1879.30 | bwd_microstep: 1726.02 | bwd_inner_microstep: 1721.06 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.07
+[2024-06-18 22:46:39,668] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:46:39,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.08 | bwd_microstep: 1908.42 | bwd_inner_microstep: 1903.14 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.68
+[2024-06-18 22:46:39,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3850.35 | bwd: 3634.46 | bwd_inner: 3624.26 | bwd_allreduce: 10.04 | step: 61.76
+ 44%|████▍     | 307/700 [39:49<54:43,  8.36s/it]                                                 {'loss': 0.3686, 'learning_rate': 6.225066650930476e-05, 'epoch': 3.07}
+ 44%|████▍     | 307/700 [39:49<54:43,  8.36s/it][2024-06-18 22:46:43,540] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1960.34 | bwd_microstep: 1888.87 | bwd_inner_microstep: 1883.78 | bwd_allreduce_microstep: 4.95 | step_microstep: 0.14
+[2024-06-18 22:46:47,175] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:46:47,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1860.12 | bwd_microstep: 1694.16 | bwd_inner_microstep: 1688.91 | bwd_allreduce_microstep: 5.21 | step_microstep: 61.92
+[2024-06-18 22:46:47,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3820.42 | bwd: 3583.05 | bwd_inner: 3572.74 | bwd_allreduce: 10.16 | step: 62.07
+ 44%|████▍     | 308/700 [39:56<52:55,  8.10s/it]                                                 {'loss': 0.2347, 'learning_rate': 6.202624781831268e-05, 'epoch': 3.08}
+ 44%|████▍     | 308/700 [39:56<52:55,  8.10s/it][2024-06-18 22:46:51,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2002.23 | bwd_microstep: 1972.88 | bwd_inner_microstep: 1968.16 | bwd_allreduce_microstep: 4.67 | step_microstep: 0.07
+[2024-06-18 22:46:55,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:46:55,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1980.57 | bwd_microstep: 1932.97 | bwd_inner_microstep: 1927.66 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.71
+[2024-06-18 22:46:55,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3982.78 | bwd: 3905.88 | bwd_inner: 3895.84 | bwd_allreduce: 9.93 | step: 61.79
+ 44%|████▍     | 309/700 [40:04<52:34,  8.07s/it]                                                 {'loss': 0.7818, 'learning_rate': 6.18015716792918e-05, 'epoch': 3.09}
+ 44%|████▍     | 309/700 [40:04<52:34,  8.07s/it][2024-06-18 22:46:59,083] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.66 | bwd_microstep: 1924.67 | bwd_inner_microstep: 1919.81 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.09
+[2024-06-18 22:47:03,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:47:03,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.00 | bwd_microstep: 1900.17 | bwd_inner_microstep: 1894.89 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.68
+[2024-06-18 22:47:03,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3935.63 | bwd: 3824.86 | bwd_inner: 3814.72 | bwd_allreduce: 10.04 | step: 61.77
+ 44%|████▍     | 310/700 [40:12<52:02,  8.01s/it]                                                 {'loss': 0.5047, 'learning_rate': 6.157664290192421e-05, 'epoch': 3.1}
+ 44%|████▍     | 310/700 [40:12<52:02,  8.01s/it][2024-06-18 22:47:06,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.40 | bwd_microstep: 1931.80 | bwd_inner_microstep: 1926.83 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.07
+[2024-06-18 22:47:10,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:47:10,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.36 | bwd_microstep: 1935.58 | bwd_inner_microstep: 1930.06 | bwd_allreduce_microstep: 5.46 | step_microstep: 62.00
+[2024-06-18 22:47:10,952] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3950.73 | bwd: 3867.40 | bwd_inner: 3856.94 | bwd_allreduce: 10.27 | step: 62.08
+ 44%|████▍     | 311/700 [40:20<51:44,  7.98s/it]                                                 {'loss': 0.8547, 'learning_rate': 6.135146630130034e-05, 'epoch': 3.11}
+ 44%|████▍     | 311/700 [40:20<51:44,  7.98s/it][2024-06-18 22:47:12,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 737.04 | bwd_microstep: 852.97 | bwd_inner_microstep: 848.13 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 22:47:15,993] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:47:15,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1598.37 | bwd_microstep: 1753.59 | bwd_inner_microstep: 1748.09 | bwd_allreduce_microstep: 5.35 | step_microstep: 61.72
+[2024-06-18 22:47:15,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2335.38 | bwd: 2606.58 | bwd_inner: 2596.29 | bwd_allreduce: 10.09 | step: 61.80
+ 45%|████▍     | 312/700 [40:25<45:54,  7.10s/it]                                                 {'loss': 0.7416, 'learning_rate': 6.112604669781572e-05, 'epoch': 3.12}
+ 45%|████▍     | 312/700 [40:25<45:54,  7.10s/it][2024-06-18 22:47:17,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 734.79 | bwd_microstep: 843.13 | bwd_inner_microstep: 838.33 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 22:47:21,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 22:47:21,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2001.70 | bwd_microstep: 1974.47 | bwd_inner_microstep: 1968.94 | bwd_allreduce_microstep: 5.44 | step_microstep: 62.67
+[2024-06-18 22:47:21,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2736.46 | bwd: 2817.62 | bwd_inner: 2807.31 | bwd_allreduce: 10.16 | step: 62.75
+ 45%|████▍     | 313/700 [40:31<42:59,  6.67s/it]                                                 {'loss': 0.5756, 'learning_rate': 6.090038891706801e-05, 'epoch': 3.13}
+ 45%|████▍     | 313/700 [40:31<42:59,  6.67s/it][2024-06-18 22:47:25,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.47 | bwd_microstep: 1914.95 | bwd_inner_microstep: 1910.11 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 22:47:29,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:47:29,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1921.90 | bwd_microstep: 1812.70 | bwd_inner_microstep: 1807.40 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.91
+[2024-06-18 22:47:29,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3888.34 | bwd: 3727.67 | bwd_inner: 3717.53 | bwd_allreduce: 10.03 | step: 61.99
+ 45%|████▍     | 314/700 [40:38<44:55,  6.98s/it]                                                 {'loss': 0.3602, 'learning_rate': 6.067449778975349e-05, 'epoch': 3.14}
+ 45%|████▍     | 314/700 [40:38<44:55,  6.98s/it][2024-06-18 22:47:33,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.37 | bwd_microstep: 1936.55 | bwd_inner_microstep: 1931.67 | bwd_allreduce_microstep: 4.84 | step_microstep: 0.08
+[2024-06-18 22:47:37,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:47:37,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.51 | bwd_microstep: 1905.56 | bwd_inner_microstep: 1900.20 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.56
+[2024-06-18 22:47:37,259] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3943.84 | bwd: 3842.14 | bwd_inner: 3831.88 | bwd_allreduce: 10.13 | step: 61.66
+ 45%|████▌     | 315/700 [40:46<46:32,  7.25s/it]                                                 {'loss': 0.7834, 'learning_rate': 6.044837815156377e-05, 'epoch': 3.15}
+ 45%|████▌     | 315/700 [40:46<46:32,  7.25s/it][2024-06-18 22:47:41,245] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1996.78 | bwd_microstep: 1967.30 | bwd_inner_microstep: 1962.34 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.07
+[2024-06-18 22:47:44,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:47:44,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1890.62 | bwd_microstep: 1740.57 | bwd_inner_microstep: 1735.18 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.83
+[2024-06-18 22:47:44,958] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3887.37 | bwd: 3707.89 | bwd_inner: 3697.60 | bwd_allreduce: 10.10 | step: 61.91
+ 45%|████▌     | 316/700 [40:54<47:16,  7.39s/it]                                                 {'loss': 0.4475, 'learning_rate': 6.022203484308216e-05, 'epoch': 3.16}
+ 45%|████▌     | 316/700 [40:54<47:16,  7.39s/it][2024-06-18 22:47:48,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1991.87 | bwd_microstep: 1958.13 | bwd_inner_microstep: 1953.06 | bwd_allreduce_microstep: 4.93 | step_microstep: 0.14
+[2024-06-18 22:47:52,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:47:52,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1867.73 | bwd_microstep: 1693.89 | bwd_inner_microstep: 1688.64 | bwd_allreduce_microstep: 5.21 | step_microstep: 61.81
+[2024-06-18 22:47:52,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3859.57 | bwd: 3652.05 | bwd_inner: 3641.75 | bwd_allreduce: 10.14 | step: 61.96
+ 45%|████▌     | 317/700 [41:02<47:35,  7.46s/it]                                                 {'loss': 0.5393, 'learning_rate': 5.9995472709680234e-05, 'epoch': 3.17}
+ 45%|████▌     | 317/700 [41:02<47:35,  7.46s/it][2024-06-18 22:47:56,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1890.24 | bwd_microstep: 1743.38 | bwd_inner_microstep: 1738.55 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.07
+[2024-06-18 22:47:59,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:47:59,701] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1528.03 | bwd_microstep: 1866.52 | bwd_inner_microstep: 1861.11 | bwd_allreduce_microstep: 5.33 | step_microstep: 61.82
+[2024-06-18 22:47:59,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3418.23 | bwd: 3609.92 | bwd_inner: 3599.68 | bwd_allreduce: 10.11 | step: 61.91
+ 45%|████▌     | 318/700 [41:09<46:50,  7.36s/it]                                                 {'loss': 0.4159, 'learning_rate': 5.976869660141389e-05, 'epoch': 3.18}
+ 45%|████▌     | 318/700 [41:09<46:50,  7.36s/it][2024-06-18 22:48:03,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1912.42 | bwd_microstep: 1808.39 | bwd_inner_microstep: 1803.66 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 22:48:06,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:48:06,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1738.35 | bwd_microstep: 1713.65 | bwd_inner_microstep: 1708.21 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.74
+[2024-06-18 22:48:06,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3650.74 | bwd: 3522.07 | bwd_inner: 3511.92 | bwd_allreduce: 9.98 | step: 61.82
+ 46%|████▌     | 319/700 [41:16<46:33,  7.33s/it]                                                 {'loss': 0.0905, 'learning_rate': 5.954171137291968e-05, 'epoch': 3.19}
+ 46%|████▌     | 319/700 [41:16<46:33,  7.33s/it][2024-06-18 22:48:10,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1892.33 | bwd_microstep: 1746.89 | bwd_inner_microstep: 1742.11 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 22:48:14,569] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:48:14,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.57 | bwd_microstep: 1889.02 | bwd_inner_microstep: 1883.75 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.72
+[2024-06-18 22:48:14,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3855.87 | bwd: 3635.93 | bwd_inner: 3625.86 | bwd_allreduce: 9.99 | step: 61.80
+ 46%|████▌     | 320/700 [41:24<46:56,  7.41s/it]                                                 {'loss': 0.3343, 'learning_rate': 5.931452188331083e-05, 'epoch': 3.2}
+ 46%|████▌     | 320/700 [41:24<46:56,  7.41s/it][2024-06-18 22:48:18,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1912.33 | bwd_microstep: 1806.44 | bwd_inner_microstep: 1801.47 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.07
+[2024-06-18 22:48:22,370] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:48:22,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2002.46 | bwd_microstep: 1977.35 | bwd_inner_microstep: 1971.85 | bwd_allreduce_microstep: 5.36 | step_microstep: 61.60
+[2024-06-18 22:48:22,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3914.76 | bwd: 3783.82 | bwd_inner: 3773.42 | bwd_allreduce: 10.16 | step: 61.68
+ 46%|████▌     | 321/700 [41:31<47:33,  7.53s/it]                                                 {'loss': 0.4226, 'learning_rate': 5.908713299607318e-05, 'epoch': 3.21}
+ 46%|████▌     | 321/700 [41:31<47:33,  7.53s/it][2024-06-18 22:48:25,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1657.02 | bwd_microstep: 1707.01 | bwd_inner_microstep: 1702.20 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 22:48:29,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:48:29,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.58 | bwd_microstep: 1929.13 | bwd_inner_microstep: 1923.57 | bwd_allreduce_microstep: 5.48 | step_microstep: 62.53
+[2024-06-18 22:48:29,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3632.57 | bwd: 3636.17 | bwd_inner: 3625.81 | bwd_allreduce: 10.22 | step: 62.61
+ 46%|████▌     | 322/700 [41:39<47:07,  7.48s/it]                                                 {'loss': 0.6134, 'learning_rate': 5.885954957896115e-05, 'epoch': 3.22}
+ 46%|████▌     | 322/700 [41:39<47:07,  7.48s/it][2024-06-18 22:48:33,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1881.16 | bwd_microstep: 1726.68 | bwd_inner_microstep: 1721.72 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.08
+[2024-06-18 22:48:37,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:48:37,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2001.48 | bwd_microstep: 1970.39 | bwd_inner_microstep: 1965.11 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.84
+[2024-06-18 22:48:37,425] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3882.60 | bwd: 3697.09 | bwd_inner: 3686.88 | bwd_allreduce: 10.05 | step: 61.92
+ 46%|████▌     | 323/700 [41:46<47:23,  7.54s/it]                                                 {'loss': 0.2971, 'learning_rate': 5.863177650389347e-05, 'epoch': 3.23}
+ 46%|████▌     | 323/700 [41:46<47:23,  7.54s/it][2024-06-18 22:48:41,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1960.17 | bwd_microstep: 1890.13 | bwd_inner_microstep: 1885.12 | bwd_allreduce_microstep: 4.86 | step_microstep: 0.08
+[2024-06-18 22:48:45,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:48:45,272] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.81 | bwd_microstep: 1920.28 | bwd_inner_microstep: 1914.93 | bwd_allreduce_microstep: 5.28 | step_microstep: 62.17
+[2024-06-18 22:48:45,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3934.95 | bwd: 3810.44 | bwd_inner: 3800.12 | bwd_allreduce: 10.12 | step: 62.26
+ 46%|████▋     | 324/700 [41:54<47:50,  7.63s/it]                                                 {'loss': 0.6429, 'learning_rate': 5.8403818646848915e-05, 'epoch': 3.24}
+ 46%|████▋     | 324/700 [41:54<47:50,  7.63s/it][2024-06-18 22:48:49,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.07 | bwd_microstep: 1925.86 | bwd_inner_microstep: 1920.98 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:48:53,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.92
+[2024-06-18 22:48:53,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1981.51 | bwd_microstep: 1937.48 | bwd_inner_microstep: 1932.15 | bwd_allreduce_microstep: 5.28 | step_microstep: 63.15
+[2024-06-18 22:48:53,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3953.55 | bwd: 3863.37 | bwd_inner: 3853.20 | bwd_allreduce: 9.99 | step: 63.24
+ 46%|████▋     | 325/700 [42:02<48:14,  7.72s/it]                                                 {'loss': 0.5874, 'learning_rate': 5.8175680887761955e-05, 'epoch': 3.25}
+ 46%|████▋     | 325/700 [42:02<48:14,  7.72s/it][2024-06-18 22:48:56,511] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1490.87 | bwd_microstep: 1805.88 | bwd_inner_microstep: 1801.06 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 22:49:00,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:49:00,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1867.34 | bwd_microstep: 1693.91 | bwd_inner_microstep: 1688.48 | bwd_allreduce_microstep: 5.34 | step_microstep: 61.99
+[2024-06-18 22:49:00,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3358.18 | bwd: 3499.81 | bwd_inner: 3489.58 | bwd_allreduce: 10.09 | step: 62.07
+ 47%|████▋     | 326/700 [42:09<46:41,  7.49s/it]                                                 {'loss': 0.3562, 'learning_rate': 5.794736811041821e-05, 'epoch': 3.26}
+ 47%|████▋     | 326/700 [42:09<46:41,  7.49s/it][2024-06-18 22:49:03,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1893.60 | bwd_microstep: 1740.85 | bwd_inner_microstep: 1736.09 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.08
+[2024-06-18 22:49:07,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:49:07,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1996.71 | bwd_microstep: 1957.89 | bwd_inner_microstep: 1952.51 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.82
+[2024-06-18 22:49:07,843] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3890.27 | bwd: 3698.76 | bwd_inner: 3688.65 | bwd_allreduce: 9.96 | step: 61.90
+ 47%|████▋     | 327/700 [42:17<46:56,  7.55s/it]                                                 {'loss': 0.3763, 'learning_rate': 5.771888520234997e-05, 'epoch': 3.27}
+ 47%|████▋     | 327/700 [42:17<46:56,  7.55s/it][2024-06-18 22:49:11,827] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1998.03 | bwd_microstep: 1964.28 | bwd_inner_microstep: 1959.54 | bwd_allreduce_microstep: 4.66 | step_microstep: 0.08
+[2024-06-18 22:49:15,818] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:49:15,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1982.31 | bwd_microstep: 1929.47 | bwd_inner_microstep: 1924.18 | bwd_allreduce_microstep: 5.21 | step_microstep: 61.77
+[2024-06-18 22:49:15,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3980.31 | bwd: 3893.78 | bwd_inner: 3883.75 | bwd_allreduce: 9.88 | step: 61.85
+ 47%|████▋     | 328/700 [42:25<47:36,  7.68s/it]                                                 {'loss': 0.4824, 'learning_rate': 5.749023705473153e-05, 'epoch': 3.28}
+ 47%|████▋     | 328/700 [42:25<47:36,  7.68s/it][2024-06-18 22:49:19,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.15 | bwd_microstep: 1964.52 | bwd_inner_microstep: 1959.66 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.08
+[2024-06-18 22:49:23,054] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:49:23,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1447.22 | bwd_microstep: 1726.79 | bwd_inner_microstep: 1721.49 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.94
+[2024-06-18 22:49:23,055] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3442.33 | bwd: 3691.34 | bwd_inner: 3681.25 | bwd_allreduce: 9.91 | step: 62.03
+ 47%|████▋     | 329/700 [42:32<46:39,  7.55s/it]                                                 {'loss': 0.595, 'learning_rate': 5.726142856227452e-05, 'epoch': 3.29}
+ 47%|████▋     | 329/700 [42:32<46:39,  7.55s/it][2024-06-18 22:49:27,025] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1994.10 | bwd_microstep: 1953.92 | bwd_inner_microstep: 1949.17 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:49:30,960] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:49:30,960] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.41 | bwd_microstep: 1890.86 | bwd_inner_microstep: 1885.35 | bwd_allreduce_microstep: 5.36 | step_microstep: 61.52
+[2024-06-18 22:49:30,961] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3958.48 | bwd: 3844.80 | bwd_inner: 3834.58 | bwd_allreduce: 10.04 | step: 61.60
+ 47%|████▋     | 330/700 [42:40<47:11,  7.65s/it]                                                 {'loss': 0.5615, 'learning_rate': 5.703246462312307e-05, 'epoch': 3.3}
+ 47%|████▋     | 330/700 [42:40<47:11,  7.65s/it][2024-06-18 22:49:34,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1911.80 | bwd_microstep: 1802.49 | bwd_inner_microstep: 1797.48 | bwd_allreduce_microstep: 4.94 | step_microstep: 0.09
+[2024-06-18 22:49:38,728] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:49:38,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1997.05 | bwd_microstep: 1954.21 | bwd_inner_microstep: 1948.90 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.88
+[2024-06-18 22:49:38,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3908.82 | bwd: 3756.72 | bwd_inner: 3746.40 | bwd_allreduce: 10.22 | step: 61.98
+ 47%|████▋     | 331/700 [42:48<47:16,  7.69s/it]                                                 {'loss': 0.4391, 'learning_rate': 5.6803350138749034e-05, 'epoch': 3.31}
+ 47%|████▋     | 331/700 [42:48<47:16,  7.69s/it][2024-06-18 22:49:42,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.12 | bwd_microstep: 1916.35 | bwd_inner_microstep: 1911.44 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.07
+[2024-06-18 22:49:46,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 22:49:46,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.73 | bwd_microstep: 1909.81 | bwd_inner_microstep: 1904.37 | bwd_allreduce_microstep: 5.36 | step_microstep: 61.98
+[2024-06-18 22:49:46,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3951.82 | bwd: 3826.18 | bwd_inner: 3815.88 | bwd_allreduce: 10.12 | step: 62.06
+ 47%|████���     | 332/700 [42:56<47:30,  7.75s/it]                                                 {'loss': 0.9026, 'learning_rate': 5.6574090013846946e-05, 'epoch': 3.32}
+ 47%|████▋     | 332/700 [42:56<47:30,  7.75s/it][2024-06-18 22:49:49,686] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1415.34 | bwd_microstep: 1639.39 | bwd_inner_microstep: 1634.58 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 22:49:53,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:49:53,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1919.46 | bwd_microstep: 1811.18 | bwd_inner_microstep: 1805.69 | bwd_allreduce_microstep: 5.35 | step_microstep: 61.57
+[2024-06-18 22:49:53,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3334.77 | bwd: 3450.60 | bwd_inner: 3440.35 | bwd_allreduce: 10.06 | step: 61.65
+ 48%|████▊     | 333/700 [43:02<45:48,  7.49s/it]                                                 {'loss': 0.0241, 'learning_rate': 5.634468915622915e-05, 'epoch': 3.33}
+ 48%|████▊     | 333/700 [43:02<45:48,  7.49s/it][2024-06-18 22:49:57,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.74 | bwd_microstep: 1888.08 | bwd_inner_microstep: 1883.35 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 22:50:01,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:50:01,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2015.71 | bwd_microstep: 1999.84 | bwd_inner_microstep: 1994.34 | bwd_allreduce_microstep: 5.45 | step_microstep: 62.76
+[2024-06-18 22:50:01,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3978.42 | bwd: 3887.94 | bwd_inner: 3877.69 | bwd_allreduce: 10.15 | step: 62.84
+ 48%|████▊     | 334/700 [43:10<46:33,  7.63s/it]                                                 {'loss': 0.7012, 'learning_rate': 5.6115152476720635e-05, 'epoch': 3.34}
+ 48%|████▊     | 334/700 [43:10<46:33,  7.63s/it][2024-06-18 22:50:05,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.72 | bwd_microstep: 1927.74 | bwd_inner_microstep: 1922.84 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 22:50:09,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:50:09,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.52 | bwd_microstep: 1905.21 | bwd_inner_microstep: 1899.84 | bwd_allreduce_microstep: 5.28 | step_microstep: 63.20
+[2024-06-18 22:50:09,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3951.21 | bwd: 3832.97 | bwd_inner: 3822.78 | bwd_allreduce: 10.02 | step: 63.28
+ 48%|████▊     | 335/700 [43:18<46:53,  7.71s/it]                                                 {'loss': 0.7037, 'learning_rate': 5.5885484889054016e-05, 'epoch': 3.35}
+ 48%|████▊     | 335/700 [43:18<46:53,  7.71s/it][2024-06-18 22:50:13,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1889.31 | bwd_microstep: 1741.24 | bwd_inner_microstep: 1736.30 | bwd_allreduce_microstep: 4.86 | step_microstep: 0.09
+[2024-06-18 22:50:16,110] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:50:16,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1400.56 | bwd_microstep: 1623.45 | bwd_inner_microstep: 1618.13 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.50
+[2024-06-18 22:50:16,111] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3289.84 | bwd: 3364.71 | bwd_inner: 3354.46 | bwd_allreduce: 10.11 | step: 61.60
+ 48%|████▊     | 336/700 [43:25<45:02,  7.42s/it]                                                 {'loss': 0.4188, 'learning_rate': 5.565569130976422e-05, 'epoch': 3.36}
+ 48%|████▊     | 336/700 [43:25<45:02,  7.42s/it][2024-06-18 22:50:19,303] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1542.12 | bwd_microstep: 1627.66 | bwd_inner_microstep: 1622.84 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.08
+[2024-06-18 22:50:23,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:50:23,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.00 | bwd_microstep: 1922.60 | bwd_inner_microstep: 1917.35 | bwd_allreduce_microstep: 5.19 | step_microstep: 61.66
+[2024-06-18 22:50:23,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3515.09 | bwd: 3550.28 | bwd_inner: 3540.21 | bwd_allreduce: 9.96 | step: 61.75
+ 48%|████▊     | 337/700 [43:32<44:26,  7.35s/it]                                                 {'loss': 0.3808, 'learning_rate': 5.542577665808332e-05, 'epoch': 3.37}
+ 48%|████▊     | 337/700 [43:32<44:26,  7.35s/it][2024-06-18 22:50:27,280] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2004.79 | bwd_microstep: 1974.82 | bwd_inner_microstep: 1970.03 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:50:30,854] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 22:50:30,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1653.76 | bwd_microstep: 1840.12 | bwd_inner_microstep: 1834.72 | bwd_allreduce_microstep: 5.24 | step_microstep: 63.53
+[2024-06-18 22:50:30,855] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3658.52 | bwd: 3814.97 | bwd_inner: 3804.84 | bwd_allreduce: 9.97 | step: 63.61
+ 48%|████▊     | 338/700 [43:40<44:44,  7.42s/it]                                                 {'loss': 0.8797, 'learning_rate': 5.5195745855835226e-05, 'epoch': 3.38}
+ 48%|████▊     | 338/700 [43:40<44:44,  7.42s/it][2024-06-18 22:50:34,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1443.37 | bwd_microstep: 1720.92 | bwd_inner_microstep: 1716.13 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:50:38,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 22:50:38,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.19 | bwd_microstep: 1961.79 | bwd_inner_microstep: 1956.13 | bwd_allreduce_microstep: 5.57 | step_microstep: 64.36
+[2024-06-18 22:50:38,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3438.53 | bwd: 3682.73 | bwd_inner: 3672.30 | bwd_allreduce: 10.28 | step: 64.44
+ 48%|████▊     | 339/700 [43:47<44:16,  7.36s/it]                                                 {'loss': 0.2011, 'learning_rate': 5.496560382733028e-05, 'epoch': 3.39}
+ 48%|████▊     | 339/700 [43:47<44:16,  7.36s/it][2024-06-18 22:50:41,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1813.45 | bwd_microstep: 1893.85 | bwd_inner_microstep: 1889.04 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 22:50:45,866] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:50:45,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2004.81 | bwd_microstep: 1971.65 | bwd_inner_microstep: 1966.14 | bwd_allreduce_microstep: 5.37 | step_microstep: 61.75
+[2024-06-18 22:50:45,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3818.23 | bwd: 3865.52 | bwd_inner: 3855.24 | bwd_allreduce: 10.11 | step: 61.83
+ 49%|████▊     | 340/700 [43:55<44:55,  7.49s/it]                                                 {'loss': 0.7986, 'learning_rate': 5.4735355499259855e-05, 'epoch': 3.4}
+ 49%|████▊     | 340/700 [43:55<44:55,  7.49s/it][2024-06-18 22:50:49,838] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1991.39 | bwd_microstep: 1957.69 | bwd_inner_microstep: 1952.88 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 22:50:53,799] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 22:50:53,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.76 | bwd_microstep: 1910.54 | bwd_inner_microstep: 1905.17 | bwd_allreduce_microstep: 5.32 | step_microstep: 62.08
+[2024-06-18 22:50:53,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3962.12 | bwd: 3868.25 | bwd_inner: 3858.05 | bwd_allreduce: 10.10 | step: 62.16
+ 49%|████▊     | 341/700 [44:03<45:35,  7.62s/it]                                                 {'loss': 0.5735, 'learning_rate': 5.4505005800590945e-05, 'epoch': 3.41}
+ 49%|████▊     | 341/700 [44:03<45:35,  7.62s/it][2024-06-18 22:50:56,882] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1415.98 | bwd_microstep: 1644.59 | bwd_inner_microstep: 1639.81 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.08
+[2024-06-18 22:51:00,813] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:51:00,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.28 | bwd_microstep: 1890.85 | bwd_inner_microstep: 1885.38 | bwd_allreduce_microstep: 5.33 | step_microstep: 61.75
+[2024-06-18 22:51:00,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3377.22 | bwd: 3535.46 | bwd_inner: 3525.26 | bwd_allreduce: 10.01 | step: 61.83
+ 49%|████▉     | 342/700 [44:10<44:23,  7.44s/it]                                                 {'loss': 0.8255, 'learning_rate': 5.427455966246057e-05, 'epoch': 3.42}
+ 49%|████▉     | 342/700 [44:10<44:23,  7.44s/it][2024-06-18 22:51:04,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.49 | bwd_microstep: 1891.72 | bwd_inner_microstep: 1886.79 | bwd_allreduce_microstep: 4.89 | step_microstep: 0.14
+[2024-06-18 22:51:08,624] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:51:08,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.51 | bwd_microstep: 1888.47 | bwd_inner_microstep: 1883.20 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.94
+[2024-06-18 22:51:08,625] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3927.98 | bwd: 3780.22 | bwd_inner: 3769.99 | bwd_allreduce: 10.14 | step: 62.09
+ 49%|████▉     | 343/700 [44:18<44:55,  7.55s/it]                                                 {'loss': 0.9377, 'learning_rate': 5.4044022018070214e-05, 'epoch': 3.43}
+ 49%|████▉     | 343/700 [44:18<44:55,  7.55s/it][2024-06-18 22:51:12,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.04 | bwd_microstep: 1891.20 | bwd_inner_microstep: 1886.40 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 22:51:16,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:51:16,441] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.19 | bwd_microstep: 1889.49 | bwd_inner_microstep: 1884.00 | bwd_allreduce_microstep: 5.35 | step_microstep: 61.62
+[2024-06-18 22:51:16,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3934.19 | bwd: 3780.72 | bwd_inner: 3770.45 | bwd_allreduce: 10.11 | step: 61.70
+ 49%|████▉     | 344/700 [44:25<45:16,  7.63s/it]                                                 {'loss': 0.5552, 'learning_rate': 5.3813397802580334e-05, 'epoch': 3.44}
+ 49%|████▉     | 344/700 [44:25<45:16,  7.63s/it][2024-06-18 22:51:20,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.05 | bwd_microstep: 1920.29 | bwd_inner_microstep: 1915.32 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.09
+[2024-06-18 22:51:24,022] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:51:24,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1682.37 | bwd_microstep: 1905.93 | bwd_inner_microstep: 1900.56 | bwd_allreduce_microstep: 5.29 | step_microstep: 61.77
+[2024-06-18 22:51:24,023] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3652.39 | bwd: 3826.24 | bwd_inner: 3815.94 | bwd_allreduce: 10.11 | step: 61.86
+ 49%|████▉     | 345/700 [44:33<45:03,  7.62s/it]                                                 {'loss': 0.7921, 'learning_rate': 5.358269195300454e-05, 'epoch': 3.45}
+ 49%|████▉     | 345/700 [44:33<45:03,  7.62s/it][2024-06-18 22:51:27,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1883.97 | bwd_microstep: 1725.20 | bwd_inner_microstep: 1720.47 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.08
+[2024-06-18 22:51:31,621] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:51:31,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.93 | bwd_microstep: 1917.42 | bwd_inner_microstep: 1911.93 | bwd_allreduce_microstep: 5.41 | step_microstep: 62.29
+[2024-06-18 22:51:31,622] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3853.87 | bwd: 3642.65 | bwd_inner: 3632.42 | bwd_allreduce: 10.12 | step: 62.37
+ 49%|████▉     | 346/700 [44:41<44:54,  7.61s/it]                                                 {'loss': 0.3424, 'learning_rate': 5.335190940810407e-05, 'epoch': 3.46}
+ 49%|████▉     | 346/700 [44:41<44:54,  7.61s/it][2024-06-18 22:51:35,502] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.04 | bwd_microstep: 1894.00 | bwd_inner_microstep: 1889.22 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.08
+[2024-06-18 22:51:39,465] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:51:39,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.24 | bwd_microstep: 1909.77 | bwd_inner_microstep: 1904.40 | bwd_allreduce_microstep: 5.29 | step_microstep: 61.99
+[2024-06-18 22:51:39,466] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3938.25 | bwd: 3803.79 | bwd_inner: 3793.64 | bwd_allreduce: 10.05 | step: 62.07
+ 50%|████▉     | 347/700 [44:48<45:11,  7.68s/it]                                                 {'loss': 0.6281, 'learning_rate': 5.312105510828196e-05, 'epoch': 3.47}
+ 50%|████▉     | 347/700 [44:48<45:11,  7.68s/it][2024-06-18 22:51:43,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1658.68 | bwd_microstep: 1868.04 | bwd_inner_microstep: 1862.76 | bwd_allreduce_microstep: 5.17 | step_microstep: 0.10
+[2024-06-18 22:51:47,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:51:47,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1982.55 | bwd_microstep: 1926.58 | bwd_inner_microstep: 1921.25 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.83
+[2024-06-18 22:51:47,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3641.19 | bwd: 3794.64 | bwd_inner: 3784.05 | bwd_allreduce: 10.44 | step: 61.93
+ 50%|████▉     | 348/700 [44:56<44:48,  7.64s/it]                                                 {'loss': 0.7099, 'learning_rate': 5.289013399547732e-05, 'epoch': 3.48}
+ 50%|████▉     | 348/700 [44:56<44:48,  7.64s/it][2024-06-18 22:51:50,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1988.60 | bwd_microstep: 1957.92 | bwd_inner_microstep: 1950.78 | bwd_allreduce_microstep: 7.02 | step_microstep: 0.11
+[2024-06-18 22:51:54,079] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.94
+[2024-06-18 22:51:54,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1396.01 | bwd_microstep: 1616.26 | bwd_inner_microstep: 1610.59 | bwd_allreduce_microstep: 5.55 | step_microstep: 62.99
+[2024-06-18 22:51:54,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3384.57 | bwd: 3574.21 | bwd_inner: 3561.46 | bwd_allreduce: 12.58 | step: 63.11
+ 50%|████▉     | 349/700 [45:03<43:41,  7.47s/it]                                                 {'loss': 0.694, 'learning_rate': 5.265915101305952e-05, 'epoch': 3.49}
+ 50%|████▉     | 349/700 [45:03<43:41,  7.47s/it][2024-06-18 22:51:57,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.86 | bwd_microstep: 1925.21 | bwd_inner_microstep: 1920.35 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 22:52:01,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:52:01,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1835.61 | bwd_microstep: 1640.71 | bwd_inner_microstep: 1635.40 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.72
+[2024-06-18 22:52:01,556] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3807.45 | bwd: 3565.94 | bwd_inner: 3555.82 | bwd_allreduce: 9.96 | step: 61.79
+ 50%|█████     | 350/700 [45:11<43:34,  7.47s/it]                                                 {'loss': 0.2422, 'learning_rate': 5.242811110572242e-05, 'epoch': 3.5}
+ 50%|█████     | 350/700 [45:11<43:34,  7.47s/it][2024-06-18 22:52:04,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1442.16 | bwd_microstep: 1722.84 | bwd_inner_microstep: 1718.06 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.08
+[2024-06-18 22:52:08,741] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:52:08,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1983.72 | bwd_microstep: 1935.77 | bwd_inner_microstep: 1930.31 | bwd_allreduce_microstep: 5.37 | step_microstep: 62.06
+[2024-06-18 22:52:08,742] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3425.85 | bwd: 3658.63 | bwd_inner: 3648.41 | bwd_allreduce: 10.08 | step: 62.14
+ 50%|█████     | 351/700 [45:18<42:57,  7.39s/it]                                                 {'loss': 0.2857, 'learning_rate': 5.219701921937845e-05, 'epoch': 3.51}
+ 50%|█████     | 351/700 [45:18<42:57,  7.39s/it][2024-06-18 22:52:12,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1993.77 | bwd_microstep: 1959.66 | bwd_inner_microstep: 1954.81 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.08
+[2024-06-18 22:52:16,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:52:16,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1965.35 | bwd_microstep: 1895.83 | bwd_inner_microstep: 1890.46 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.78
+[2024-06-18 22:52:16,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3959.08 | bwd: 3855.51 | bwd_inner: 3845.40 | bwd_allreduce: 9.90 | step: 61.86
+ 50%|█████     | 352/700 [45:26<43:45,  7.55s/it]                                                 {'loss': 0.7273, 'learning_rate': 5.1965880301052784e-05, 'epoch': 3.52}
+ 50%|█████     | 352/700 [45:26<43:45,  7.55s/it][2024-06-18 22:52:20,312] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1889.29 | bwd_microstep: 1741.63 | bwd_inner_microstep: 1736.82 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.09
+[2024-06-18 22:52:24,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:52:24,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.60 | bwd_microstep: 1907.31 | bwd_inner_microstep: 1901.98 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.98
+[2024-06-18 22:52:24,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3862.85 | bwd: 3648.97 | bwd_inner: 3638.83 | bwd_allreduce: 10.00 | step: 62.07
+ 50%|█████     | 353/700 [45:33<43:45,  7.57s/it]                                                 {'loss': 0.3574, 'learning_rate': 5.17346992987774e-05, 'epoch': 3.53}
+ 50%|█████     | 353/700 [45:33<43:45,  7.57s/it][2024-06-18 22:52:28,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.18 | bwd_microstep: 1928.75 | bwd_inner_microstep: 1923.93 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 22:52:32,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:52:32,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2004.11 | bwd_microstep: 1969.48 | bwd_inner_microstep: 1964.15 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.81
+[2024-06-18 22:52:32,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3975.26 | bwd: 3898.25 | bwd_inner: 3888.19 | bwd_allreduce: 9.91 | step: 61.89
+ 51%|█████     | 354/700 [45:41<44:20,  7.69s/it]                                                 {'loss': 0.6972, 'learning_rate': 5.15034811614852e-05, 'epoch': 3.54}
+ 51%|█████     | 354/700 [45:41<44:20,  7.69s/it][2024-06-18 22:52:36,136] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.15 | bwd_microstep: 1898.51 | bwd_inner_microstep: 1893.71 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:52:40,125] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:52:40,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.13 | bwd_microstep: 1935.08 | bwd_inner_microstep: 1929.62 | bwd_allreduce_microstep: 5.29 | step_microstep: 61.66
+[2024-06-18 22:52:40,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3942.24 | bwd: 3833.61 | bwd_inner: 3823.42 | bwd_allreduce: 10.00 | step: 61.75
+ 51%|█████     | 355/700 [45:49<44:32,  7.75s/it]                                                 {'loss': 0.6574, 'learning_rate': 5.1272230838904015e-05, 'epoch': 3.55}
+ 51%|█████     | 355/700 [45:49<44:32,  7.75s/it][2024-06-18 22:52:43,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1960.23 | bwd_microstep: 1889.44 | bwd_inner_microstep: 1884.69 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 22:52:47,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:52:47,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1500.64 | bwd_microstep: 1829.55 | bwd_inner_microstep: 1823.99 | bwd_allreduce_microstep: 5.48 | step_microstep: 62.53
+[2024-06-18 22:52:47,408] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3460.84 | bwd: 3719.01 | bwd_inner: 3708.71 | bwd_allreduce: 10.17 | step: 62.61
+ 51%|█████     | 356/700 [45:56<43:36,  7.61s/it]                                                 {'loss': 0.6237, 'learning_rate': 5.1040953281450684e-05, 'epoch': 3.56}
+ 51%|█████     | 356/700 [45:56<43:36,  7.61s/it][2024-06-18 22:52:51,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1992.34 | bwd_microstep: 1955.96 | bwd_inner_microstep: 1951.08 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.08
+[2024-06-18 22:52:55,340] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:52:55,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.37 | bwd_microstep: 1912.46 | bwd_inner_microstep: 1907.07 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.86
+[2024-06-18 22:52:55,341] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3961.68 | bwd: 3868.44 | bwd_inner: 3858.26 | bwd_allreduce: 9.93 | step: 61.95
+ 51%|█████     | 357/700 [46:04<44:02,  7.70s/it]                                                 {'loss': 0.7119, 'learning_rate': 5.080965344012508e-05, 'epoch': 3.57}
+ 51%|█████     | 357/700 [46:04<44:02,  7.70s/it][2024-06-18 22:52:59,080] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1910.93 | bwd_microstep: 1806.01 | bwd_inner_microstep: 1801.31 | bwd_allreduce_microstep: 4.66 | step_microstep: 0.08
+[2024-06-18 22:53:02,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:53:02,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1506.22 | bwd_microstep: 1841.21 | bwd_inner_microstep: 1835.76 | bwd_allreduce_microstep: 5.30 | step_microstep: 62.06
+[2024-06-18 22:53:02,507] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3417.12 | bwd: 3647.25 | bwd_inner: 3637.14 | bwd_allreduce: 9.98 | step: 62.14
+ 51%|█████     | 358/700 [46:11<42:59,  7.54s/it]                                                 {'loss': 0.4174, 'learning_rate': 5.057833626640408e-05, 'epoch': 3.58}
+ 51%|█████     | 358/700 [46:12<42:59,  7.54s/it][2024-06-18 22:53:06,449] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1983.39 | bwd_microstep: 1936.66 | bwd_inner_microstep: 1931.87 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 22:53:10,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:53:10,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.42 | bwd_microstep: 1910.32 | bwd_inner_microstep: 1904.91 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.67
+[2024-06-18 22:53:10,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3954.78 | bwd: 3847.00 | bwd_inner: 3836.88 | bwd_allreduce: 9.95 | step: 61.75
+ 51%|█████▏    | 359/700 [46:19<43:28,  7.65s/it]                                                 {'loss': 0.7423, 'learning_rate': 5.0347006712135646e-05, 'epoch': 3.59}
+ 51%|█████▏    | 359/700 [46:19<43:28,  7.65s/it][2024-06-18 22:53:13,776] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1506.82 | bwd_microstep: 1837.71 | bwd_inner_microstep: 1832.70 | bwd_allreduce_microstep: 4.91 | step_microstep: 0.14
+[2024-06-18 22:53:17,419] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:53:17,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1866.50 | bwd_microstep: 1695.07 | bwd_inner_microstep: 1689.70 | bwd_allreduce_microstep: 5.28 | step_microstep: 62.69
+[2024-06-18 22:53:17,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3373.28 | bwd: 3532.80 | bwd_inner: 3522.47 | bwd_allreduce: 10.19 | step: 62.84
+ 51%|█████▏    | 360/700 [46:26<42:16,  7.46s/it]                                                 {'loss': 0.5093, 'learning_rate': 5.011566972943272e-05, 'epoch': 3.6}
+ 51%|█████▏    | 360/700 [46:26<42:16,  7.46s/it][2024-06-18 22:53:20,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1541.10 | bwd_microstep: 1633.50 | bwd_inner_microstep: 1628.73 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.07
+[2024-06-18 22:53:24,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:53:24,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.38 | bwd_microstep: 1891.25 | bwd_inner_microstep: 1885.93 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.88
+[2024-06-18 22:53:24,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3508.45 | bwd: 3524.77 | bwd_inner: 3514.71 | bwd_allreduce: 9.93 | step: 61.97
+ 52%|█████▏    | 361/700 [46:34<41:35,  7.36s/it]                                                 {'loss': 0.4895, 'learning_rate': 4.988433027056729e-05, 'epoch': 3.61}
+ 52%|█████▏    | 361/700 [46:34<41:35,  7.36s/it][2024-06-18 22:53:28,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1994.69 | bwd_microstep: 1962.95 | bwd_inner_microstep: 1958.07 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.09
+[2024-06-18 22:53:32,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:53:32,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.54 | bwd_microstep: 1904.91 | bwd_inner_microstep: 1899.44 | bwd_allreduce_microstep: 5.31 | step_microstep: 62.14
+[2024-06-18 22:53:32,498] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3968.20 | bwd: 3867.89 | bwd_inner: 3857.60 | bwd_allreduce: 10.11 | step: 62.23
+ 52%|█████▏    | 362/700 [46:41<42:26,  7.54s/it]                                                 {'loss': 0.6984, 'learning_rate': 4.9652993287864365e-05, 'epoch': 3.62}
+ 52%|█████▏    | 362/700 [46:41<42:26,  7.54s/it][2024-06-18 22:53:36,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.83 | bwd_microstep: 1920.31 | bwd_inner_microstep: 1915.48 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 22:53:40,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:53:40,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1660.55 | bwd_microstep: 1868.44 | bwd_inner_microstep: 1863.08 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.92
+[2024-06-18 22:53:40,017] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3629.35 | bwd: 3788.78 | bwd_inner: 3778.65 | bwd_allreduce: 9.97 | step: 62.00
+ 52%|█████▏    | 363/700 [46:49<42:17,  7.53s/it]                                                 {'loss': 0.9807, 'learning_rate': 4.942166373359593e-05, 'epoch': 3.63}
+ 52%|█████▏    | 363/700 [46:49<42:17,  7.53s/it][2024-06-18 22:53:43,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1959.21 | bwd_microstep: 1883.08 | bwd_inner_microstep: 1878.26 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 22:53:47,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:53:47,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1893.06 | bwd_microstep: 1743.01 | bwd_inner_microstep: 1737.67 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.46
+[2024-06-18 22:53:47,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3852.24 | bwd: 3626.10 | bwd_inner: 3615.96 | bwd_allreduce: 10.01 | step: 61.54
+ 52%|█████▏    | 364/700 [46:57<42:15,  7.55s/it]                                                 {'loss': 0.1413, 'learning_rate': 4.919034655987493e-05, 'epoch': 3.64}
+ 52%|█████▏    | 364/700 [46:57<42:15,  7.55s/it][2024-06-18 22:53:51,524] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.64 | bwd_microstep: 1929.53 | bwd_inner_microstep: 1924.66 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:53:55,329] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:53:55,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1820.89 | bwd_microstep: 1902.95 | bwd_inner_microstep: 1897.41 | bwd_allreduce_microstep: 5.45 | step_microstep: 63.63
+[2024-06-18 22:53:55,330] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3795.50 | bwd: 3832.50 | bwd_inner: 3822.16 | bwd_allreduce: 10.16 | step: 63.71
+ 52%|█████▏    | 365/700 [47:04<42:26,  7.60s/it]                                                 {'loss': 0.7832, 'learning_rate': 4.895904671854933e-05, 'epoch': 3.65}
+ 52%|█████▏    | 365/700 [47:04<42:26,  7.60s/it][2024-06-18 22:53:59,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2000.94 | bwd_microstep: 1964.21 | bwd_inner_microstep: 1959.39 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 22:54:03,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.80
+[2024-06-18 22:54:03,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1589.05 | bwd_microstep: 2018.75 | bwd_inner_microstep: 2013.41 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.66
+[2024-06-18 22:54:03,005] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3589.96 | bwd: 3982.98 | bwd_inner: 3972.87 | bwd_allreduce: 9.98 | step: 61.74
+ 52%|█████▏    | 366/700 [47:12<42:26,  7.62s/it]                                                 {'loss': 0.957, 'learning_rate': 4.872776916109601e-05, 'epoch': 3.66}
+ 52%|█████▏    | 366/700 [47:12<42:26,  7.62s/it][2024-06-18 22:54:05,554] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1233.67 | bwd_microstep: 1295.04 | bwd_inner_microstep: 1290.23 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.08
+[2024-06-18 22:54:09,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:54:09,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1884.85 | bwd_microstep: 1728.58 | bwd_inner_microstep: 1723.26 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.77
+[2024-06-18 22:54:09,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3118.49 | bwd: 3023.64 | bwd_inner: 3013.53 | bwd_allreduce: 9.97 | step: 61.86
+ 52%|█████▏    | 367/700 [47:18<40:00,  7.21s/it]                                                 {'loss': 0.0125, 'learning_rate': 4.849651883851481e-05, 'epoch': 3.67}
+ 52%|█████▏    | 367/700 [47:18<40:00,  7.21s/it][2024-06-18 22:54:12,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1916.38 | bwd_microstep: 1812.70 | bwd_inner_microstep: 1807.86 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.08
+[2024-06-18 22:54:17,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 22:54:17,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2001.87 | bwd_microstep: 1963.77 | bwd_inner_microstep: 1958.39 | bwd_allreduce_microstep: 5.29 | step_microstep: 63.83
+[2024-06-18 22:54:17,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3918.22 | bwd: 3776.49 | bwd_inner: 3766.33 | bwd_allreduce: 9.99 | step: 63.91
+ 53%|█████▎    | 368/700 [47:26<40:52,  7.39s/it]                                                 {'loss': 0.5204, 'learning_rate': 4.826530070122262e-05, 'epoch': 3.68}
+ 53%|█████▎    | 368/700 [47:26<40:52,  7.39s/it][2024-06-18 22:54:21,081] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2011.55 | bwd_microstep: 2000.03 | bwd_inner_microstep: 1995.07 | bwd_allreduce_microstep: 4.88 | step_microstep: 0.14
+[2024-06-18 22:54:25,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:54:25,196] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2018.37 | bwd_microstep: 2016.17 | bwd_inner_microstep: 2010.77 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.94
+[2024-06-18 22:54:25,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 4029.89 | bwd: 4016.23 | bwd_inner: 4005.91 | bwd_allreduce: 10.13 | step: 62.09
+ 53%|█████▎    | 369/700 [47:34<42:00,  7.62s/it]                                                 {'loss': 0.9596, 'learning_rate': 4.803411969894724e-05, 'epoch': 3.69}
+ 53%|█████▎    | 369/700 [47:34<42:00,  7.62s/it][2024-06-18 22:54:26,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 504.36 | bwd_microstep: 504.86 | bwd_inner_microstep: 499.99 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.07
+[2024-06-18 22:54:30,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:54:30,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1989.66 | bwd_microstep: 1958.83 | bwd_inner_microstep: 1953.47 | bwd_allreduce_microstep: 5.21 | step_microstep: 61.97
+[2024-06-18 22:54:30,258] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2493.99 | bwd: 2463.71 | bwd_inner: 2453.56 | bwd_allreduce: 10.00 | step: 62.05
+ 53%|█████▎    | 370/700 [47:39<37:40,  6.85s/it]                                                 {'loss': 0.4714, 'learning_rate': 4.780298078062157e-05, 'epoch': 3.7}
+ 53%|█████▎    | 370/700 [47:39<37:40,  6.85s/it][2024-06-18 22:54:33,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1569.08 | bwd_microstep: 1668.68 | bwd_inner_microstep: 1663.83 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.09
+[2024-06-18 22:54:37,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:54:37,287] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1814.57 | bwd_microstep: 1875.33 | bwd_inner_microstep: 1869.89 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.52
+[2024-06-18 22:54:37,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3383.61 | bwd: 3544.03 | bwd_inner: 3533.82 | bwd_allreduce: 10.05 | step: 61.61
+ 53%|█████▎    | 371/700 [47:46<37:51,  6.90s/it]                                                 {'loss': 0.5038, 'learning_rate': 4.7571888894277604e-05, 'epoch': 3.71}
+ 53%|█████▎    | 371/700 [47:46<37:51,  6.90s/it][2024-06-18 22:54:41,331] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2015.60 | bwd_microstep: 2006.40 | bwd_inner_microstep: 2001.58 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:54:45,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:54:45,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2006.39 | bwd_microstep: 1974.25 | bwd_inner_microstep: 1968.74 | bwd_allreduce_microstep: 5.42 | step_microstep: 62.30
+[2024-06-18 22:54:45,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 4021.96 | bwd: 3980.67 | bwd_inner: 3970.39 | bwd_allreduce: 10.14 | step: 62.38
+ 53%|█████▎    | 372/700 [47:54<39:42,  7.26s/it]                                                 {'loss': 0.8625, 'learning_rate': 4.7340848986940487e-05, 'epoch': 3.72}
+ 53%|█████▎    | 372/700 [47:54<39:42,  7.26s/it][2024-06-18 22:54:49,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2009.35 | bwd_microstep: 1986.36 | bwd_inner_microstep: 1981.51 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 22:54:53,358] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:54:53,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.59 | bwd_microstep: 1897.12 | bwd_inner_microstep: 1891.65 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.55
+[2024-06-18 22:54:53,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3979.91 | bwd: 3883.50 | bwd_inner: 3873.24 | bwd_allreduce: 10.06 | step: 61.63
+ 53%|█████▎    | 373/700 [48:02<40:44,  7.47s/it]                                                 {'loss': 0.865, 'learning_rate': 4.710986600452269e-05, 'epoch': 3.73}
+ 53%|█████▎    | 373/700 [48:02<40:44,  7.47s/it][2024-06-18 22:54:55,962] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1254.00 | bwd_microstep: 1328.33 | bwd_inner_microstep: 1323.56 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 22:55:00,036] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 22:55:00,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2007.72 | bwd_microstep: 1984.92 | bwd_inner_microstep: 1979.29 | bwd_allreduce_microstep: 5.53 | step_microstep: 63.76
+[2024-06-18 22:55:00,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3261.70 | bwd: 3313.27 | bwd_inner: 3302.91 | bwd_allreduce: 10.22 | step: 63.84
+ 53%|█████▎    | 374/700 [48:09<39:18,  7.24s/it]                                                 {'loss': 0.3526, 'learning_rate': 4.687894489171804e-05, 'epoch': 3.74}
+ 53%|█████▎    | 374/700 [48:09<39:18,  7.24s/it][2024-06-18 22:55:03,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1884.17 | bwd_microstep: 1725.80 | bwd_inner_microstep: 1720.95 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.07
+[2024-06-18 22:55:06,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:55:06,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1550.41 | bwd_microstep: 1646.62 | bwd_inner_microstep: 1641.12 | bwd_allreduce_microstep: 5.34 | step_microstep: 62.22
+[2024-06-18 22:55:06,946] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3434.55 | bwd: 3372.44 | bwd_inner: 3362.14 | bwd_allreduce: 10.10 | step: 62.30
+ 54%|█████▎    | 375/700 [48:16<38:39,  7.14s/it]                                                 {'loss': 0.2742, 'learning_rate': 4.6648090591895935e-05, 'epoch': 3.75}
+ 54%|█████▎    | 375/700 [48:16<38:39,  7.14s/it][2024-06-18 22:55:10,881] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1978.32 | bwd_microstep: 1935.29 | bwd_inner_microstep: 1930.43 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.08
+[2024-06-18 22:55:14,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:55:14,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.29 | bwd_microstep: 1928.20 | bwd_inner_microstep: 1922.78 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.70
+[2024-06-18 22:55:14,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3952.58 | bwd: 3863.51 | bwd_inner: 3853.29 | bwd_allreduce: 10.05 | step: 61.79
+ 54%|█████▎    | 376/700 [48:24<39:48,  7.37s/it]                                                 {'loss': 0.5781, 'learning_rate': 4.641730804699547e-05, 'epoch': 3.76}
+ 54%|█████▎    | 376/700 [48:24<39:48,  7.37s/it][2024-06-18 22:55:18,781] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.61 | bwd_microstep: 1919.73 | bwd_inner_microstep: 1914.79 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.07
+[2024-06-18 22:55:22,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 22:55:22,213] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1651.45 | bwd_microstep: 1700.59 | bwd_inner_microstep: 1695.18 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.47
+[2024-06-18 22:55:22,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3626.02 | bwd: 3620.34 | bwd_inner: 3610.08 | bwd_allreduce: 10.04 | step: 61.55
+ 54%|█████▍    | 377/700 [48:31<39:38,  7.37s/it]                                                 {'loss': 0.7607, 'learning_rate': 4.6186602197419685e-05, 'epoch': 3.77}
+ 54%|█████▍    | 377/700 [48:31<39:38,  7.37s/it][2024-06-18 22:55:23,427] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 552.26 | bwd_microstep: 640.84 | bwd_inner_microstep: 635.72 | bwd_allreduce_microstep: 4.97 | step_microstep: 0.14
+[2024-06-18 22:55:27,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.94
+[2024-06-18 22:55:27,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.70 | bwd_microstep: 1928.74 | bwd_inner_microstep: 1923.15 | bwd_allreduce_microstep: 5.44 | step_microstep: 63.87
+[2024-06-18 22:55:27,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2523.93 | bwd: 2569.60 | bwd_inner: 2558.98 | bwd_allreduce: 10.37 | step: 64.01
+ 54%|█████▍    | 378/700 [48:36<36:02,  6.71s/it]                                                 {'loss': 0.8003, 'learning_rate': 4.59559779819298e-05, 'epoch': 3.78}
+ 54%|█████▍    | 378/700 [48:36<36:02,  6.71s/it][2024-06-18 22:55:30,884] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1733.82 | bwd_microstep: 1716.76 | bwd_inner_microstep: 1711.89 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.07
+[2024-06-18 22:55:34,852] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:55:34,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.89 | bwd_microstep: 1916.73 | bwd_inner_microstep: 1911.31 | bwd_allreduce_microstep: 5.34 | step_microstep: 61.87
+[2024-06-18 22:55:34,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3705.68 | bwd: 3633.52 | bwd_inner: 3623.27 | bwd_allreduce: 10.09 | step: 61.95
+ 54%|█████▍    | 379/700 [48:44<37:05,  6.93s/it]                                                 {'loss': 0.4204, 'learning_rate': 4.572544033753945e-05, 'epoch': 3.79}
+ 54%|█████▍    | 379/700 [48:44<37:05,  6.93s/it][2024-06-18 22:55:38,735] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1965.94 | bwd_microstep: 1894.60 | bwd_inner_microstep: 1889.63 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.08
+[2024-06-18 22:55:42,537] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:55:42,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1913.41 | bwd_microstep: 1808.26 | bwd_inner_microstep: 1802.90 | bwd_allreduce_microstep: 5.26 | step_microstep: 62.03
+[2024-06-18 22:55:42,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3879.32 | bwd: 3702.88 | bwd_inner: 3692.63 | bwd_allreduce: 10.06 | step: 62.12
+ 54%|█████▍    | 380/700 [48:52<38:10,  7.16s/it]                                                 {'loss': 0.3505, 'learning_rate': 4.5494994199409067e-05, 'epoch': 3.8}
+ 54%|█████▍    | 380/700 [48:52<38:10,  7.16s/it][2024-06-18 22:55:46,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1893.24 | bwd_microstep: 1742.10 | bwd_inner_microstep: 1737.14 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.08
+[2024-06-18 22:55:50,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:55:50,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1978.92 | bwd_microstep: 1936.39 | bwd_inner_microstep: 1930.98 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.45
+[2024-06-18 22:55:50,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3872.12 | bwd: 3678.52 | bwd_inner: 3668.22 | bwd_allreduce: 10.08 | step: 61.53
+ 54%|█████▍    | 381/700 [48:59<38:50,  7.31s/it]                                                 {'loss': 0.3341, 'learning_rate': 4.5264644500740156e-05, 'epoch': 3.81}
+ 54%|█████▍    | 381/700 [48:59<38:50,  7.31s/it][2024-06-18 22:55:53,936] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1914.76 | bwd_microstep: 1809.58 | bwd_inner_microstep: 1804.78 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 22:55:57,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:55:57,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.90 | bwd_microstep: 1925.85 | bwd_inner_microstep: 1920.50 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.76
+[2024-06-18 22:55:57,918] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3889.63 | bwd: 3735.46 | bwd_inner: 3725.32 | bwd_allreduce: 10.00 | step: 61.84
+ 55%|█████▍    | 382/700 [49:07<39:23,  7.43s/it]                                                 {'loss': 0.2664, 'learning_rate': 4.503439617266973e-05, 'epoch': 3.82}
+ 55%|█████▍    | 382/700 [49:07<39:23,  7.43s/it][2024-06-18 22:56:01,018] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1422.59 | bwd_microstep: 1656.10 | bwd_inner_microstep: 1650.91 | bwd_allreduce_microstep: 5.04 | step_microstep: 0.09
+[2024-06-18 22:56:04,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:56:04,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1831.48 | bwd_microstep: 1649.07 | bwd_inner_microstep: 1643.64 | bwd_allreduce_microstep: 5.35 | step_microstep: 62.24
+[2024-06-18 22:56:04,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3254.05 | bwd: 3305.19 | bwd_inner: 3294.62 | bwd_allreduce: 10.38 | step: 62.34
+ 55%|█████▍    | 383/700 [49:14<38:02,  7.20s/it]                                                 {'loss': 0.0007, 'learning_rate': 4.4804254144164785e-05, 'epoch': 3.83}
+ 55%|█████▍    | 383/700 [49:14<38:02,  7.20s/it][2024-06-18 22:56:08,233] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1890.53 | bwd_microstep: 1741.59 | bwd_inner_microstep: 1736.72 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.08
+[2024-06-18 22:56:11,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:56:11,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1552.81 | bwd_microstep: 1650.53 | bwd_inner_microstep: 1645.21 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.64
+[2024-06-18 22:56:11,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3443.31 | bwd: 3392.15 | bwd_inner: 3382.05 | bwd_allreduce: 9.93 | step: 61.73
+ 55%|█████▍    | 384/700 [49:21<37:30,  7.12s/it]                                                 {'loss': 0.1969, 'learning_rate': 4.4574223341916695e-05, 'epoch': 3.84}
+ 55%|█████▍    | 384/700 [49:21<37:30,  7.12s/it][2024-06-18 22:56:15,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.33 | bwd_microstep: 1901.37 | bwd_inner_microstep: 1896.38 | bwd_allreduce_microstep: 4.91 | step_microstep: 0.08
+[2024-06-18 22:56:19,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 22:56:19,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1938.57 | bwd_microstep: 1850.94 | bwd_inner_microstep: 1845.50 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.94
+[2024-06-18 22:56:19,275] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3902.87 | bwd: 3752.33 | bwd_inner: 3741.94 | bwd_allreduce: 10.21 | step: 62.04
+ 55%|█████▌    | 385/700 [49:28<38:23,  7.31s/it]                                                 {'loss': 0.795, 'learning_rate': 4.434430869023579e-05, 'epoch': 3.85}
+ 55%|█████▌    | 385/700 [49:28<38:23,  7.31s/it][2024-06-18 22:56:23,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.29 | bwd_microstep: 1923.25 | bwd_inner_microstep: 1918.45 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:56:27,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:56:27,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1980.81 | bwd_microstep: 1932.40 | bwd_inner_microstep: 1926.93 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.51
+[2024-06-18 22:56:27,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3954.07 | bwd: 3855.68 | bwd_inner: 3845.47 | bwd_allreduce: 10.01 | step: 61.59
+ 55%|█████▌    | 386/700 [49:36<39:12,  7.49s/it]                                                 {'loss': 0.716, 'learning_rate': 4.4114515110945995e-05, 'epoch': 3.86}
+ 55%|█████▌    | 386/700 [49:36<39:12,  7.49s/it][2024-06-18 22:56:30,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1543.76 | bwd_microstep: 1631.83 | bwd_inner_microstep: 1626.88 | bwd_allreduce_microstep: 4.88 | step_microstep: 0.14
+[2024-06-18 22:56:34,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:56:34,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2005.82 | bwd_microstep: 1966.49 | bwd_inner_microstep: 1961.15 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.99
+[2024-06-18 22:56:34,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3549.55 | bwd: 3598.34 | bwd_inner: 3588.07 | bwd_allreduce: 10.14 | step: 62.13
+ 55%|█████▌    | 387/700 [49:43<38:42,  7.42s/it]                                                 {'loss': 0.4021, 'learning_rate': 4.3884847523279376e-05, 'epoch': 3.87}
+ 55%|█████▌    | 387/700 [49:43<38:42,  7.42s/it][2024-06-18 22:56:38,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1960.18 | bwd_microstep: 1887.31 | bwd_inner_microstep: 1882.37 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.08
+[2024-06-18 22:56:42,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:56:42,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1977.70 | bwd_microstep: 1933.14 | bwd_inner_microstep: 1927.71 | bwd_allreduce_microstep: 5.35 | step_microstep: 62.04
+[2024-06-18 22:56:42,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3937.84 | bwd: 3820.47 | bwd_inner: 3810.16 | bwd_allreduce: 10.12 | step: 62.13
+ 55%|█████▌    | 388/700 [49:51<39:16,  7.55s/it]                                                 {'loss': 0.7047, 'learning_rate': 4.365531084377087e-05, 'epoch': 3.88}
+ 55%|█████▌    | 388/700 [49:51<39:16,  7.55s/it][2024-06-18 22:56:45,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1890.81 | bwd_microstep: 1744.56 | bwd_inner_microstep: 1739.60 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.07
+[2024-06-18 22:56:49,908] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:56:49,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.55 | bwd_microstep: 1902.22 | bwd_inner_microstep: 1896.74 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.82
+[2024-06-18 22:56:49,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3864.33 | bwd: 3646.81 | bwd_inner: 3636.45 | bwd_allreduce: 10.12 | step: 61.90
+ 56%|█████▌    | 389/700 [49:59<39:14,  7.57s/it]                                                 {'loss': 0.4303, 'learning_rate': 4.342590998615307e-05, 'epoch': 3.89}
+ 56%|█████▌    | 389/700 [49:59<39:14,  7.57s/it][2024-06-18 22:56:52,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1393.20 | bwd_microstep: 1616.78 | bwd_inner_microstep: 1611.94 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 22:56:56,582] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:56:56,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1866.04 | bwd_microstep: 1696.69 | bwd_inner_microstep: 1691.29 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.84
+[2024-06-18 22:56:56,583] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3259.21 | bwd: 3313.48 | bwd_inner: 3303.33 | bwd_allreduce: 10.00 | step: 61.92
+ 56%|█████▌    | 390/700 [50:06<37:43,  7.30s/it]                                                 {'loss': 0.3186, 'learning_rate': 4.319664986125099e-05, 'epoch': 3.9}
+ 56%|█████▌    | 390/700 [50:06<37:43,  7.30s/it][2024-06-18 22:57:00,242] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1891.33 | bwd_microstep: 1745.97 | bwd_inner_microstep: 1741.22 | bwd_allreduce_microstep: 4.67 | step_microstep: 0.08
+[2024-06-18 22:57:03,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:57:03,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1894.01 | bwd_microstep: 1741.23 | bwd_inner_microstep: 1735.89 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.48
+[2024-06-18 22:57:03,957] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3785.32 | bwd: 3487.23 | bwd_inner: 3477.15 | bwd_allreduce: 9.93 | step: 61.56
+ 56%|█████▌    | 391/700 [50:13<37:42,  7.32s/it]                                                 {'loss': 0.0043, 'learning_rate': 4.2967535376876936e-05, 'epoch': 3.91}
+ 56%|█████▌    | 391/700 [50:13<37:42,  7.32s/it][2024-06-18 22:57:07,317] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1504.71 | bwd_microstep: 1834.32 | bwd_inner_microstep: 1829.25 | bwd_allreduce_microstep: 4.97 | step_microstep: 0.10
+[2024-06-18 22:57:11,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:57:11,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.89 | bwd_microstep: 1919.46 | bwd_inner_microstep: 1914.02 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.98
+[2024-06-18 22:57:11,295] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3480.57 | bwd: 3753.80 | bwd_inner: 3743.36 | bwd_allreduce: 10.28 | step: 62.08
+ 56%|█████▌    | 392/700 [50:20<37:36,  7.33s/it]                                                 {'loss': 0.8166, 'learning_rate': 4.27385714377255e-05, 'epoch': 3.92}
+ 56%|█████▌    | 392/700 [50:20<37:36,  7.33s/it][2024-06-18 22:57:14,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1425.55 | bwd_microstep: 1665.35 | bwd_inner_microstep: 1660.47 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 22:57:18,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:57:18,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1913.49 | bwd_microstep: 1815.97 | bwd_inner_microstep: 1810.63 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.62
+[2024-06-18 22:57:18,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3339.01 | bwd: 3481.34 | bwd_inner: 3471.19 | bwd_allreduce: 10.00 | step: 61.70
+ 56%|█████▌    | 393/700 [50:27<36:52,  7.21s/it]                                                 {'loss': 0.2863, 'learning_rate': 4.2509762945268474e-05, 'epoch': 3.93}
+ 56%|█████▌    | 393/700 [50:27<36:52,  7.21s/it][2024-06-18 22:57:22,193] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1992.96 | bwd_microstep: 1961.15 | bwd_inner_microstep: 1956.33 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.08
+[2024-06-18 22:57:26,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 22:57:26,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2005.38 | bwd_microstep: 1957.04 | bwd_inner_microstep: 1951.61 | bwd_allreduce_microstep: 5.34 | step_microstep: 62.16
+[2024-06-18 22:57:26,236] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3998.31 | bwd: 3918.21 | bwd_inner: 3907.99 | bwd_allreduce: 10.09 | step: 62.25
+ 56%|█████▋    | 394/700 [50:35<37:59,  7.45s/it]                                                 {'loss': 0.8721, 'learning_rate': 4.228111479765004e-05, 'epoch': 3.94}
+ 56%|█████▋    | 394/700 [50:35<37:59,  7.45s/it][2024-06-18 22:57:29,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1595.57 | bwd_microstep: 1743.13 | bwd_inner_microstep: 1738.34 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:57:33,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.98
+[2024-06-18 22:57:33,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1899.37 | bwd_microstep: 1745.18 | bwd_inner_microstep: 1739.60 | bwd_allreduce_microstep: 5.50 | step_microstep: 63.83
+[2024-06-18 22:57:33,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3494.91 | bwd: 3488.34 | bwd_inner: 3477.99 | bwd_allreduce: 10.21 | step: 63.91
+ 56%|█████▋    | 395/700 [50:42<37:19,  7.34s/it]                                                 {'loss': 0.0029, 'learning_rate': 4.205263188958179e-05, 'epoch': 3.95}
+ 56%|█████▋    | 395/700 [50:42<37:19,  7.34s/it][2024-06-18 22:57:37,223] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.87 | bwd_microstep: 1907.78 | bwd_inner_microstep: 1902.97 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 22:57:41,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:57:41,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1918.07 | bwd_microstep: 1810.11 | bwd_inner_microstep: 1804.75 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.77
+[2024-06-18 22:57:41,031] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3886.91 | bwd: 3717.91 | bwd_inner: 3707.77 | bwd_allreduce: 10.00 | step: 61.85
+ 57%|█████▋    | 396/700 [50:50<37:45,  7.45s/it]                                                 {'loss': 0.361, 'learning_rate': 4.182431911223805e-05, 'epoch': 3.96}
+ 57%|█████▋    | 396/700 [50:50<37:45,  7.45s/it][2024-06-18 22:57:44,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.11 | bwd_microstep: 1921.80 | bwd_inner_microstep: 1916.89 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.09
+[2024-06-18 22:57:48,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:57:48,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1496.88 | bwd_microstep: 1812.84 | bwd_inner_microstep: 1807.49 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.98
+[2024-06-18 22:57:48,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3469.96 | bwd: 3734.66 | bwd_inner: 3724.42 | bwd_allreduce: 10.10 | step: 62.07
+ 57%|█████▋    | 397/700 [50:57<37:24,  7.41s/it]                                                 {'loss': 0.9149, 'learning_rate': 4.159618135315109e-05, 'epoch': 3.97}
+ 57%|█████▋    | 397/700 [50:57<37:24,  7.41s/it][2024-06-18 22:57:52,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1916.60 | bwd_microstep: 1808.10 | bwd_inner_microstep: 1803.29 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 22:57:56,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:57:56,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1965.59 | bwd_microstep: 1891.60 | bwd_inner_microstep: 1886.16 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.63
+[2024-06-18 22:57:56,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3882.16 | bwd: 3699.72 | bwd_inner: 3689.53 | bwd_allreduce: 10.01 | step: 61.71
+ 57%|█████▋    | 398/700 [51:05<37:42,  7.49s/it]                                                 {'loss': 0.2188, 'learning_rate': 4.136822349610654e-05, 'epoch': 3.98}
+ 57%|█████▋    | 398/700 [51:05<37:42,  7.49s/it][2024-06-18 22:57:59,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.00 | bwd_microstep: 1917.16 | bwd_inner_microstep: 1912.17 | bwd_allreduce_microstep: 4.91 | step_microstep: 0.08
+[2024-06-18 22:58:03,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 22:58:03,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.80 | bwd_microstep: 1899.07 | bwd_inner_microstep: 1893.62 | bwd_allreduce_microstep: 5.31 | step_microstep: 62.01
+[2024-06-18 22:58:03,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3937.78 | bwd: 3816.26 | bwd_inner: 3805.86 | bwd_allreduce: 10.23 | step: 62.09
+ 57%|█████▋    | 399/700 [51:13<38:07,  7.60s/it]                                                 {'loss': 0.7136, 'learning_rate': 4.114045042103887e-05, 'epoch': 3.99}
+ 57%|█████▋    | 399/700 [51:13<38:07,  7.60s/it][2024-06-18 22:58:07,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1496.42 | bwd_microstep: 1805.22 | bwd_inner_microstep: 1800.41 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+[2024-06-18 22:58:11,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:58:11,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.03 | bwd_microstep: 1917.02 | bwd_inner_microstep: 1911.40 | bwd_allreduce_microstep: 5.52 | step_microstep: 64.40
+[2024-06-18 22:58:11,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3460.38 | bwd: 3722.26 | bwd_inner: 3711.85 | bwd_allreduce: 10.27 | step: 64.48
+ 57%|█████▋    | 400/700 [51:21<38:44,  7.75s/it]                                                 {'loss': 0.7188, 'learning_rate': 4.0912867003926834e-05, 'epoch': 4.0}
+ 57%|█████▋    | 400/700 [51:21<38:44,  7.75s/it][2024-06-18 22:58:14,672] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:58:20,524] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:58:26,277] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:58:32,104] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 22:58:39,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1885.37 | bwd_microstep: 1729.96 | bwd_inner_microstep: 1725.19 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:58:42,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:58:42,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1514.85 | bwd_microstep: 1869.24 | bwd_inner_microstep: 1863.84 | bwd_allreduce_microstep: 5.32 | step_microstep: 62.10
+[2024-06-18 22:58:42,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3400.15 | bwd: 3599.22 | bwd_inner: 3589.05 | bwd_allreduce: 10.04 | step: 62.18
+ 57%|█████▋    | 401/700 [51:52<1:12:50, 14.62s/it]                                                   {'loss': 0.2834, 'learning_rate': 4.068547811668918e-05, 'epoch': 4.01}
+ 57%|█████▋    | 401/700 [51:52<1:12:50, 14.62s/it][2024-06-18 22:58:46,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1984.83 | bwd_microstep: 1956.56 | bwd_inner_microstep: 1951.82 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.07
+[2024-06-18 22:58:50,120] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 22:58:50,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1821.88 | bwd_microstep: 1639.93 | bwd_inner_microstep: 1634.43 | bwd_allreduce_microstep: 5.42 | step_microstep: 62.71
+[2024-06-18 22:58:50,121] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3806.68 | bwd: 3596.51 | bwd_inner: 3586.27 | bwd_allreduce: 10.13 | step: 62.79
+ 57%|█████▋    | 402/700 [51:59<1:02:00, 12.48s/it]                                                   {'loss': 0.2911, 'learning_rate': 4.045828862708032e-05, 'epoch': 4.02}
+ 57%|█████▋    | 402/700 [51:59<1:02:00, 12.48s/it][2024-06-18 22:58:53,744] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1875.94 | bwd_microstep: 1724.70 | bwd_inner_microstep: 1719.83 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 22:58:57,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:58:57,671] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1957.17 | bwd_microstep: 1890.31 | bwd_inner_microstep: 1884.89 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.90
+[2024-06-18 22:58:57,672] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3833.08 | bwd: 3615.03 | bwd_inner: 3604.84 | bwd_allreduce: 10.02 | step: 61.98
+ 58%|█████▊    | 403/700 [52:07<54:28, 11.00s/it]                                                   {'loss': 0.4027, 'learning_rate': 4.023130339858612e-05, 'epoch': 4.03}
+ 58%|█████▊    | 403/700 [52:07<54:28, 11.00s/it][2024-06-18 22:59:01,597] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.29 | bwd_microstep: 1933.24 | bwd_inner_microstep: 1928.35 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.09
+[2024-06-18 22:59:05,062] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:59:05,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1518.92 | bwd_microstep: 1866.73 | bwd_inner_microstep: 1861.22 | bwd_allreduce_microstep: 5.36 | step_microstep: 62.04
+[2024-06-18 22:59:05,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3489.17 | bwd: 3800.00 | bwd_inner: 3789.65 | bwd_allreduce: 10.15 | step: 62.14
+ 58%|█████▊    | 404/700 [52:14<48:56,  9.92s/it]                                                 {'loss': 0.6534, 'learning_rate': 4.0004527290319784e-05, 'epoch': 4.04}
+ 58%|█████▊    | 404/700 [52:14<48:56,  9.92s/it][2024-06-18 22:59:08,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1884.98 | bwd_microstep: 1738.67 | bwd_inner_microstep: 1733.80 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 22:59:12,300] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:59:12,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1653.64 | bwd_microstep: 1858.86 | bwd_inner_microstep: 1853.53 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.66
+[2024-06-18 22:59:12,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3538.58 | bwd: 3597.56 | bwd_inner: 3587.40 | bwd_allreduce: 10.02 | step: 61.75
+ 58%|█████▊    | 405/700 [52:21<44:49,  9.12s/it]                                                 {'loss': 0.3707, 'learning_rate': 3.977796515691785e-05, 'epoch': 4.05}
+ 58%|█████▊    | 405/700 [52:21<44:49,  9.12s/it][2024-06-18 22:59:16,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.61 | bwd_microstep: 1927.44 | bwd_inner_microstep: 1922.66 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 22:59:20,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 22:59:20,179] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.85 | bwd_microstep: 1909.99 | bwd_inner_microstep: 1904.68 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.58
+[2024-06-18 22:59:20,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3939.43 | bwd: 3837.45 | bwd_inner: 3827.37 | bwd_allreduce: 9.94 | step: 61.66
+ 58%|█████▊    | 406/700 [52:29<42:50,  8.74s/it]                                                 {'loss': 0.818, 'learning_rate': 3.955162184843625e-05, 'epoch': 4.06}
+ 58%|█████▊    | 406/700 [52:29<42:50,  8.74s/it][2024-06-18 22:59:24,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.47 | bwd_microstep: 1919.62 | bwd_inner_microstep: 1914.65 | bwd_allreduce_microstep: 4.88 | step_microstep: 0.09
+[2024-06-18 22:59:28,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 22:59:28,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.76 | bwd_microstep: 1895.89 | bwd_inner_microstep: 1890.49 | bwd_allreduce_microstep: 5.24 | step_microstep: 62.57
+[2024-06-18 22:59:28,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3931.20 | bwd: 3815.53 | bwd_inner: 3805.24 | bwd_allreduce: 10.12 | step: 62.67
+ 58%|█████▊    | 407/700 [52:37<41:23,  8.48s/it]                                                 {'loss': 0.5838, 'learning_rate': 3.9325502210246514e-05, 'epoch': 4.07}
+ 58%|█████▊    | 407/700 [52:37<41:23,  8.48s/it][2024-06-18 22:59:32,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1990.33 | bwd_microstep: 1958.55 | bwd_inner_microstep: 1953.63 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.07
+[2024-06-18 22:59:35,811] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:59:35,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1914.73 | bwd_microstep: 1815.34 | bwd_inner_microstep: 1809.96 | bwd_allreduce_microstep: 5.25 | step_microstep: 62.11
+[2024-06-18 22:59:35,812] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3905.03 | bwd: 3773.91 | bwd_inner: 3763.71 | bwd_allreduce: 10.01 | step: 62.19
+ 58%|█████▊    | 408/700 [52:45<40:14,  8.27s/it]                                                 {'loss': 0.2727, 'learning_rate': 3.9099611082932e-05, 'epoch': 4.08}
+ 58%|█████▊    | 408/700 [52:45<40:14,  8.27s/it][2024-06-18 22:59:38,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1416.97 | bwd_microstep: 1648.11 | bwd_inner_microstep: 1643.20 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.08
+[2024-06-18 22:59:42,874] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 22:59:42,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.00 | bwd_microstep: 1924.72 | bwd_inner_microstep: 1919.17 | bwd_allreduce_microstep: 5.40 | step_microstep: 62.17
+[2024-06-18 22:59:42,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3387.93 | bwd: 3572.85 | bwd_inner: 3562.44 | bwd_allreduce: 10.22 | step: 62.26
+ 58%|█████▊    | 409/700 [52:52<38:20,  7.91s/it]                                                 {'loss': 0.9802, 'learning_rate': 3.887395330218429e-05, 'epoch': 4.09}
+ 58%|█████▊    | 409/700 [52:52<38:20,  7.91s/it][2024-06-18 22:59:46,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1649.08 | bwd_microstep: 1848.72 | bwd_inner_microstep: 1843.78 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.07
+[2024-06-18 22:59:50,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 22:59:50,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1982.79 | bwd_microstep: 1939.86 | bwd_inner_microstep: 1934.51 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.76
+[2024-06-18 22:59:50,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3631.84 | bwd: 3788.60 | bwd_inner: 3778.36 | bwd_allreduce: 10.06 | step: 61.84
+ 59%|█████▊    | 410/700 [52:59<37:39,  7.79s/it]                                                 {'loss': 0.6162, 'learning_rate': 3.8648533698699695e-05, 'epoch': 4.1}
+ 59%|█████▊    | 410/700 [52:59<37:39,  7.79s/it][2024-06-18 22:59:53,804] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1518.04 | bwd_microstep: 1867.78 | bwd_inner_microstep: 1862.67 | bwd_allreduce_microstep: 4.96 | step_microstep: 0.14
+[2024-06-18 22:59:57,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 22:59:57,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1916.83 | bwd_microstep: 1807.60 | bwd_inner_microstep: 1802.13 | bwd_allreduce_microstep: 5.33 | step_microstep: 61.81
+[2024-06-18 22:59:57,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3434.84 | bwd: 3675.40 | bwd_inner: 3664.90 | bwd_allreduce: 10.26 | step: 61.96
+ 59%|█████▊    | 411/700 [53:07<36:41,  7.62s/it]                                                 {'loss': 0.2528, 'learning_rate': 3.8423357098075815e-05, 'epoch': 4.11}
+ 59%|█████▊    | 411/700 [53:07<36:41,  7.62s/it][2024-06-18 23:00:01,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.80 | bwd_microstep: 1927.71 | bwd_inner_microstep: 1922.74 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.07
+[2024-06-18 23:00:05,504] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:00:05,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.95 | bwd_microstep: 1923.19 | bwd_inner_microstep: 1917.80 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.58
+[2024-06-18 23:00:05,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3941.72 | bwd: 3850.92 | bwd_inner: 3840.60 | bwd_allreduce: 10.13 | step: 61.66
+ 59%|█████▉    | 412/700 [53:14<36:57,  7.70s/it]                                                 {'loss': 0.6246, 'learning_rate': 3.8198428320708216e-05, 'epoch': 4.12}
+ 59%|█████▉    | 412/700 [53:14<36:57,  7.70s/it][2024-06-18 23:00:08,523] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1386.55 | bwd_microstep: 1609.93 | bwd_inner_microstep: 1604.95 | bwd_allreduce_microstep: 4.84 | step_microstep: 0.08
+[2024-06-18 23:00:11,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:00:11,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1493.86 | bwd_microstep: 1810.37 | bwd_inner_microstep: 1805.04 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.21
+[2024-06-18 23:00:11,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2880.38 | bwd: 3420.33 | bwd_inner: 3410.06 | bwd_allreduce: 10.08 | step: 61.29
+ 59%|█████▉    | 413/700 [53:21<34:58,  7.31s/it]                                                 {'loss': 0.4011, 'learning_rate': 3.7973752181687335e-05, 'epoch': 4.13}
+ 59%|█████▉    | 413/700 [53:21<34:58,  7.31s/it][2024-06-18 23:00:15,790] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.41 | bwd_microstep: 1897.88 | bwd_inner_microstep: 1892.80 | bwd_allreduce_microstep: 4.93 | step_microstep: 0.08
+[2024-06-18 23:00:19,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:00:19,716] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.25 | bwd_microstep: 1882.05 | bwd_inner_microstep: 1876.60 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.57
+[2024-06-18 23:00:19,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3927.63 | bwd: 3779.95 | bwd_inner: 3769.51 | bwd_allreduce: 10.23 | step: 61.65
+ 59%|█████▉    | 414/700 [53:29<35:33,  7.46s/it]                                                 {'loss': 0.3105, 'learning_rate': 3.774933349069524e-05, 'epoch': 4.14}
+ 59%|█████▉    | 414/700 [53:29<35:33,  7.46s/it][2024-06-18 23:00:23,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1802.59 | bwd_microstep: 1856.79 | bwd_inner_microstep: 1851.84 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.07
+[2024-06-18 23:00:27,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:00:27,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.31 | bwd_microstep: 1891.23 | bwd_inner_microstep: 1885.79 | bwd_allreduce_microstep: 5.31 | step_microstep: 62.22
+[2024-06-18 23:00:27,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3766.87 | bwd: 3748.04 | bwd_inner: 3737.73 | bwd_allreduce: 10.10 | step: 62.30
+ 59%|█████▉    | 415/700 [53:36<35:39,  7.51s/it]                                                 {'loss': 0.5954, 'learning_rate': 3.7525177051902874e-05, 'epoch': 4.15}
+ 59%|█████▉    | 415/700 [53:36<35:39,  7.51s/it][2024-06-18 23:00:30,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1491.52 | bwd_microstep: 1804.12 | bwd_inner_microstep: 1799.04 | bwd_allreduce_microstep: 4.99 | step_microstep: 0.10
+[2024-06-18 23:00:34,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:00:34,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1978.25 | bwd_microstep: 1939.62 | bwd_inner_microstep: 1934.12 | bwd_allreduce_microstep: 5.35 | step_microstep: 62.10
+[2024-06-18 23:00:34,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3469.73 | bwd: 3743.76 | bwd_inner: 3733.23 | bwd_allreduce: 10.32 | step: 62.20
+ 59%|█████▉    | 416/700 [53:44<35:15,  7.45s/it]                                                 {'loss': 0.5291, 'learning_rate': 3.7301287663867005e-05, 'epoch': 4.16}
+ 59%|█████▉    | 416/700 [53:44<35:15,  7.45s/it][2024-06-18 23:00:38,555] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.74 | bwd_microstep: 1918.06 | bwd_inner_microstep: 1913.22 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.08
+[2024-06-18 23:00:42,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:00:42,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1916.13 | bwd_microstep: 1810.92 | bwd_inner_microstep: 1805.58 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.61
+[2024-06-18 23:00:42,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3880.84 | bwd: 3729.00 | bwd_inner: 3718.87 | bwd_allreduce: 9.96 | step: 61.70
+ 60%|█████▉    | 417/700 [53:51<35:30,  7.53s/it]                                                 {'loss': 0.24, 'learning_rate': 3.7077670119427645e-05, 'epoch': 4.17}
+ 60%|█████▉    | 417/700 [53:51<35:30,  7.53s/it][2024-06-18 23:00:45,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1880.34 | bwd_microstep: 1726.99 | bwd_inner_microstep: 1722.04 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.08
+[2024-06-18 23:00:49,160] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:00:49,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1422.62 | bwd_microstep: 1666.89 | bwd_inner_microstep: 1661.59 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.69
+[2024-06-18 23:00:49,161] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3302.94 | bwd: 3393.90 | bwd_inner: 3383.70 | bwd_allreduce: 10.03 | step: 61.78
+ 60%|█████▉    | 418/700 [53:58<34:21,  7.31s/it]                                                 {'loss': 0.3224, 'learning_rate': 3.68543292056054e-05, 'epoch': 4.18}
+ 60%|█████▉    | 418/700 [53:58<34:21,  7.31s/it][2024-06-18 23:00:53,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.63 | bwd_microstep: 1918.05 | bwd_inner_microstep: 1913.25 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 23:00:56,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:00:56,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1393.09 | bwd_microstep: 1622.33 | bwd_inner_microstep: 1616.61 | bwd_allreduce_microstep: 5.58 | step_microstep: 62.61
+[2024-06-18 23:00:56,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3360.68 | bwd: 3540.40 | bwd_inner: 3529.93 | bwd_allreduce: 10.32 | step: 62.69
+ 60%|█████▉    | 419/700 [54:05<33:48,  7.22s/it]                                                 {'loss': 0.4368, 'learning_rate': 3.663126970349897e-05, 'epoch': 4.19}
+ 60%|█████▉    | 419/700 [54:05<33:48,  7.22s/it][2024-06-18 23:00:59,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1443.26 | bwd_microstep: 1726.36 | bwd_inner_microstep: 1721.41 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.07
+[2024-06-18 23:01:03,349] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.80
+[2024-06-18 23:01:03,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1978.29 | bwd_microstep: 1934.17 | bwd_inner_microstep: 1928.65 | bwd_allreduce_microstep: 5.38 | step_microstep: 61.54
+[2024-06-18 23:01:03,350] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3421.52 | bwd: 3660.55 | bwd_inner: 3650.17 | bwd_allreduce: 10.14 | step: 61.62
+ 60%|██████    | 420/700 [54:12<33:38,  7.21s/it]                                                 {'loss': 0.5847, 'learning_rate': 3.640849638818286e-05, 'epoch': 4.2}
+ 60%|██████    | 420/700 [54:12<33:38,  7.21s/it][2024-06-18 23:01:06,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1420.12 | bwd_microstep: 1667.13 | bwd_inner_microstep: 1662.15 | bwd_allreduce_microstep: 4.84 | step_microstep: 0.07
+[2024-06-18 23:01:10,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:01:10,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1993.88 | bwd_microstep: 1952.64 | bwd_inner_microstep: 1947.11 | bwd_allreduce_microstep: 5.41 | step_microstep: 62.29
+[2024-06-18 23:01:10,485] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3413.97 | bwd: 3619.79 | bwd_inner: 3609.36 | bwd_allreduce: 10.23 | step: 62.37
+ 60%|██████    | 421/700 [54:19<33:24,  7.19s/it]                                                 {'loss': 0.55, 'learning_rate': 3.6186014028605096e-05, 'epoch': 4.21}
+ 60%|██████    | 421/700 [54:19<33:24,  7.19s/it][2024-06-18 23:01:14,468] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.34 | bwd_microstep: 1965.58 | bwd_inner_microstep: 1960.82 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 23:01:17,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.80
+[2024-06-18 23:01:17,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1498.52 | bwd_microstep: 1815.75 | bwd_inner_microstep: 1810.25 | bwd_allreduce_microstep: 5.35 | step_microstep: 61.59
+[2024-06-18 23:01:17,862] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3493.83 | bwd: 3781.35 | bwd_inner: 3771.14 | bwd_allreduce: 10.02 | step: 61.67
+ 60%|██████    | 422/700 [54:27<33:33,  7.24s/it]                                                 {'loss': 0.7909, 'learning_rate': 3.596382738748516e-05, 'epoch': 4.22}
+ 60%|██████    | 422/700 [54:27<33:33,  7.24s/it][2024-06-18 23:01:21,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.34 | bwd_microstep: 1912.44 | bwd_inner_microstep: 1907.59 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 23:01:25,729] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:01:25,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.70 | bwd_microstep: 1915.04 | bwd_inner_microstep: 1909.67 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.74
+[2024-06-18 23:01:25,730] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3939.01 | bwd: 3827.50 | bwd_inner: 3817.33 | bwd_allreduce: 10.03 | step: 61.82
+ 60%|██████    | 423/700 [54:35<34:18,  7.43s/it]                                                 {'loss': 0.7457, 'learning_rate': 3.574194122121207e-05, 'epoch': 4.23}
+ 60%|██████    | 423/700 [54:35<34:18,  7.43s/it][2024-06-18 23:01:29,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.99 | bwd_microstep: 1919.56 | bwd_inner_microstep: 1914.77 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.08
+[2024-06-18 23:01:33,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:01:33,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.50 | bwd_microstep: 1894.94 | bwd_inner_microstep: 1889.43 | bwd_allreduce_microstep: 5.36 | step_microstep: 61.79
+[2024-06-18 23:01:33,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3931.45 | bwd: 3814.52 | bwd_inner: 3804.26 | bwd_allreduce: 10.08 | step: 61.88
+ 61%|██████    | 424/700 [54:43<34:45,  7.56s/it]                                                 {'loss': 0.6966, 'learning_rate': 3.55203602797425e-05, 'epoch': 4.24}
+ 61%|██████    | 424/700 [54:43<34:45,  7.56s/it][2024-06-18 23:01:37,167] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1673.72 | bwd_microstep: 1894.62 | bwd_inner_microstep: 1889.55 | bwd_allreduce_microstep: 4.97 | step_microstep: 0.10
+[2024-06-18 23:01:40,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:01:40,669] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1723.94 | bwd_microstep: 1696.01 | bwd_inner_microstep: 1690.55 | bwd_allreduce_microstep: 5.32 | step_microstep: 62.18
+[2024-06-18 23:01:40,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3397.63 | bwd: 3590.65 | bwd_inner: 3580.18 | bwd_allreduce: 10.30 | step: 62.28
+ 61%|██████    | 425/700 [54:50<33:59,  7.42s/it]                                                 {'loss': 0.3457, 'learning_rate': 3.52990893064991e-05, 'epoch': 4.25}
+ 61%|██████    | 425/700 [54:50<33:59,  7.42s/it][2024-06-18 23:01:44,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.07 | bwd_microstep: 1932.41 | bwd_inner_microstep: 1927.57 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.08
+[2024-06-18 23:01:48,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 23:01:48,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2005.05 | bwd_microstep: 1975.24 | bwd_inner_microstep: 1969.82 | bwd_allreduce_microstep: 5.33 | step_microstep: 62.00
+[2024-06-18 23:01:48,660] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3979.09 | bwd: 3907.67 | bwd_inner: 3897.45 | bwd_allreduce: 10.08 | step: 62.08
+ 61%|██████    | 426/700 [54:58<34:39,  7.59s/it]                                                 {'loss': 0.5137, 'learning_rate': 3.507813303826903e-05, 'epoch': 4.26}
+ 61%|██████    | 426/700 [54:58<34:39,  7.59s/it][2024-06-18 23:01:51,859] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1448.33 | bwd_microstep: 1729.57 | bwd_inner_microstep: 1724.65 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.07
+[2024-06-18 23:01:55,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:01:55,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2014.86 | bwd_microstep: 2017.07 | bwd_inner_microstep: 2011.75 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.64
+[2024-06-18 23:01:55,971] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3463.16 | bwd: 3746.65 | bwd_inner: 3736.47 | bwd_allreduce: 9.99 | step: 61.72
+ 61%|██████    | 427/700 [55:05<34:09,  7.51s/it]                                                 {'loss': 0.8169, 'learning_rate': 3.4857496205102474e-05, 'epoch': 4.27}
+ 61%|██████    | 427/700 [55:05<34:09,  7.51s/it][2024-06-18 23:01:59,951] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.97 | bwd_microstep: 1961.42 | bwd_inner_microstep: 1956.51 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.14
+[2024-06-18 23:02:03,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:02:03,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.06 | bwd_microstep: 1910.16 | bwd_inner_microstep: 1904.83 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.99
+[2024-06-18 23:02:03,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3966.00 | bwd: 3871.60 | bwd_inner: 3861.38 | bwd_allreduce: 10.08 | step: 62.14
+ 61%|██████    | 428/700 [55:13<34:36,  7.64s/it]                                                 {'loss': 0.3222, 'learning_rate': 3.463718353021138e-05, 'epoch': 4.28}
+ 61%|██████    | 428/700 [55:13<34:36,  7.64s/it][2024-06-18 23:02:07,895] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1998.09 | bwd_microstep: 1964.49 | bwd_inner_microstep: 1959.61 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.08
+[2024-06-18 23:02:11,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:02:11,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1965.06 | bwd_microstep: 1890.79 | bwd_inner_microstep: 1885.41 | bwd_allreduce_microstep: 5.29 | step_microstep: 61.85
+[2024-06-18 23:02:11,832] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3963.12 | bwd: 3855.30 | bwd_inner: 3845.09 | bwd_allreduce: 10.03 | step: 61.93
+ 61%|██████▏   | 429/700 [55:21<34:52,  7.72s/it]                                                 {'loss': 0.3728, 'learning_rate': 3.441719972986846e-05, 'epoch': 4.29}
+ 61%|██████▏   | 429/700 [55:21<34:52,  7.72s/it][2024-06-18 23:02:15,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1992.26 | bwd_microstep: 1959.94 | bwd_inner_microstep: 1955.13 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 23:02:19,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:02:19,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.71 | bwd_microstep: 1900.92 | bwd_inner_microstep: 1895.55 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.44
+[2024-06-18 23:02:19,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3962.94 | bwd: 3860.89 | bwd_inner: 3850.74 | bwd_allreduce: 10.02 | step: 61.52
+ 61%|██████▏   | 430/700 [55:29<35:01,  7.78s/it]                                                 {'loss': 0.5675, 'learning_rate': 3.419754951330608e-05, 'epoch': 4.3}
+ 61%|██████▏   | 430/700 [55:29<35:01,  7.78s/it][2024-06-18 23:02:23,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.34 | bwd_microstep: 1923.21 | bwd_inner_microstep: 1918.39 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 23:02:27,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:02:27,650] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.87 | bwd_microstep: 1919.67 | bwd_inner_microstep: 1914.32 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.80
+[2024-06-18 23:02:27,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3949.18 | bwd: 3842.91 | bwd_inner: 3832.76 | bwd_allreduce: 10.01 | step: 61.88
+ 62%|██████▏   | 431/700 [55:37<35:02,  7.82s/it]                                                 {'loss': 0.3658, 'learning_rate': 3.397823758261553e-05, 'epoch': 4.31}
+ 62%|██████▏   | 431/700 [55:37<35:02,  7.82s/it][2024-06-18 23:02:30,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1497.44 | bwd_microstep: 1824.83 | bwd_inner_microstep: 1820.05 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.08
+[2024-06-18 23:02:34,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:02:34,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1814.74 | bwd_microstep: 1893.90 | bwd_inner_microstep: 1888.44 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.63
+[2024-06-18 23:02:34,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3312.15 | bwd: 3718.75 | bwd_inner: 3708.56 | bwd_allreduce: 10.00 | step: 61.71
+ 62%|██████▏   | 432/700 [55:44<33:59,  7.61s/it]                                                 {'loss': 0.6603, 'learning_rate': 3.37592686326464e-05, 'epoch': 4.32}
+ 62%|██████▏   | 432/700 [55:44<33:59,  7.61s/it][2024-06-18 23:02:38,528] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1916.87 | bwd_microstep: 1807.54 | bwd_inner_microstep: 1802.55 | bwd_allreduce_microstep: 4.90 | step_microstep: 0.09
+[2024-06-18 23:02:42,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:02:42,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2057.37 | bwd_microstep: 2107.37 | bwd_inner_microstep: 2101.96 | bwd_allreduce_microstep: 5.25 | step_microstep: 62.10
+[2024-06-18 23:02:42,775] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3974.21 | bwd: 3914.93 | bwd_inner: 3904.61 | bwd_allreduce: 10.16 | step: 62.20
+ 62%|██████▏   | 433/700 [55:52<34:22,  7.73s/it]                                                 {'loss': 0.4974, 'learning_rate': 3.354064735090599e-05, 'epoch': 4.33}
+ 62%|██████▏   | 433/700 [55:52<34:22,  7.73s/it][2024-06-18 23:02:46,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1495.18 | bwd_microstep: 1824.28 | bwd_inner_microstep: 1819.49 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 23:02:49,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:02:49,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1920.76 | bwd_microstep: 1809.81 | bwd_inner_microstep: 1804.47 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.38
+[2024-06-18 23:02:49,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3415.91 | bwd: 3634.11 | bwd_inner: 3624.01 | bwd_allreduce: 9.97 | step: 61.47
+ 62%|██████▏   | 434/700 [55:59<33:29,  7.55s/it]                                                 {'loss': 0.1962, 'learning_rate': 3.332237841745898e-05, 'epoch': 4.34}
+ 62%|██████▏   | 434/700 [55:59<33:29,  7.55s/it][2024-06-18 23:02:53,796] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1958.88 | bwd_microstep: 1888.76 | bwd_inner_microstep: 1883.93 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.08
+[2024-06-18 23:02:57,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:02:57,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1884.42 | bwd_microstep: 1730.42 | bwd_inner_microstep: 1724.90 | bwd_allreduce_microstep: 5.37 | step_microstep: 61.91
+[2024-06-18 23:02:57,491] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3843.27 | bwd: 3619.20 | bwd_inner: 3608.91 | bwd_allreduce: 10.10 | step: 62.00
+ 62%|██████▏   | 435/700 [56:06<33:22,  7.56s/it]                                                 {'loss': 0.2256, 'learning_rate': 3.310446650482732e-05, 'epoch': 4.35}
+ 62%|██████▏   | 435/700 [56:06<33:22,  7.56s/it][2024-06-18 23:03:01,063] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1856.51 | bwd_microstep: 1693.47 | bwd_inner_microstep: 1688.63 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 23:03:05,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:03:05,006] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.72 | bwd_microstep: 1895.99 | bwd_inner_microstep: 1890.63 | bwd_allreduce_microstep: 5.29 | step_microstep: 61.68
+[2024-06-18 23:03:05,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3824.20 | bwd: 3589.49 | bwd_inner: 3579.31 | bwd_allreduce: 10.04 | step: 61.76
+ 62%|██████▏   | 436/700 [56:14<33:11,  7.54s/it]                                                 {'loss': 0.2978, 'learning_rate': 3.288691627789017e-05, 'epoch': 4.36}
+ 62%|██████▏   | 436/700 [56:14<33:11,  7.54s/it][2024-06-18 23:03:08,992] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1997.16 | bwd_microstep: 1966.84 | bwd_inner_microstep: 1961.89 | bwd_allreduce_microstep: 4.87 | step_microstep: 0.14
+[2024-06-18 23:03:13,046] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:03:13,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2004.35 | bwd_microstep: 1969.49 | bwd_inner_microstep: 1964.05 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.96
+[2024-06-18 23:03:13,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 4001.48 | bwd: 3936.35 | bwd_inner: 3926.01 | bwd_allreduce: 10.15 | step: 62.11
+ 62%|██████▏   | 437/700 [56:22<33:43,  7.69s/it]                                                 {'loss': 0.5792, 'learning_rate': 3.266973239378394e-05, 'epoch': 4.37}
+ 62%|██████▏   | 437/700 [56:22<33:43,  7.69s/it][2024-06-18 23:03:16,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1965.22 | bwd_microstep: 1893.94 | bwd_inner_microstep: 1889.01 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.08
+[2024-06-18 23:03:20,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:03:20,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.39 | bwd_microstep: 1933.80 | bwd_inner_microstep: 1928.42 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.85
+[2024-06-18 23:03:20,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3940.58 | bwd: 3827.77 | bwd_inner: 3817.50 | bwd_allreduce: 10.08 | step: 61.94
+ 63%|██████▎   | 438/700 [56:30<33:49,  7.75s/it]                                                 {'loss': 0.468, 'learning_rate': 3.2452919501802715e-05, 'epoch': 4.38}
+ 63%|██████▎   | 438/700 [56:30<33:49,  7.75s/it][2024-06-18 23:03:24,661] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1914.39 | bwd_microstep: 1808.23 | bwd_inner_microstep: 1803.39 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 23:03:28,598] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:03:28,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.93 | bwd_microstep: 1893.21 | bwd_inner_microstep: 1887.79 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.32
+[2024-06-18 23:03:28,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3879.29 | bwd: 3701.46 | bwd_inner: 3691.25 | bwd_allreduce: 10.05 | step: 61.40
+ 63%|██████▎   | 439/700 [56:38<33:36,  7.73s/it]                                                 {'loss': 0.2519, 'learning_rate': 3.2236482243298714e-05, 'epoch': 4.39}
+ 63%|██████▎   | 439/700 [56:38<33:36,  7.73s/it][2024-06-18 23:03:32,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1996.66 | bwd_microstep: 1961.46 | bwd_inner_microstep: 1956.45 | bwd_allreduce_microstep: 4.92 | step_microstep: 0.08
+[2024-06-18 23:03:36,611] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:03:36,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.55 | bwd_microstep: 1956.74 | bwd_inner_microstep: 1951.50 | bwd_allreduce_microstep: 5.19 | step_microstep: 61.87
+[2024-06-18 23:03:36,612] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3992.18 | bwd: 3918.22 | bwd_inner: 3907.99 | bwd_allreduce: 10.13 | step: 61.96
+ 63%|██████▎   | 440/700 [56:46<33:51,  7.81s/it]                                                 {'loss': 0.7188, 'learning_rate': 3.2020425251582844e-05, 'epoch': 4.4}
+ 63%|██████▎   | 440/700 [56:46<33:51,  7.81s/it][2024-06-18 23:03:40,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1890.41 | bwd_microstep: 1739.04 | bwd_inner_microstep: 1734.22 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.07
+[2024-06-18 23:03:44,204] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:03:44,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.16 | bwd_microstep: 1892.89 | bwd_inner_microstep: 1887.53 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.60
+[2024-06-18 23:03:44,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3859.55 | bwd: 3631.95 | bwd_inner: 3621.77 | bwd_allreduce: 10.05 | step: 61.68
+ 63%|██████▎   | 441/700 [56:53<33:26,  7.75s/it]                                                 {'loss': 0.3078, 'learning_rate': 3.180475315182563e-05, 'epoch': 4.41}
+ 63%|██████▎   | 441/700 [56:53<33:26,  7.75s/it][2024-06-18 23:03:48,086] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.43 | bwd_microstep: 1894.53 | bwd_inner_microstep: 1889.47 | bwd_allreduce_microstep: 4.98 | step_microstep: 0.09
+[2024-06-18 23:03:50,067] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:03:50,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 837.17 | bwd_microstep: 1064.97 | bwd_inner_microstep: 1059.63 | bwd_allreduce_microstep: 5.26 | step_microstep: 62.12
+[2024-06-18 23:03:50,068] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2801.57 | bwd: 2959.52 | bwd_inner: 2949.14 | bwd_allreduce: 10.25 | step: 62.22
+ 63%|██████▎   | 442/700 [56:59<30:52,  7.18s/it]                                                 {'loss': 0.6103, 'learning_rate': 3.1589470560958104e-05, 'epoch': 4.42}
+ 63%|██████▎   | 442/700 [56:59<30:52,  7.18s/it][2024-06-18 23:03:53,937] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1958.86 | bwd_microstep: 1887.81 | bwd_inner_microstep: 1882.91 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 23:03:57,913] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 23:03:57,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.57 | bwd_microstep: 1921.53 | bwd_inner_microstep: 1916.13 | bwd_allreduce_microstep: 5.32 | step_microstep: 62.03
+[2024-06-18 23:03:57,914] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3934.39 | bwd: 3809.37 | bwd_inner: 3799.12 | bwd_allreduce: 10.06 | step: 62.11
+ 63%|██████▎   | 443/700 [57:07<31:36,  7.38s/it]                                                 {'loss': 0.686, 'learning_rate': 3.137458208757302e-05, 'epoch': 4.43}
+ 63%|██████▎   | 443/700 [57:07<31:36,  7.38s/it][2024-06-18 23:04:00,987] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1413.45 | bwd_microstep: 1638.00 | bwd_inner_microstep: 1633.18 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 23:04:04,966] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:04:04,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.07 | bwd_microstep: 1925.21 | bwd_inner_microstep: 1919.75 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.60
+[2024-06-18 23:04:04,967] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3387.49 | bwd: 3563.23 | bwd_inner: 3553.00 | bwd_allreduce: 10.04 | step: 61.68
+ 63%|██████▎   | 444/700 [57:14<31:04,  7.28s/it]                                                 {'loss': 0.3806, 'learning_rate': 3.116009233182623e-05, 'epoch': 4.44}
+ 63%|██████▎   | 444/700 [57:14<31:04,  7.28s/it][2024-06-18 23:04:08,540] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1857.97 | bwd_microstep: 1693.44 | bwd_inner_microstep: 1688.67 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.08
+[2024-06-18 23:04:12,492] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:04:12,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.04 | bwd_microstep: 1900.10 | bwd_inner_microstep: 1894.54 | bwd_allreduce_microstep: 5.48 | step_microstep: 62.23
+[2024-06-18 23:04:12,493] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3830.98 | bwd: 3593.56 | bwd_inner: 3583.23 | bwd_allreduce: 10.22 | step: 62.31
+ 64%|██████▎   | 445/700 [57:21<31:15,  7.36s/it]                                                 {'loss': 0.1448, 'learning_rate': 3.0946005885338113e-05, 'epoch': 4.45}
+ 64%|██████▎   | 445/700 [57:21<31:15,  7.36s/it][2024-06-18 23:04:16,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.59 | bwd_microstep: 1962.20 | bwd_inner_microstep: 1957.31 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.07
+[2024-06-18 23:04:20,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:04:20,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.79 | bwd_microstep: 1897.55 | bwd_inner_microstep: 1892.04 | bwd_allreduce_microstep: 5.39 | step_microstep: 62.49
+[2024-06-18 23:04:20,421] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3964.34 | bwd: 3859.77 | bwd_inner: 3849.46 | bwd_allreduce: 10.14 | step: 62.57
+ 64%|██████▎   | 446/700 [57:29<31:51,  7.53s/it]                                                 {'loss': 0.5645, 'learning_rate': 3.073232733109536e-05, 'epoch': 4.46}
+ 64%|██████▎   | 446/700 [57:29<31:51,  7.53s/it][2024-06-18 23:04:24,290] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1959.75 | bwd_microstep: 1887.62 | bwd_inner_microstep: 1882.55 | bwd_allreduce_microstep: 4.95 | step_microstep: 0.09
+[2024-06-18 23:04:27,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:04:27,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1598.03 | bwd_microstep: 1745.01 | bwd_inner_microstep: 1739.64 | bwd_allreduce_microstep: 5.29 | step_microstep: 61.65
+[2024-06-18 23:04:27,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3557.74 | bwd: 3632.65 | bwd_inner: 3622.26 | bwd_allreduce: 10.25 | step: 61.74
+ 64%|██████▍   | 447/700 [57:37<31:26,  7.46s/it]                                                 {'loss': 0.1435, 'learning_rate': 3.0519061243352834e-05, 'epoch': 4.47}
+ 64%|██████▍   | 447/700 [57:37<31:26,  7.46s/it][2024-06-18 23:04:31,628] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.00 | bwd_microstep: 1922.07 | bwd_inner_microstep: 1917.23 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.08
+[2024-06-18 23:04:35,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:04:35,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.83 | bwd_microstep: 1891.52 | bwd_inner_microstep: 1886.01 | bwd_allreduce_microstep: 5.37 | step_microstep: 61.83
+[2024-06-18 23:04:35,565] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3934.79 | bwd: 3813.61 | bwd_inner: 3803.30 | bwd_allreduce: 10.13 | step: 61.91
+ 64%|██████▍   | 448/700 [57:45<31:48,  7.58s/it]                                                 {'loss': 0.477, 'learning_rate': 3.0306212187535653e-05, 'epoch': 4.48}
+ 64%|██████▍   | 448/700 [57:45<31:48,  7.58s/it][2024-06-18 23:04:39,056] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1828.63 | bwd_microstep: 1640.24 | bwd_inner_microstep: 1635.41 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.07
+[2024-06-18 23:04:43,102] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:04:43,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2000.36 | bwd_microstep: 1966.88 | bwd_inner_microstep: 1961.62 | bwd_allreduce_microstep: 5.19 | step_microstep: 61.84
+[2024-06-18 23:04:43,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3828.97 | bwd: 3607.15 | bwd_inner: 3597.07 | bwd_allreduce: 9.95 | step: 61.92
+ 64%|██████▍   | 449/700 [57:52<31:38,  7.56s/it]                                                 {'loss': 0.4546, 'learning_rate': 3.0093784720141455e-05, 'epoch': 4.49}
+ 64%|██████▍   | 449/700 [57:52<31:38,  7.56s/it][2024-06-18 23:04:46,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1626.17 | bwd_microstep: 1659.73 | bwd_inner_microstep: 1654.77 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.07
+[2024-06-18 23:04:50,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 23:04:50,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.39 | bwd_microstep: 1896.92 | bwd_inner_microstep: 1891.28 | bwd_allreduce_microstep: 5.55 | step_microstep: 64.36
+[2024-06-18 23:04:50,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3595.52 | bwd: 3556.68 | bwd_inner: 3546.12 | bwd_allreduce: 10.36 | step: 64.44
+ 64%|██████▍   | 450/700 [57:59<31:08,  7.47s/it]                                                 {'loss': 0.7156, 'learning_rate': 2.9881783388642893e-05, 'epoch': 4.5}
+ 64%|██████▍   | 450/700 [57:59<31:08,  7.47s/it][2024-06-18 23:04:54,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.76 | bwd_microstep: 1892.51 | bwd_inner_microstep: 1887.67 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.07
+[2024-06-18 23:04:57,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:04:57,401] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1421.60 | bwd_microstep: 1656.50 | bwd_inner_microstep: 1651.01 | bwd_allreduce_microstep: 5.41 | step_microstep: 62.52
+[2024-06-18 23:04:57,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3389.33 | bwd: 3549.03 | bwd_inner: 3538.71 | bwd_allreduce: 10.19 | step: 62.60
+ 64%|██████▍   | 451/700 [58:06<30:28,  7.34s/it]                                                 {'loss': 0.3016, 'learning_rate': 2.96702127313902e-05, 'epoch': 4.51}
+ 64%|██████▍   | 451/700 [58:06<30:28,  7.34s/it][2024-06-18 23:05:01,058] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1892.64 | bwd_microstep: 1741.41 | bwd_inner_microstep: 1736.42 | bwd_allreduce_microstep: 4.84 | step_microstep: 0.08
+[2024-06-18 23:05:05,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 23:05:05,131] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2007.31 | bwd_microstep: 1985.13 | bwd_inner_microstep: 1979.70 | bwd_allreduce_microstep: 5.34 | step_microstep: 62.37
+[2024-06-18 23:05:05,132] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3899.91 | bwd: 3726.55 | bwd_inner: 3716.19 | bwd_allreduce: 10.17 | step: 62.45
+ 65%|██████▍   | 452/700 [58:14<30:49,  7.46s/it]                                                 {'loss': 0.3981, 'learning_rate': 2.945907727751412e-05, 'epoch': 4.52}
+ 65%|██████▍   | 452/700 [58:14<30:49,  7.46s/it][2024-06-18 23:05:09,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1994.32 | bwd_microstep: 1958.20 | bwd_inner_microstep: 1953.37 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.07
+[2024-06-18 23:05:13,050] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.80
+[2024-06-18 23:05:13,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.92 | bwd_microstep: 1897.19 | bwd_inner_microstep: 1891.73 | bwd_allreduce_microstep: 5.34 | step_microstep: 61.59
+[2024-06-18 23:05:13,051] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3962.20 | bwd: 3855.41 | bwd_inner: 3845.18 | bwd_allreduce: 10.09 | step: 61.68
+ 65%|██████▍   | 453/700 [58:22<31:16,  7.60s/it]                                                 {'loss': 0.4881, 'learning_rate': 2.924838154682893e-05, 'epoch': 4.53}
+ 65%|██████▍   | 453/700 [58:22<31:16,  7.60s/it][2024-06-18 23:05:16,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.69 | bwd_microstep: 1904.82 | bwd_inner_microstep: 1899.74 | bwd_allreduce_microstep: 4.93 | step_microstep: 0.14
+[2024-06-18 23:05:20,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:05:20,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.18 | bwd_microstep: 1896.01 | bwd_inner_microstep: 1890.54 | bwd_allreduce_microstep: 5.33 | step_microstep: 61.85
+[2024-06-18 23:05:20,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3930.85 | bwd: 3800.85 | bwd_inner: 3790.38 | bwd_allreduce: 10.23 | step: 62.00
+ 65%|██████▍   | 454/700 [58:30<31:26,  7.67s/it]                                                 {'loss': 0.4777, 'learning_rate': 2.9038130049735634e-05, 'epoch': 4.54}
+ 65%|██████▍   | 454/700 [58:30<31:26,  7.67s/it][2024-06-18 23:05:24,374] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1826.20 | bwd_microstep: 1640.16 | bwd_inner_microstep: 1635.28 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.07
+[2024-06-18 23:05:28,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:05:28,372] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1982.12 | bwd_microstep: 1936.19 | bwd_inner_microstep: 1930.65 | bwd_allreduce_microstep: 5.46 | step_microstep: 62.59
+[2024-06-18 23:05:28,373] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3808.29 | bwd: 3576.37 | bwd_inner: 3565.96 | bwd_allreduce: 10.28 | step: 62.67
+ 65%|██████▌   | 455/700 [58:37<31:05,  7.61s/it]                                                 {'loss': 0.4819, 'learning_rate': 2.882832728712551e-05, 'epoch': 4.55}
+ 65%|██████▌   | 455/700 [58:37<31:05,  7.61s/it][2024-06-18 23:05:32,294] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.44 | bwd_microstep: 1925.71 | bwd_inner_microstep: 1920.82 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.08
+[2024-06-18 23:05:36,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:05:36,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1999.42 | bwd_microstep: 1961.94 | bwd_inner_microstep: 1956.48 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.68
+[2024-06-18 23:05:36,336] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3972.83 | bwd: 3887.66 | bwd_inner: 3877.41 | bwd_allreduce: 10.08 | step: 61.77
+ 65%|██████▌   | 456/700 [58:45<31:23,  7.72s/it]                                                 {'loss': 0.6737, 'learning_rate': 2.8618977750283603e-05, 'epoch': 4.56}
+ 65%|██████▌   | 456/700 [58:45<31:23,  7.72s/it][2024-06-18 23:05:40,328] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2001.19 | bwd_microstep: 1969.49 | bwd_inner_microstep: 1964.73 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 23:05:44,321] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:05:44,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1978.40 | bwd_microstep: 1934.68 | bwd_inner_microstep: 1929.29 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.65
+[2024-06-18 23:05:44,322] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3979.55 | bwd: 3904.19 | bwd_inner: 3894.03 | bwd_allreduce: 10.05 | step: 61.74
+ 65%|██████▌   | 457/700 [58:53<31:35,  7.80s/it]                                                 {'loss': 0.5443, 'learning_rate': 2.8410085920792807e-05, 'epoch': 4.57}
+ 65%|██████▌   | 457/700 [58:53<31:35,  7.80s/it][2024-06-18 23:05:48,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1996.65 | bwd_microstep: 1963.03 | bwd_inner_microstep: 1958.06 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.07
+[2024-06-18 23:05:52,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:05:52,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.61 | bwd_microstep: 1924.26 | bwd_inner_microstep: 1918.91 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.60
+[2024-06-18 23:05:52,281] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3970.23 | bwd: 3887.31 | bwd_inner: 3877.04 | bwd_allreduce: 10.08 | step: 61.68
+ 65%|██████▌   | 458/700 [59:01<31:38,  7.85s/it]                                                 {'loss': 0.4713, 'learning_rate': 2.8201656270437658e-05, 'epoch': 4.58}
+ 65%|██████▌   | 458/700 [59:01<31:38,  7.85s/it][2024-06-18 23:05:55,771] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1827.85 | bwd_microstep: 1640.39 | bwd_inner_microstep: 1635.23 | bwd_allreduce_microstep: 5.06 | step_microstep: 0.10
+[2024-06-18 23:05:59,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:05:59,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1965.36 | bwd_microstep: 1889.98 | bwd_inner_microstep: 1884.60 | bwd_allreduce_microstep: 5.32 | step_microstep: 62.28
+[2024-06-18 23:05:59,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3793.18 | bwd: 3530.38 | bwd_inner: 3519.85 | bwd_allreduce: 10.39 | step: 62.38
+ 66%|██████▌   | 459/700 [59:09<31:00,  7.72s/it]                                                 {'loss': 0.3005, 'learning_rate': 2.7993693261108823e-05, 'epoch': 4.59}
+ 66%|██████▌   | 459/700 [59:09<31:00,  7.72s/it][2024-06-18 23:06:03,238] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1653.79 | bwd_microstep: 1853.13 | bwd_inner_microstep: 1848.26 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.08
+[2024-06-18 23:06:07,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.95
+[2024-06-18 23:06:07,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.51 | bwd_microstep: 1900.47 | bwd_inner_microstep: 1895.06 | bwd_allreduce_microstep: 5.33 | step_microstep: 63.02
+[2024-06-18 23:06:07,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3624.26 | bwd: 3753.62 | bwd_inner: 3743.36 | bwd_allreduce: 10.12 | step: 63.10
+ 66%|██████▌   | 460/700 [59:16<30:35,  7.65s/it]                                                 {'loss': 0.3711, 'learning_rate': 2.7786201344707486e-05, 'epoch': 4.6}
+ 66%|██████▌   | 460/700 [59:16<30:35,  7.65s/it][2024-06-18 23:06:11,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.06 | bwd_microstep: 1919.21 | bwd_inner_microstep: 1914.37 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 23:06:14,800] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:06:14,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1886.21 | bwd_microstep: 1726.59 | bwd_inner_microstep: 1721.20 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.84
+[2024-06-18 23:06:14,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3859.24 | bwd: 3645.82 | bwd_inner: 3635.61 | bwd_allreduce: 10.08 | step: 61.92
+ 66%|██████▌   | 461/700 [59:24<30:25,  7.64s/it]                                                 {'loss': 0.1466, 'learning_rate': 2.7579184963050052e-05, 'epoch': 4.61}
+ 66%|██████▌   | 461/700 [59:24<30:25,  7.64s/it][2024-06-18 23:06:18,235] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1531.09 | bwd_microstep: 1882.39 | bwd_inner_microstep: 1877.38 | bwd_allreduce_microstep: 4.88 | step_microstep: 0.14
+[2024-06-18 23:06:21,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:06:21,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1594.39 | bwd_microstep: 1742.48 | bwd_inner_microstep: 1737.02 | bwd_allreduce_microstep: 5.38 | step_microstep: 61.98
+[2024-06-18 23:06:21,653] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3125.45 | bwd: 3624.89 | bwd_inner: 3614.46 | bwd_allreduce: 10.27 | step: 62.12
+ 66%|██████▌   | 462/700 [59:31<29:21,  7.40s/it]                                                 {'loss': 0.312, 'learning_rate': 2.737264854777306e-05, 'epoch': 4.62}
+ 66%|██████▌   | 462/700 [59:31<29:21,  7.40s/it][2024-06-18 23:06:25,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1889.80 | bwd_microstep: 1741.92 | bwd_inner_microstep: 1737.08 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 23:06:29,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:06:29,292] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1977.08 | bwd_microstep: 1927.28 | bwd_inner_microstep: 1921.90 | bwd_allreduce_microstep: 5.33 | step_microstep: 63.00
+[2024-06-18 23:06:29,293] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3866.85 | bwd: 3669.22 | bwd_inner: 3659.00 | bwd_allreduce: 10.10 | step: 63.08
+ 66%|██████▌   | 463/700 [59:38<29:31,  7.47s/it]                                                 {'loss': 0.4356, 'learning_rate': 2.716659652023833e-05, 'epoch': 4.63}
+ 66%|██████▌   | 463/700 [59:38<29:31,  7.47s/it][2024-06-18 23:06:32,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1857.69 | bwd_microstep: 1693.88 | bwd_inner_microstep: 1688.97 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.09
+[2024-06-18 23:06:36,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:06:36,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.58 | bwd_microstep: 1917.70 | bwd_inner_microstep: 1912.33 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.97
+[2024-06-18 23:06:36,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3837.24 | bwd: 3611.60 | bwd_inner: 3601.31 | bwd_allreduce: 10.16 | step: 62.06
+ 66%|██████▋   | 464/700 [59:46<29:29,  7.50s/it]                                                 {'loss': 0.3489, 'learning_rate': 2.6961033291438343e-05, 'epoch': 4.64}
+ 66%|██████▋   | 464/700 [59:46<29:29,  7.50s/it][2024-06-18 23:06:40,030] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1440.52 | bwd_microstep: 1721.36 | bwd_inner_microstep: 1716.55 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 23:06:44,011] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:06:44,011] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.86 | bwd_microstep: 1926.90 | bwd_inner_microstep: 1921.40 | bwd_allreduce_microstep: 5.35 | step_microstep: 61.82
+[2024-06-18 23:06:44,012] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3414.35 | bwd: 3648.27 | bwd_inner: 3638.03 | bwd_allreduce: 10.06 | step: 61.90
+ 66%|██████▋   | 465/700 [59:53<28:58,  7.40s/it]                                                 {'loss': 0.2129, 'learning_rate': 2.6755963261901708e-05, 'epoch': 4.65}
+ 66%|██████▋   | 465/700 [59:53<28:58,  7.40s/it][2024-06-18 23:06:47,506] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1644.54 | bwd_microstep: 1828.66 | bwd_inner_microstep: 1823.84 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 23:06:51,447] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:06:51,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1965.16 | bwd_microstep: 1895.61 | bwd_inner_microstep: 1890.18 | bwd_allreduce_microstep: 5.28 | step_microstep: 62.66
+[2024-06-18 23:06:51,448] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3609.67 | bwd: 3724.29 | bwd_inner: 3714.09 | bwd_allreduce: 10.02 | step: 62.74
+ 67%|██████▋   | 466/700 [1:00:00<28:53,  7.41s/it]                                                   {'loss': 0.4563, 'learning_rate': 2.6551390821599076e-05, 'epoch': 4.66}
+ 67%|██████▋   | 466/700 [1:00:00<28:53,  7.41s/it][2024-06-18 23:06:55,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2001.13 | bwd_microstep: 1971.90 | bwd_inner_microstep: 1966.98 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.08
+[2024-06-18 23:06:59,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 23:06:59,383] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.46 | bwd_microstep: 1891.09 | bwd_inner_microstep: 1885.38 | bwd_allreduce_microstep: 5.57 | step_microstep: 62.62
+[2024-06-18 23:06:59,384] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3969.56 | bwd: 3863.01 | bwd_inner: 3852.46 | bwd_allreduce: 10.36 | step: 62.70
+ 67%|██████▋   | 467/700 [1:00:08<29:23,  7.57s/it]                                                   {'loss': 0.6174, 'learning_rate': 2.6347320349849146e-05, 'epoch': 4.67}
+ 67%|██████▋   | 467/700 [1:00:08<29:23,  7.57s/it][2024-06-18 23:07:02,637] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1566.52 | bwd_microstep: 1665.36 | bwd_inner_microstep: 1660.41 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.07
+[2024-06-18 23:07:06,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:07:06,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.71 | bwd_microstep: 1892.77 | bwd_inner_microstep: 1887.37 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.45
+[2024-06-18 23:07:06,575] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3531.20 | bwd: 3558.15 | bwd_inner: 3547.85 | bwd_allreduce: 10.10 | step: 61.52
+ 67%|██████▋   | 468/700 [1:00:16<28:49,  7.45s/it]                                                   {'loss': 0.4202, 'learning_rate': 2.6143756215224802e-05, 'epoch': 4.68}
+ 67%|██████▋   | 468/700 [1:00:16<28:49,  7.45s/it][2024-06-18 23:07:10,206] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1883.81 | bwd_microstep: 1725.47 | bwd_inner_microstep: 1720.51 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.07
+[2024-06-18 23:07:14,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 23:07:14,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.59 | bwd_microstep: 1925.38 | bwd_inner_microstep: 1919.83 | bwd_allreduce_microstep: 5.40 | step_microstep: 61.75
+[2024-06-18 23:07:14,192] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3863.37 | bwd: 3650.87 | bwd_inner: 3640.45 | bwd_allreduce: 10.18 | step: 61.83
+ 67%|██████▋   | 469/700 [1:00:23<28:53,  7.50s/it]                                                   {'loss': 0.2551, 'learning_rate': 2.5940702775459747e-05, 'epoch': 4.69}
+ 67%|██████▋   | 469/700 [1:00:23<28:53,  7.50s/it][2024-06-18 23:07:18,091] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1965.90 | bwd_microstep: 1911.39 | bwd_inner_microstep: 1906.55 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 23:07:21,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:07:21,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1817.59 | bwd_microstep: 1897.18 | bwd_inner_microstep: 1891.72 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.49
+[2024-06-18 23:07:21,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3783.46 | bwd: 3808.60 | bwd_inner: 3798.34 | bwd_allreduce: 10.07 | step: 61.58
+ 67%|██████▋   | 470/700 [1:00:31<28:58,  7.56s/it]                                                   {'loss': 0.4952, 'learning_rate': 2.5738164377355145e-05, 'epoch': 4.7}
+ 67%|██████▋   | 470/700 [1:00:31<28:58,  7.56s/it][2024-06-18 23:07:25,769] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.24 | bwd_microstep: 1895.19 | bwd_inner_microstep: 1890.12 | bwd_allreduce_microstep: 4.99 | step_microstep: 0.14
+[2024-06-18 23:07:29,494] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:07:29,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1895.02 | bwd_microstep: 1748.67 | bwd_inner_microstep: 1743.16 | bwd_allreduce_microstep: 5.36 | step_microstep: 61.82
+[2024-06-18 23:07:29,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3861.23 | bwd: 3643.89 | bwd_inner: 3633.36 | bwd_allreduce: 10.33 | step: 61.97
+ 67%|██████▋   | 471/700 [1:00:38<28:54,  7.57s/it]                                                   {'loss': 0.1808, 'learning_rate': 2.5536145356686524e-05, 'epoch': 4.71}
+ 67%|██████▋   | 471/700 [1:00:38<28:54,  7.57s/it][2024-06-18 23:07:33,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.72 | bwd_microstep: 1905.26 | bwd_inner_microstep: 1900.30 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.07
+[2024-06-18 23:07:37,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:07:37,342] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.02 | bwd_microstep: 1898.63 | bwd_inner_microstep: 1893.13 | bwd_allreduce_microstep: 5.36 | step_microstep: 62.04
+[2024-06-18 23:07:37,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3940.71 | bwd: 3803.91 | bwd_inner: 3793.54 | bwd_allreduce: 10.14 | step: 62.12
+ 67%|██████▋   | 472/700 [1:00:46<29:05,  7.66s/it]                                                   {'loss': 0.6142, 'learning_rate': 2.5334650038111048e-05, 'epoch': 4.72}
+ 67%|██████▋   | 472/700 [1:00:46<29:05,  7.66s/it][2024-06-18 23:07:41,271] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.74 | bwd_microstep: 1929.59 | bwd_inner_microstep: 1924.67 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.09
+[2024-06-18 23:07:44,924] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:07:44,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1679.98 | bwd_microstep: 1894.29 | bwd_inner_microstep: 1888.95 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.83
+[2024-06-18 23:07:44,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3656.69 | bwd: 3823.90 | bwd_inner: 3813.69 | bwd_allreduce: 10.04 | step: 61.92
+ 68%|██████▊   | 473/700 [1:00:54<28:52,  7.63s/it]                                                   {'loss': 0.604, 'learning_rate': 2.5133682735074904e-05, 'epoch': 4.73}
+ 68%|██████▊   | 473/700 [1:00:54<28:52,  7.63s/it][2024-06-18 23:07:47,998] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1414.67 | bwd_microstep: 1638.11 | bwd_inner_microstep: 1633.14 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.07
+[2024-06-18 23:07:51,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:07:51,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.79 | bwd_microstep: 1929.62 | bwd_inner_microstep: 1924.14 | bwd_allreduce_microstep: 5.40 | step_microstep: 61.95
+[2024-06-18 23:07:51,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3391.43 | bwd: 3567.76 | bwd_inner: 3557.35 | bwd_allreduce: 10.21 | step: 62.03
+ 68%|██████▊   | 474/700 [1:01:01<28:06,  7.46s/it]                                                   {'loss': 0.2581, 'learning_rate': 2.4933247749720912e-05, 'epoch': 4.74}
+ 68%|██████▊   | 474/700 [1:01:01<28:06,  7.46s/it][2024-06-18 23:07:55,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1628.14 | bwd_microstep: 1661.19 | bwd_inner_microstep: 1656.26 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.07
+[2024-06-18 23:07:59,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:07:59,285] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.95 | bwd_microstep: 1932.20 | bwd_inner_microstep: 1926.65 | bwd_allreduce_microstep: 5.40 | step_microstep: 61.89
+[2024-06-18 23:07:59,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3605.06 | bwd: 3593.42 | bwd_inner: 3583.03 | bwd_allreduce: 10.13 | step: 61.98
+ 68%|██████▊   | 475/700 [1:01:08<27:48,  7.41s/it]                                                   {'loss': 0.5465, 'learning_rate': 2.4733349372796507e-05, 'epoch': 4.75}
+ 68%|██████▊   | 475/700 [1:01:08<27:48,  7.41s/it][2024-06-18 23:08:02,632] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1496.74 | bwd_microstep: 1828.45 | bwd_inner_microstep: 1823.49 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.07
+[2024-06-18 23:08:06,570] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 23:08:06,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.50 | bwd_microstep: 1892.84 | bwd_inner_microstep: 1887.21 | bwd_allreduce_microstep: 5.53 | step_microstep: 63.81
+[2024-06-18 23:08:06,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3460.21 | bwd: 3721.32 | bwd_inner: 3710.78 | bwd_allreduce: 10.33 | step: 63.89
+ 68%|██████▊   | 476/700 [1:01:16<27:32,  7.38s/it]                                                   {'loss': 0.6839, 'learning_rate': 2.4533991883561868e-05, 'epoch': 4.76}
+ 68%|██████▊   | 476/700 [1:01:16<27:32,  7.38s/it][2024-06-18 23:08:10,200] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1880.71 | bwd_microstep: 1725.75 | bwd_inner_microstep: 1720.80 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.07
+[2024-06-18 23:08:14,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:08:14,143] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.06 | bwd_microstep: 1894.69 | bwd_inner_microstep: 1889.23 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.59
+[2024-06-18 23:08:14,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3849.75 | bwd: 3620.47 | bwd_inner: 3610.14 | bwd_allreduce: 10.07 | step: 61.67
+ 68%|██████▊   | 477/700 [1:01:23<27:37,  7.43s/it]                                                   {'loss': 0.3046, 'learning_rate': 2.4335179549698233e-05, 'epoch': 4.77}
+ 68%|██████▊   | 477/700 [1:01:23<27:37,  7.43s/it][2024-06-18 23:08:18,059] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.79 | bwd_microstep: 1921.92 | bwd_inner_microstep: 1916.98 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.08
+[2024-06-18 23:08:22,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:08:22,040] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.49 | bwd_microstep: 1924.70 | bwd_inner_microstep: 1919.18 | bwd_allreduce_microstep: 5.37 | step_microstep: 61.66
+[2024-06-18 23:08:22,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3947.24 | bwd: 3846.64 | bwd_inner: 3836.27 | bwd_allreduce: 10.15 | step: 61.75
+ 68%|██████▊   | 478/700 [1:01:31<28:01,  7.57s/it]                                                   {'loss': 0.2704, 'learning_rate': 2.4136916627216655e-05, 'epoch': 4.78}
+ 68%|██████▊   | 478/700 [1:01:31<28:01,  7.57s/it][2024-06-18 23:08:25,935] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.17 | bwd_microstep: 1903.43 | bwd_inner_microstep: 1898.45 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.07
+[2024-06-18 23:08:29,999] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:08:30,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2011.39 | bwd_microstep: 1971.93 | bwd_inner_microstep: 1966.36 | bwd_allreduce_microstep: 5.49 | step_microstep: 62.57
+[2024-06-18 23:08:30,000] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3980.52 | bwd: 3875.38 | bwd_inner: 3864.88 | bwd_allreduce: 10.31 | step: 62.65
+ 68%|██████▊   | 479/700 [1:01:39<28:19,  7.69s/it]                                                   {'loss': 0.6112, 'learning_rate': 2.3939207360366832e-05, 'epoch': 4.79}
+ 68%|██████▊   | 479/700 [1:01:39<28:19,  7.69s/it][2024-06-18 23:08:33,896] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.89 | bwd_microstep: 1904.31 | bwd_inner_microstep: 1899.45 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.07
+[2024-06-18 23:08:37,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:08:37,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1697.08 | bwd_microstep: 1936.20 | bwd_inner_microstep: 1930.83 | bwd_allreduce_microstep: 5.28 | step_microstep: 62.19
+[2024-06-18 23:08:37,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3666.93 | bwd: 3840.53 | bwd_inner: 3830.32 | bwd_allreduce: 10.07 | step: 62.27
+ 69%|██████▊   | 480/700 [1:01:47<28:06,  7.67s/it]                                                   {'loss': 0.6618, 'learning_rate': 2.374205598154624e-05, 'epoch': 4.8}
+ 69%|██████▊   | 480/700 [1:01:47<28:06,  7.67s/it][2024-06-18 23:08:41,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1959.38 | bwd_microstep: 1885.94 | bwd_inner_microstep: 1880.99 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.07
+[2024-06-18 23:08:45,473] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:08:45,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.34 | bwd_microstep: 1936.89 | bwd_inner_microstep: 1931.39 | bwd_allreduce_microstep: 5.36 | step_microstep: 62.32
+[2024-06-18 23:08:45,474] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3938.69 | bwd: 3822.86 | bwd_inner: 3812.48 | bwd_allreduce: 10.16 | step: 62.40
+ 69%|██████▊   | 481/700 [1:01:54<28:11,  7.73s/it]                                                   {'loss': 0.5694, 'learning_rate': 2.3545466711209585e-05, 'epoch': 4.81}
+ 69%|██████▊   | 481/700 [1:01:54<28:11,  7.73s/it][2024-06-18 23:08:49,457] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1998.97 | bwd_microstep: 1961.86 | bwd_inner_microstep: 1956.96 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 23:08:53,513] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:08:53,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2000.94 | bwd_microstep: 1974.82 | bwd_inner_microstep: 1969.49 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.93
+[2024-06-18 23:08:53,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3999.87 | bwd: 3936.71 | bwd_inner: 3926.54 | bwd_allreduce: 10.00 | step: 62.01
+ 69%|██████▉   | 482/700 [1:02:03<28:24,  7.82s/it]                                                   {'loss': 0.6574, 'learning_rate': 2.3349443757778343e-05, 'epoch': 4.82}
+ 69%|██████▉   | 482/700 [1:02:03<28:24,  7.82s/it][2024-06-18 23:08:57,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.99 | bwd_microstep: 1926.52 | bwd_inner_microstep: 1921.50 | bwd_allreduce_microstep: 4.94 | step_microstep: 0.08
+[2024-06-18 23:08:59,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:08:59,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 946.00 | bwd_microstep: 979.79 | bwd_inner_microstep: 974.22 | bwd_allreduce_microstep: 5.42 | step_microstep: 61.80
+[2024-06-18 23:08:59,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2920.96 | bwd: 2906.33 | bwd_inner: 2895.80 | bwd_allreduce: 10.34 | step: 61.88
+ 69%|██████▉   | 483/700 [1:02:08<26:13,  7.25s/it]                                                   {'loss': 0.4976, 'learning_rate': 2.315399131755081e-05, 'epoch': 4.83}
+ 69%|██████▉   | 483/700 [1:02:08<26:13,  7.25s/it][2024-06-18 23:09:03,094] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1890.51 | bwd_microstep: 1739.22 | bwd_inner_microstep: 1734.38 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 23:09:06,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:09:06,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1917.60 | bwd_microstep: 1808.73 | bwd_inner_microstep: 1803.49 | bwd_allreduce_microstep: 5.19 | step_microstep: 61.78
+[2024-06-18 23:09:06,901] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3808.09 | bwd: 3547.96 | bwd_inner: 3537.88 | bwd_allreduce: 9.97 | step: 61.86
+ 69%|██████▉   | 484/700 [1:02:16<26:19,  7.31s/it]                                                   {'loss': 0.043, 'learning_rate': 2.29591135746122e-05, 'epoch': 4.84}
+ 69%|██████▉   | 484/700 [1:02:16<26:19,  7.31s/it][2024-06-18 23:09:10,795] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.13 | bwd_microstep: 1905.81 | bwd_inner_microstep: 1900.71 | bwd_allreduce_microstep: 5.00 | step_microstep: 0.10
+[2024-06-18 23:09:14,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:09:14,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1894.63 | bwd_microstep: 1744.90 | bwd_inner_microstep: 1739.57 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.91
+[2024-06-18 23:09:14,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3861.73 | bwd: 3650.73 | bwd_inner: 3640.30 | bwd_allreduce: 10.31 | step: 62.02
+ 69%|██████▉   | 485/700 [1:02:24<26:31,  7.40s/it]                                                   {'loss': 0.245, 'learning_rate': 2.2764814700745025e-05, 'epoch': 4.85}
+ 69%|██████▉   | 485/700 [1:02:24<26:31,  7.40s/it][2024-06-18 23:09:18,253] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1909.58 | bwd_microstep: 1804.54 | bwd_inner_microstep: 1799.58 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.07
+[2024-06-18 23:09:21,829] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:09:21,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1658.98 | bwd_microstep: 1839.02 | bwd_inner_microstep: 1833.67 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.40
+[2024-06-18 23:09:21,830] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3568.53 | bwd: 3643.59 | bwd_inner: 3633.31 | bwd_allreduce: 10.08 | step: 61.48
+ 69%|██████▉   | 486/700 [1:02:31<26:18,  7.38s/it]                                                   {'loss': 0.2813, 'learning_rate': 2.25710988553399e-05, 'epoch': 4.86}
+ 69%|██████▉   | 486/700 [1:02:31<26:18,  7.38s/it][2024-06-18 23:09:25,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.10 | bwd_microstep: 1966.79 | bwd_inner_microstep: 1961.90 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.08
+[2024-06-18 23:09:29,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:09:29,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1497.39 | bwd_microstep: 1814.10 | bwd_inner_microstep: 1808.69 | bwd_allreduce_microstep: 5.34 | step_microstep: 61.92
+[2024-06-18 23:09:29,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3492.46 | bwd: 3780.92 | bwd_inner: 3770.62 | bwd_allreduce: 10.16 | step: 62.01
+ 70%|██████▉   | 487/700 [1:02:38<26:11,  7.38s/it]                                                   {'loss': 0.5447, 'learning_rate': 2.2377970185306424e-05, 'epoch': 4.87}
+ 70%|██████▉   | 487/700 [1:02:38<26:11,  7.38s/it][2024-06-18 23:09:32,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1542.36 | bwd_microstep: 1635.09 | bwd_inner_microstep: 1630.04 | bwd_allreduce_microstep: 4.96 | step_microstep: 0.08
+[2024-06-18 23:09:36,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 23:09:36,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.02 | bwd_microstep: 1929.00 | bwd_inner_microstep: 1923.32 | bwd_allreduce_microstep: 5.53 | step_microstep: 62.90
+[2024-06-18 23:09:36,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3521.34 | bwd: 3564.11 | bwd_inner: 3553.43 | bwd_allreduce: 10.51 | step: 62.99
+ 70%|██████���   | 488/700 [1:02:45<25:51,  7.32s/it]                                                   {'loss': 0.114, 'learning_rate': 2.2185432824984453e-05, 'epoch': 4.88}
+ 70%|██████▉   | 488/700 [1:02:45<25:51,  7.32s/it][2024-06-18 23:09:39,589] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1445.69 | bwd_microstep: 1727.09 | bwd_inner_microstep: 1721.97 | bwd_allreduce_microstep: 5.04 | step_microstep: 0.08
+[2024-06-18 23:09:43,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 23:09:43,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2016.87 | bwd_microstep: 2000.35 | bwd_inner_microstep: 1994.74 | bwd_allreduce_microstep: 5.46 | step_microstep: 63.88
+[2024-06-18 23:09:43,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3462.53 | bwd: 3727.46 | bwd_inner: 3716.79 | bwd_allreduce: 10.48 | step: 63.97
+ 70%|██████▉   | 489/700 [1:02:53<25:43,  7.31s/it]                                                   {'loss': 0.7998, 'learning_rate': 2.1993490896055512e-05, 'epoch': 4.89}
+ 70%|██████▉   | 489/700 [1:02:53<25:43,  7.31s/it][2024-06-18 23:09:46,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1547.93 | bwd_microstep: 1644.56 | bwd_inner_microstep: 1639.65 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.08
+[2024-06-18 23:09:50,906] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:09:50,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1982.06 | bwd_microstep: 1928.74 | bwd_inner_microstep: 1923.24 | bwd_allreduce_microstep: 5.44 | step_microstep: 62.43
+[2024-06-18 23:09:50,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3529.96 | bwd: 3573.32 | bwd_inner: 3562.91 | bwd_allreduce: 10.28 | step: 62.52
+ 70%|███████   | 490/700 [1:03:00<25:29,  7.28s/it]                                                   {'loss': 0.7277, 'learning_rate': 2.180214850745467e-05, 'epoch': 4.9}
+ 70%|███████   | 490/700 [1:03:00<25:29,  7.28s/it][2024-06-18 23:09:54,831] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1977.40 | bwd_microstep: 1924.96 | bwd_inner_microstep: 1919.90 | bwd_allreduce_microstep: 4.97 | step_microstep: 0.08
+[2024-06-18 23:09:58,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:09:58,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.38 | bwd_microstep: 1901.02 | bwd_inner_microstep: 1895.59 | bwd_allreduce_microstep: 5.35 | step_microstep: 61.72
+[2024-06-18 23:09:58,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3949.75 | bwd: 3825.99 | bwd_inner: 3815.53 | bwd_allreduce: 10.33 | step: 61.80
+ 70%|███████   | 491/700 [1:03:08<25:59,  7.46s/it]                                                   {'loss': 0.5846, 'learning_rate': 2.161140975528254e-05, 'epoch': 4.91}
+ 70%|███████   | 491/700 [1:03:08<25:59,  7.46s/it][2024-06-18 23:10:02,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1758.06 | bwd_microstep: 1786.26 | bwd_inner_microstep: 1781.31 | bwd_allreduce_microstep: 4.86 | step_microstep: 0.08
+[2024-06-18 23:10:06,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:10:06,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2007.28 | bwd_microstep: 1982.21 | bwd_inner_microstep: 1976.88 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.41
+[2024-06-18 23:10:06,420] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3765.30 | bwd: 3768.49 | bwd_inner: 3758.21 | bwd_allreduce: 10.16 | step: 61.49
+ 70%|███████   | 492/700 [1:03:15<26:02,  7.51s/it]                                                   {'loss': 0.5622, 'learning_rate': 2.1421278722717524e-05, 'epoch': 4.92}
+ 70%|███████   | 492/700 [1:03:15<26:02,  7.51s/it][2024-06-18 23:10:09,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1703.32 | bwd_microstep: 1670.25 | bwd_inner_microstep: 1665.35 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.07
+[2024-06-18 23:10:13,532] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:10:13,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1896.05 | bwd_microstep: 1741.07 | bwd_inner_microstep: 1735.33 | bwd_allreduce_microstep: 5.59 | step_microstep: 62.38
+[2024-06-18 23:10:13,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3599.33 | bwd: 3411.34 | bwd_inner: 3400.75 | bwd_allreduce: 10.40 | step: 62.46
+ 70%|███████   | 493/700 [1:03:23<25:30,  7.39s/it]                                                   {'loss': 0.2222, 'learning_rate': 2.123175947992851e-05, 'epoch': 4.93}
+ 70%|███████   | 493/700 [1:03:23<25:30,  7.39s/it][2024-06-18 23:10:17,163] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1881.79 | bwd_microstep: 1725.97 | bwd_inner_microstep: 1721.06 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.08
+[2024-06-18 23:10:21,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:10:21,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2004.28 | bwd_microstep: 1977.29 | bwd_inner_microstep: 1971.86 | bwd_allreduce_microstep: 5.35 | step_microstep: 61.70
+[2024-06-18 23:10:21,225] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3886.04 | bwd: 3703.28 | bwd_inner: 3692.99 | bwd_allreduce: 10.14 | step: 61.78
+ 71%|███████   | 494/700 [1:03:30<25:41,  7.48s/it]                                                   {'loss': 0.2649, 'learning_rate': 2.1042856083987695e-05, 'epoch': 4.94}
+ 71%|███████   | 494/700 [1:03:30<25:41,  7.48s/it][2024-06-18 23:10:24,973] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1917.25 | bwd_microstep: 1809.53 | bwd_inner_microstep: 1804.52 | bwd_allreduce_microstep: 4.87 | step_microstep: 0.08
+[2024-06-18 23:10:28,814] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:10:28,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1838.61 | bwd_microstep: 1922.70 | bwd_inner_microstep: 1917.33 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.63
+[2024-06-18 23:10:28,815] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3755.82 | bwd: 3732.26 | bwd_inner: 3721.90 | bwd_allreduce: 10.17 | step: 61.72
+ 71%|███████   | 495/700 [1:03:38<25:40,  7.52s/it]                                                   {'loss': 0.2307, 'learning_rate': 2.0854572578783686e-05, 'epoch': 4.95}
+ 71%|███████   | 495/700 [1:03:38<25:40,  7.52s/it][2024-06-18 23:10:32,004] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1446.72 | bwd_microstep: 1721.31 | bwd_inner_microstep: 1716.44 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.08
+[2024-06-18 23:10:35,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:10:35,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1978.77 | bwd_microstep: 1912.04 | bwd_inner_microstep: 1906.66 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.81
+[2024-06-18 23:10:35,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3425.46 | bwd: 3633.38 | bwd_inner: 3623.14 | bwd_allreduce: 10.11 | step: 61.90
+ 71%|███████   | 496/700 [1:03:45<25:11,  7.41s/it]                                                   {'loss': 0.1544, 'learning_rate': 2.0666912994935034e-05, 'epoch': 4.96}
+ 71%|███████   | 496/700 [1:03:45<25:11,  7.41s/it][2024-06-18 23:10:39,257] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1575.94 | bwd_microstep: 1684.75 | bwd_inner_microstep: 1679.70 | bwd_allreduce_microstep: 4.98 | step_microstep: 0.14
+[2024-06-18 23:10:42,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:10:42,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1533.26 | bwd_microstep: 1882.63 | bwd_inner_microstep: 1877.12 | bwd_allreduce_microstep: 5.36 | step_microstep: 61.71
+[2024-06-18 23:10:42,753] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3109.17 | bwd: 3567.40 | bwd_inner: 3556.88 | bwd_allreduce: 10.33 | step: 61.86
+ 71%|███████   | 497/700 [1:03:52<24:25,  7.22s/it]                                                   {'loss': 0.3855, 'learning_rate': 2.0479881349703883e-05, 'epoch': 4.97}
+ 71%|███████   | 497/700 [1:03:52<24:25,  7.22s/it][2024-06-18 23:10:46,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1890.70 | bwd_microstep: 1740.71 | bwd_inner_microstep: 1735.79 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.07
+[2024-06-18 23:10:50,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:10:50,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.72 | bwd_microstep: 1926.82 | bwd_inner_microstep: 1921.31 | bwd_allreduce_microstep: 5.36 | step_microstep: 61.40
+[2024-06-18 23:10:50,387] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3865.39 | bwd: 3667.55 | bwd_inner: 3657.21 | bwd_allreduce: 10.10 | step: 61.48
+ 71%|███████   | 498/700 [1:03:59<24:43,  7.34s/it]                                                   {'loss': 0.3163, 'learning_rate': 2.0293481646909934e-05, 'epoch': 4.98}
+ 71%|███████   | 498/700 [1:03:59<24:43,  7.34s/it][2024-06-18 23:10:53,497] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1426.16 | bwd_microstep: 1662.18 | bwd_inner_microstep: 1657.20 | bwd_allreduce_microstep: 4.84 | step_microstep: 0.08
+[2024-06-18 23:10:57,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:10:57,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1978.53 | bwd_microstep: 1924.56 | bwd_inner_microstep: 1919.16 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.45
+[2024-06-18 23:10:57,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3404.66 | bwd: 3586.76 | bwd_inner: 3576.43 | bwd_allreduce: 10.15 | step: 61.54
+ 71%|███████▏  | 499/700 [1:04:06<24:20,  7.27s/it]                                                   {'loss': 0.5392, 'learning_rate': 2.0107717876844838e-05, 'epoch': 4.99}
+ 71%|███████▏  | 499/700 [1:04:06<24:20,  7.27s/it][2024-06-18 23:11:01,356] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.13 | bwd_microstep: 1889.32 | bwd_inner_microstep: 1884.37 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.08
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+[2024-06-18 23:11:06,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:11:06,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.56 | bwd_microstep: 1928.37 | bwd_inner_microstep: 1922.77 | bwd_allreduce_microstep: 5.52 | step_microstep: 62.05
+[2024-06-18 23:11:06,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3933.63 | bwd: 3817.71 | bwd_inner: 3807.21 | bwd_allreduce: 10.32 | step: 62.14
+ 71%|███████▏  | 500/700 [1:04:15<25:39,  7.70s/it]                                                   {'loss': 0.6052, 'learning_rate': 1.9922594016186713e-05, 'epoch': 5.0}
+ 71%|███████▏  | 500/700 [1:04:15<25:39,  7.70s/it][2024-06-18 23:11:08,884] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 23:11:14,739] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 23:11:20,565] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 23:11:26,397] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 23:11:33,809] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1999.60 | bwd_microstep: 1941.77 | bwd_inner_microstep: 1936.77 | bwd_allreduce_microstep: 4.95 | step_microstep: 0.09
+[2024-06-18 23:11:38,037] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:11:38,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2043.75 | bwd_microstep: 2104.47 | bwd_inner_microstep: 2099.02 | bwd_allreduce_microstep: 5.31 | step_microstep: 62.29
+[2024-06-18 23:11:38,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 4043.29 | bwd: 4046.26 | bwd_inner: 4035.84 | bwd_allreduce: 10.29 | step: 62.39
+ 72%|███████▏  | 501/700 [1:04:47<49:34, 14.95s/it]                                                   {'loss': 0.5806, 'learning_rate': 1.9738114027915006e-05, 'epoch': 5.01}
+ 72%|███████▏  | 501/700 [1:04:47<49:34, 14.95s/it][2024-06-18 23:11:41,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1494.03 | bwd_microstep: 1824.93 | bwd_inner_microstep: 1820.07 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.08
+[2024-06-18 23:11:45,310] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.92
+[2024-06-18 23:11:45,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1957.78 | bwd_microstep: 1894.38 | bwd_inner_microstep: 1888.98 | bwd_allreduce_microstep: 5.34 | step_microstep: 61.74
+[2024-06-18 23:11:45,311] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3451.78 | bwd: 3719.33 | bwd_inner: 3709.08 | bwd_allreduce: 10.14 | step: 61.82
+ 72%|███████▏  | 502/700 [1:04:54<41:43, 12.64s/it]                                                   {'loss': 0.5953, 'learning_rate': 1.9554281861225694e-05, 'epoch': 5.02}
+ 72%|███████▏  | 502/700 [1:04:54<41:43, 12.64s/it][2024-06-18 23:11:48,956] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1883.95 | bwd_microstep: 1738.85 | bwd_inner_microstep: 1733.89 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.07
+[2024-06-18 23:11:52,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:11:52,920] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.63 | bwd_microstep: 1915.48 | bwd_inner_microstep: 1910.09 | bwd_allreduce_microstep: 5.34 | step_microstep: 61.55
+[2024-06-18 23:11:52,921] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3853.54 | bwd: 3654.36 | bwd_inner: 3644.02 | bwd_allreduce: 10.15 | step: 61.63
+ 72%|███████▏  | 503/700 [1:05:02<36:33, 11.13s/it]                                                   {'loss': 0.1115, 'learning_rate': 1.937110145144668e-05, 'epoch': 5.03}
+ 72%|███████▏  | 503/700 [1:05:02<36:33, 11.13s/it][2024-06-18 23:11:56,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1856.85 | bwd_microstep: 1690.73 | bwd_inner_microstep: 1685.64 | bwd_allreduce_microstep: 5.01 | step_microstep: 0.14
+[2024-06-18 23:12:00,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:12:00,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.05 | bwd_microstep: 1898.56 | bwd_inner_microstep: 1893.08 | bwd_allreduce_microstep: 5.37 | step_microstep: 61.89
+[2024-06-18 23:12:00,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3822.87 | bwd: 3589.31 | bwd_inner: 3578.79 | bwd_allreduce: 10.39 | step: 62.04
+ 72%|███████▏  | 504/700 [1:05:09<32:49, 10.05s/it]                                                   {'loss': 0.221, 'learning_rate': 1.9188576719953633e-05, 'epoch': 5.04}
+ 72%|███████▏  | 504/700 [1:05:09<32:49, 10.05s/it][2024-06-18 23:12:04,344] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.04 | bwd_microstep: 1919.44 | bwd_inner_microstep: 1914.57 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.08
+[2024-06-18 23:12:08,139] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:12:08,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1910.44 | bwd_microstep: 1805.26 | bwd_inner_microstep: 1799.90 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.67
+[2024-06-18 23:12:08,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3877.44 | bwd: 3724.72 | bwd_inner: 3714.49 | bwd_allreduce: 10.11 | step: 61.75
+ 72%|███████▏  | 505/700 [1:05:17<30:22,  9.34s/it]                                                   {'loss': 0.1939, 'learning_rate': 1.9006711574086005e-05, 'epoch': 5.05}
+ 72%|███████▏  | 505/700 [1:05:17<30:22,  9.34s/it][2024-06-18 23:12:11,343] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1542.96 | bwd_microstep: 1638.31 | bwd_inner_microstep: 1633.25 | bwd_allreduce_microstep: 4.97 | step_microstep: 0.09
+[2024-06-18 23:12:14,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:12:14,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1650.66 | bwd_microstep: 1836.59 | bwd_inner_microstep: 1831.05 | bwd_allreduce_microstep: 5.40 | step_microstep: 61.70
+[2024-06-18 23:12:14,910] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3193.58 | bwd: 3474.92 | bwd_inner: 3464.37 | bwd_allreduce: 10.35 | step: 61.81
+ 72%|███████▏  | 506/700 [1:05:24<27:43,  8.57s/it]                                                   {'loss': 0.3278, 'learning_rate': 1.8825509907063327e-05, 'epoch': 5.06}
+ 72%|███████▏  | 506/700 [1:05:24<27:43,  8.57s/it][2024-06-18 23:12:18,907] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2003.00 | bwd_microstep: 1972.39 | bwd_inner_microstep: 1967.42 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.08
+[2024-06-18 23:12:22,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:12:22,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1507.64 | bwd_microstep: 1840.31 | bwd_inner_microstep: 1834.77 | bwd_allreduce_microstep: 5.45 | step_microstep: 61.68
+[2024-06-18 23:12:22,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3510.61 | bwd: 3812.72 | bwd_inner: 3802.27 | bwd_allreduce: 10.25 | step: 61.77
+ 72%|███████▏  | 507/700 [1:05:31<26:28,  8.23s/it]                                                   {'loss': 0.4018, 'learning_rate': 1.8644975597901977e-05, 'epoch': 5.07}
+ 72%|███████▏  | 507/700 [1:05:31<26:28,  8.23s/it][2024-06-18 23:12:25,857] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1650.29 | bwd_microstep: 1849.84 | bwd_inner_microstep: 1844.97 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.07
+[2024-06-18 23:12:29,551] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:12:29,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1886.87 | bwd_microstep: 1727.08 | bwd_inner_microstep: 1721.51 | bwd_allreduce_microstep: 5.43 | step_microstep: 61.68
+[2024-06-18 23:12:29,552] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3537.12 | bwd: 3576.94 | bwd_inner: 3566.55 | bwd_allreduce: 10.18 | step: 61.76
+ 73%|███████▎  | 508/700 [1:05:39<25:21,  7.92s/it]                                                   {'loss': 0.117, 'learning_rate': 1.8465112511332065e-05, 'epoch': 5.08}
+ 73%|███████▎  | 508/700 [1:05:39<25:21,  7.92s/it][2024-06-18 23:12:33,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1999.28 | bwd_microstep: 1960.91 | bwd_inner_microstep: 1955.76 | bwd_allreduce_microstep: 5.07 | step_microstep: 0.08
+[2024-06-18 23:12:36,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.92
+[2024-06-18 23:12:36,868] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1574.13 | bwd_microstep: 1677.48 | bwd_inner_microstep: 1671.86 | bwd_allreduce_microstep: 5.54 | step_microstep: 64.02
+[2024-06-18 23:12:36,869] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3573.37 | bwd: 3638.42 | bwd_inner: 3627.65 | bwd_allreduce: 10.62 | step: 64.11
+ 73%|███████▎  | 509/700 [1:05:46<24:38,  7.74s/it]                                                   {'loss': 0.2117, 'learning_rate': 1.8285924497714703e-05, 'epoch': 5.09}
+ 73%|███████▎  | 509/700 [1:05:46<24:38,  7.74s/it][2024-06-18 23:12:40,505] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1882.61 | bwd_microstep: 1731.21 | bwd_inner_microstep: 1726.40 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.08
+[2024-06-18 23:12:43,980] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:12:43,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1526.08 | bwd_microstep: 1870.98 | bwd_inner_microstep: 1865.59 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.69
+[2024-06-18 23:12:43,981] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3408.66 | bwd: 3602.22 | bwd_inner: 3592.04 | bwd_allreduce: 10.04 | step: 61.77
+ 73%|███████▎  | 510/700 [1:05:53<23:55,  7.55s/it]                                                   {'loss': 0.3809, 'learning_rate': 1.8107415392959614e-05, 'epoch': 5.1}
+ 73%|███████▎  | 510/700 [1:05:53<23:55,  7.55s/it][2024-06-18 23:12:47,721] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1911.40 | bwd_microstep: 1806.16 | bwd_inner_microstep: 1801.24 | bwd_allreduce_microstep: 4.84 | step_microstep: 0.08
+[2024-06-18 23:12:51,787] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:12:51,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2006.06 | bwd_microstep: 1981.32 | bwd_inner_microstep: 1975.90 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.68
+[2024-06-18 23:12:51,788] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3917.43 | bwd: 3787.50 | bwd_inner: 3777.21 | bwd_allreduce: 10.11 | step: 61.77
+ 73%|███████▎  | 511/700 [1:06:01<24:01,  7.63s/it]                                                   {'loss': 0.2844, 'learning_rate': 1.7929589018443016e-05, 'epoch': 5.11}
+ 73%|███████▎  | 511/700 [1:06:01<24:01,  7.63s/it][2024-06-18 23:12:53,368] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 730.00 | bwd_microstep: 828.98 | bwd_inner_microstep: 824.15 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.08
+[2024-06-18 23:12:56,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:12:56,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1645.52 | bwd_microstep: 1833.25 | bwd_inner_microstep: 1827.92 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.74
+[2024-06-18 23:12:56,926] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2375.49 | bwd: 2662.25 | bwd_inner: 2652.11 | bwd_allreduce: 10.01 | step: 61.83
+ 73%|███████▎  | 512/700 [1:06:06<21:33,  6.88s/it]                                                   {'loss': 0.148, 'learning_rate': 1.7752449180925747e-05, 'epoch': 5.12}
+ 73%|███████▎  | 512/700 [1:06:06<21:33,  6.88s/it][2024-06-18 23:13:00,810] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1965.35 | bwd_microstep: 1896.20 | bwd_inner_microstep: 1891.21 | bwd_allreduce_microstep: 4.91 | step_microstep: 0.14
+[2024-06-18 23:13:04,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:13:04,296] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1529.12 | bwd_microstep: 1876.93 | bwd_inner_microstep: 1871.55 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.88
+[2024-06-18 23:13:04,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3494.43 | bwd: 3773.15 | bwd_inner: 3762.83 | bwd_allreduce: 10.19 | step: 62.03
+ 73%|███████▎  | 513/700 [1:06:13<21:54,  7.03s/it]                                                   {'loss': 0.4519, 'learning_rate': 1.7575999672471867e-05, 'epoch': 5.13}
+ 73%|███████▎  | 513/700 [1:06:13<21:54,  7.03s/it][2024-06-18 23:13:08,208] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.90 | bwd_microstep: 1916.62 | bwd_inner_microstep: 1911.77 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.07
+[2024-06-18 23:13:12,185] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:13:12,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.79 | bwd_microstep: 1922.66 | bwd_inner_microstep: 1917.15 | bwd_allreduce_microstep: 5.44 | step_microstep: 62.48
+[2024-06-18 23:13:12,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3947.66 | bwd: 3839.30 | bwd_inner: 3828.95 | bwd_allreduce: 10.21 | step: 62.57
+ 73%|███████▎  | 514/700 [1:06:21<22:35,  7.29s/it]                                                   {'loss': 0.4638, 'learning_rate': 1.7400244270367428e-05, 'epoch': 5.14}
+ 73%|███████▎  | 514/700 [1:06:21<22:35,  7.29s/it][2024-06-18 23:13:16,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1996.17 | bwd_microstep: 1961.38 | bwd_inner_microstep: 1956.55 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.07
+[2024-06-18 23:13:20,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:13:20,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1978.16 | bwd_microstep: 1925.30 | bwd_inner_microstep: 1919.96 | bwd_allreduce_microstep: 5.30 | step_microstep: 62.11
+[2024-06-18 23:13:20,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3974.30 | bwd: 3886.71 | bwd_inner: 3876.53 | bwd_allreduce: 10.06 | step: 62.19
+ 74%|███████▎  | 515/700 [1:06:29<23:05,  7.49s/it]                                                   {'loss': 0.3697, 'learning_rate': 1.7225186737039638e-05, 'epoch': 5.15}
+ 74%|███████▎  | 515/700 [1:06:29<23:05,  7.49s/it][2024-06-18 23:13:24,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2003.50 | bwd_microstep: 1975.38 | bwd_inner_microstep: 1970.52 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.07
+[2024-06-18 23:13:28,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:13:28,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.64 | bwd_microstep: 1940.10 | bwd_inner_microstep: 1934.85 | bwd_allreduce_microstep: 5.21 | step_microstep: 61.47
+[2024-06-18 23:13:28,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3983.10 | bwd: 3915.50 | bwd_inner: 3905.38 | bwd_allreduce: 10.00 | step: 61.55
+ 74%|███████▎  | 516/700 [1:06:37<23:26,  7.64s/it]                                                   {'loss': 0.3685, 'learning_rate': 1.7050830819976267e-05, 'epoch': 5.16}
+ 74%|███████▎  | 516/700 [1:06:37<23:26,  7.64s/it][2024-06-18 23:13:32,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1999.44 | bwd_microstep: 1966.31 | bwd_inner_microstep: 1961.42 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.07
+[2024-06-18 23:13:36,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:13:36,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1978.40 | bwd_microstep: 1931.62 | bwd_inner_microstep: 1926.05 | bwd_allreduce_microstep: 5.49 | step_microstep: 62.87
+[2024-06-18 23:13:36,128] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3977.81 | bwd: 3897.95 | bwd_inner: 3887.54 | bwd_allreduce: 10.22 | step: 62.95
+ 74%|███████▍  | 517/700 [1:06:45<23:37,  7.74s/it]                                                   {'loss': 0.4112, 'learning_rate': 1.6877180251645486e-05, 'epoch': 5.17}
+ 74%|███████▍  | 517/700 [1:06:45<23:37,  7.74s/it][2024-06-18 23:13:40,007] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.80 | bwd_microstep: 1894.05 | bwd_inner_microstep: 1889.12 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.07
+[2024-06-18 23:13:43,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:13:43,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.68 | bwd_microstep: 1882.77 | bwd_inner_microstep: 1877.23 | bwd_allreduce_microstep: 5.41 | step_microstep: 62.00
+[2024-06-18 23:13:43,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3924.44 | bwd: 3776.84 | bwd_inner: 3766.46 | bwd_allreduce: 10.18 | step: 62.08
+ 74%|███████▍  | 518/700 [1:06:53<23:32,  7.76s/it]                                                   {'loss': 0.1764, 'learning_rate': 1.6704238749415957e-05, 'epoch': 5.18}
+ 74%|███████▍  | 518/700 [1:06:53<23:32,  7.76s/it][2024-06-18 23:13:47,853] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.62 | bwd_microstep: 1926.76 | bwd_inner_microstep: 1922.05 | bwd_allreduce_microstep: 4.67 | step_microstep: 0.07
+[2024-06-18 23:13:51,848] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:13:51,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1981.37 | bwd_microstep: 1934.32 | bwd_inner_microstep: 1928.82 | bwd_allreduce_microstep: 5.35 | step_microstep: 61.91
+[2024-06-18 23:13:51,849] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3953.95 | bwd: 3861.11 | bwd_inner: 3850.92 | bwd_allreduce: 10.04 | step: 61.99
+ 74%|███████▍  | 519/700 [1:07:01<23:33,  7.81s/it]                                                   {'loss': 0.5628, 'learning_rate': 1.653201001547719e-05, 'epoch': 5.19}
+ 74%|███████▍  | 519/700 [1:07:01<23:33,  7.81s/it][2024-06-18 23:13:55,178] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1495.39 | bwd_microstep: 1812.74 | bwd_inner_microstep: 1807.83 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 23:13:59,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:13:59,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.50 | bwd_microstep: 1897.69 | bwd_inner_microstep: 1892.40 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.72
+[2024-06-18 23:13:59,127] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3465.85 | bwd: 3710.45 | bwd_inner: 3700.28 | bwd_allreduce: 10.00 | step: 61.80
+ 74%|███████▍  | 520/700 [1:07:08<22:56,  7.65s/it]                                                   {'loss': 0.3652, 'learning_rate': 1.6360497736760383e-05, 'epoch': 5.2}
+ 74%|███████▍  | 520/700 [1:07:08<22:56,  7.65s/it][2024-06-18 23:14:03,027] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.35 | bwd_microstep: 1906.91 | bwd_inner_microstep: 1901.96 | bwd_allreduce_microstep: 4.87 | step_microstep: 0.14
+[2024-06-18 23:14:07,085] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:14:07,085] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2003.20 | bwd_microstep: 1974.94 | bwd_inner_microstep: 1969.63 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.84
+[2024-06-18 23:14:07,086] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3974.53 | bwd: 3881.87 | bwd_inner: 3871.61 | bwd_allreduce: 10.12 | step: 61.98
+ 74%|███████▍  | 521/700 [1:07:16<23:05,  7.74s/it]                                                   {'loss': 0.4692, 'learning_rate': 1.618970558485942e-05, 'epoch': 5.21}
+ 74%|███████▍  | 521/700 [1:07:16<23:05,  7.74s/it][2024-06-18 23:14:11,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.04 | bwd_microstep: 1929.96 | bwd_inner_microstep: 1925.16 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 23:14:14,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:14:14,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.74 | bwd_microstep: 1926.46 | bwd_inner_microstep: 1920.96 | bwd_allreduce_microstep: 5.36 | step_microstep: 62.05
+[2024-06-18 23:14:14,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3949.75 | bwd: 3856.44 | bwd_inner: 3846.19 | bwd_allreduce: 10.06 | step: 62.13
+ 75%|███████▍  | 522/700 [1:07:24<23:06,  7.79s/it]                                                   {'loss': 0.1951, 'learning_rate': 1.601963721595232e-05, 'epoch': 5.22}
+ 75%|███████▍  | 522/700 [1:07:24<23:06,  7.79s/it][2024-06-18 23:14:18,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1495.54 | bwd_microstep: 1812.34 | bwd_inner_microstep: 1807.36 | bwd_allreduce_microstep: 4.84 | step_microstep: 0.09
+[2024-06-18 23:14:21,404] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:14:21,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1392.46 | bwd_microstep: 1609.75 | bwd_inner_microstep: 1604.31 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.90
+[2024-06-18 23:14:21,405] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2887.97 | bwd: 3422.11 | bwd_inner: 3411.77 | bwd_allreduce: 10.10 | step: 61.99
+ 75%|███████▍  | 523/700 [1:07:30<21:45,  7.38s/it]                                                   {'loss': 0.2395, 'learning_rate': 1.5850296270722964e-05, 'epoch': 5.23}
+ 75%|███████▍  | 523/700 [1:07:30<21:45,  7.38s/it][2024-06-18 23:14:25,302] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.03 | bwd_microstep: 1907.82 | bwd_inner_microstep: 1902.99 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 23:14:29,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:14:29,243] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.08 | bwd_microstep: 1893.67 | bwd_inner_microstep: 1888.31 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.61
+[2024-06-18 23:14:29,244] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3935.08 | bwd: 3801.51 | bwd_inner: 3791.32 | bwd_allreduce: 10.06 | step: 61.69
+ 75%|███████▍  | 524/700 [1:07:38<22:02,  7.52s/it]                                                   {'loss': 0.3004, 'learning_rate': 1.5681686374283088e-05, 'epoch': 5.24}
+ 75%|███████▍  | 524/700 [1:07:38<22:02,  7.52s/it][2024-06-18 23:14:33,122] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.87 | bwd_microstep: 1892.28 | bwd_inner_microstep: 1887.44 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 23:14:37,197] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:14:37,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2013.17 | bwd_microstep: 1982.99 | bwd_inner_microstep: 1977.54 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.92
+[2024-06-18 23:14:37,198] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3977.02 | bwd: 3875.29 | bwd_inner: 3865.05 | bwd_allreduce: 10.05 | step: 62.00
+ 75%|███████▌  | 525/700 [1:07:46<22:18,  7.65s/it]                                                   {'loss': 0.584, 'learning_rate': 1.5513811136094787e-05, 'epoch': 5.25}
+ 75%|███████▌  | 525/700 [1:07:46<22:18,  7.65s/it][2024-06-18 23:14:41,077] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.77 | bwd_microstep: 1895.25 | bwd_inner_microstep: 1890.43 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 23:14:44,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.91
+[2024-06-18 23:14:44,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1820.30 | bwd_microstep: 1897.65 | bwd_inner_microstep: 1891.99 | bwd_allreduce_microstep: 5.52 | step_microstep: 62.94
+[2024-06-18 23:14:44,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3782.04 | bwd: 3792.92 | bwd_inner: 3782.47 | bwd_allreduce: 10.27 | step: 63.01
+ 75%|███████▌  | 526/700 [1:07:54<22:12,  7.66s/it]                                                   {'loss': 0.4504, 'learning_rate': 1.5346674149893202e-05, 'epoch': 5.26}
+ 75%|███████▌  | 526/700 [1:07:54<22:12,  7.66s/it][2024-06-18 23:14:48,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.91 | bwd_microstep: 1912.19 | bwd_inner_microstep: 1907.37 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.08
+[2024-06-18 23:14:51,916] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:14:51,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1417.29 | bwd_microstep: 1641.61 | bwd_inner_microstep: 1636.27 | bwd_allreduce_microstep: 5.26 | step_microstep: 62.13
+[2024-06-18 23:14:51,917] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3385.16 | bwd: 3553.82 | bwd_inner: 3543.67 | bwd_allreduce: 10.02 | step: 62.21
+ 75%|███████▌  | 527/700 [1:08:01<21:32,  7.47s/it]                                                   {'loss': 0.2288, 'learning_rate': 1.5180278993609526e-05, 'epoch': 5.27}
+ 75%|███████▌  | 527/700 [1:08:01<21:32,  7.47s/it][2024-06-18 23:14:55,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.39 | bwd_microstep: 1917.78 | bwd_inner_microstep: 1912.89 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.09
+[2024-06-18 23:14:59,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:14:59,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1919.78 | bwd_microstep: 1815.56 | bwd_inner_microstep: 1810.26 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.82
+[2024-06-18 23:14:59,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3887.13 | bwd: 3733.36 | bwd_inner: 3723.17 | bwd_allreduce: 10.07 | step: 61.93
+ 75%|███████▌  | 528/700 [1:08:09<21:38,  7.55s/it]                                                   {'loss': 0.2076, 'learning_rate': 1.5014629229294524e-05, 'epoch': 5.28}
+ 75%|███████▌  | 528/700 [1:08:09<21:38,  7.55s/it][2024-06-18 23:15:03,516] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.63 | bwd_microstep: 1892.94 | bwd_inner_microstep: 1885.81 | bwd_allreduce_microstep: 6.99 | step_microstep: 0.07
+[2024-06-18 23:15:07,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:15:07,323] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1917.48 | bwd_microstep: 1809.04 | bwd_inner_microstep: 1803.53 | bwd_allreduce_microstep: 5.35 | step_microstep: 61.78
+[2024-06-18 23:15:07,324] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3879.08 | bwd: 3702.01 | bwd_inner: 3689.45 | bwd_allreduce: 12.32 | step: 61.86
+ 76%|███████▌  | 529/700 [1:08:16<21:37,  7.59s/it]                                                   {'loss': 0.1697, 'learning_rate': 1.4849728403042213e-05, 'epoch': 5.29}
+ 76%|███████▌  | 529/700 [1:08:16<21:37,  7.59s/it][2024-06-18 23:15:11,199] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.94 | bwd_microstep: 1891.53 | bwd_inner_microstep: 1886.49 | bwd_allreduce_microstep: 4.90 | step_microstep: 0.14
+[2024-06-18 23:15:15,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:15:15,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1977.37 | bwd_microstep: 1909.32 | bwd_inner_microstep: 1904.05 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.81
+[2024-06-18 23:15:15,166] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3939.27 | bwd: 3800.88 | bwd_inner: 3790.59 | bwd_allreduce: 10.12 | step: 61.96
+ 76%|███████▌  | 530/700 [1:08:24<21:42,  7.66s/it]                                                   {'loss': 0.4365, 'learning_rate': 1.4685580044913921e-05, 'epoch': 5.3}
+ 76%|███████▌  | 530/700 [1:08:24<21:42,  7.66s/it][2024-06-18 23:15:18,364] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1543.51 | bwd_microstep: 1633.01 | bwd_inner_microstep: 1628.20 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 23:15:22,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:15:22,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.62 | bwd_microstep: 1896.61 | bwd_inner_microstep: 1891.14 | bwd_allreduce_microstep: 5.40 | step_microstep: 62.68
+[2024-06-18 23:15:22,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3510.11 | bwd: 3529.65 | bwd_inner: 3519.37 | bwd_allreduce: 10.14 | step: 62.76
+ 76%|███████▌  | 531/700 [1:08:31<21:08,  7.51s/it]                                                   {'loss': 0.4248, 'learning_rate': 1.4522187668862796e-05, 'epoch': 5.31}
+ 76%|███████▌  | 531/700 [1:08:31<21:08,  7.51s/it][2024-06-18 23:15:25,945] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1884.46 | bwd_microstep: 1730.34 | bwd_inner_microstep: 1725.43 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 23:15:29,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:15:29,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.36 | bwd_microstep: 1896.13 | bwd_inner_microstep: 1890.78 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.98
+[2024-06-18 23:15:29,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3853.80 | bwd: 3626.49 | bwd_inner: 3616.26 | bwd_allreduce: 10.04 | step: 62.05
+ 76%|███████▌  | 532/700 [1:08:39<21:05,  7.53s/it]                                                   {'loss': 0.12, 'learning_rate': 1.4359554772658552e-05, 'epoch': 5.32}
+ 76%|███████▌  | 532/700 [1:08:39<21:05,  7.53s/it][2024-06-18 23:15:33,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1978.51 | bwd_microstep: 1932.86 | bwd_inner_microstep: 1928.03 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.07
+[2024-06-18 23:15:37,816] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:15:37,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.68 | bwd_microstep: 1935.42 | bwd_inner_microstep: 1929.94 | bwd_allreduce_microstep: 5.34 | step_microstep: 62.00
+[2024-06-18 23:15:37,817] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3955.16 | bwd: 3868.30 | bwd_inner: 3858.08 | bwd_allreduce: 10.04 | step: 62.08
+ 76%|███████▌  | 533/700 [1:08:47<21:17,  7.65s/it]                                                   {'loss': 0.3856, 'learning_rate': 1.419768483781252e-05, 'epoch': 5.33}
+ 76%|███████▌  | 533/700 [1:08:47<21:17,  7.65s/it][2024-06-18 23:15:41,750] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1980.26 | bwd_microstep: 1930.94 | bwd_inner_microstep: 1925.96 | bwd_allreduce_microstep: 4.84 | step_microstep: 0.07
+[2024-06-18 23:15:44,255] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:15:44,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1106.71 | bwd_microstep: 1319.31 | bwd_inner_microstep: 1313.97 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.70
+[2024-06-18 23:15:44,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3086.93 | bwd: 3250.27 | bwd_inner: 3239.99 | bwd_allreduce: 10.09 | step: 61.78
+ 76%|███████▋  | 534/700 [1:08:53<20:09,  7.29s/it]                                                   {'loss': 0.2775, 'learning_rate': 1.4036581329503245e-05, 'epoch': 5.34}
+ 76%|███████▋  | 534/700 [1:08:53<20:09,  7.29s/it][2024-06-18 23:15:47,471] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1547.93 | bwd_microstep: 1645.74 | bwd_inner_microstep: 1640.96 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 23:15:51,436] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 23:15:51,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.77 | bwd_microstep: 1909.92 | bwd_inner_microstep: 1904.32 | bwd_allreduce_microstep: 5.52 | step_microstep: 63.80
+[2024-06-18 23:15:51,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3521.67 | bwd: 3555.69 | bwd_inner: 3545.31 | bwd_allreduce: 10.23 | step: 63.87
+ 76%|███████▋  | 535/700 [1:09:00<19:56,  7.25s/it]                                                   {'loss': 0.4864, 'learning_rate': 1.3876247696502238e-05, 'epoch': 5.35}
+ 76%|███████▋  | 535/700 [1:09:00<19:56,  7.25s/it][2024-06-18 23:15:55,371] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.54 | bwd_microstep: 1935.39 | bwd_inner_microstep: 1930.65 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.07
+[2024-06-18 23:15:59,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:15:59,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.31 | bwd_microstep: 1924.78 | bwd_inner_microstep: 1919.38 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.40
+[2024-06-18 23:15:59,355] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3955.81 | bwd: 3860.19 | bwd_inner: 3850.08 | bwd_allreduce: 9.97 | step: 61.48
+ 77%|███████▋  | 536/700 [1:09:08<20:22,  7.45s/it]                                                   {'loss': 0.5636, 'learning_rate': 1.3716687371100096e-05, 'epoch': 5.36}
+ 77%|███████▋  | 536/700 [1:09:08<20:22,  7.45s/it][2024-06-18 23:16:02,608] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1566.85 | bwd_microstep: 1664.96 | bwd_inner_microstep: 1660.08 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.08
+[2024-06-18 23:16:03,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:16:03,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 481.39 | bwd_microstep: 447.42 | bwd_inner_microstep: 442.13 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.89
+[2024-06-18 23:16:03,616] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2048.21 | bwd: 2112.40 | bwd_inner: 2102.24 | bwd_allreduce: 10.02 | step: 61.98
+ 77%|███████▋  | 537/700 [1:09:13<17:38,  6.50s/it]                                                   {'loss': 0.0053, 'learning_rate': 1.3557903769033148e-05, 'epoch': 5.37}
+ 77%|███████▋  | 537/700 [1:09:13<17:38,  6.50s/it][2024-06-18 23:16:07,486] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1957.06 | bwd_microstep: 1891.23 | bwd_inner_microstep: 1886.37 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 23:16:11,437] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:16:11,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.42 | bwd_microstep: 1899.15 | bwd_inner_microstep: 1893.83 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.65
+[2024-06-18 23:16:11,438] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3928.45 | bwd: 3790.41 | bwd_inner: 3780.26 | bwd_allreduce: 10.01 | step: 61.73
+ 77%|███████▋  | 538/700 [1:09:20<18:36,  6.89s/it]                                                   {'loss': 0.4416, 'learning_rate': 1.3399900289410245e-05, 'epoch': 5.38}
+ 77%|███████▋  | 538/700 [1:09:20<18:36,  6.89s/it][2024-06-18 23:16:14,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1706.03 | bwd_microstep: 1670.61 | bwd_inner_microstep: 1665.64 | bwd_allreduce_microstep: 4.90 | step_microstep: 0.14
+[2024-06-18 23:16:18,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:16:18,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1542.28 | bwd_microstep: 1909.51 | bwd_inner_microstep: 1904.15 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.67
+[2024-06-18 23:16:18,367] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3248.28 | bwd: 3580.14 | bwd_inner: 3569.82 | bwd_allreduce: 10.19 | step: 61.82
+ 77%|███████▋  | 539/700 [1:09:27<18:31,  6.90s/it]                                                   {'loss': 0.5851, 'learning_rate': 1.3242680314639993e-05, 'epoch': 5.39}
+ 77%|███████▋  | 539/700 [1:09:27<18:31,  6.90s/it][2024-06-18 23:16:22,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.36 | bwd_microstep: 1924.91 | bwd_inner_microstep: 1920.02 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.07
+[2024-06-18 23:16:26,013] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:16:26,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1896.61 | bwd_microstep: 1746.99 | bwd_inner_microstep: 1741.63 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.64
+[2024-06-18 23:16:26,014] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3871.94 | bwd: 3671.93 | bwd_inner: 3661.74 | bwd_allreduce: 10.01 | step: 61.71
+ 77%|███████▋  | 540/700 [1:09:35<19:00,  7.13s/it]                                                   {'loss': 0.1134, 'learning_rate': 1.3086247210358398e-05, 'epoch': 5.4}
+ 77%|███████▋  | 540/700 [1:09:35<19:00,  7.13s/it][2024-06-18 23:16:27,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 879.58 | bwd_microstep: 841.77 | bwd_inner_microstep: 836.80 | bwd_allreduce_microstep: 4.90 | step_microstep: 0.09
+[2024-06-18 23:16:31,705] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:16:31,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.07 | bwd_microstep: 1899.74 | bwd_inner_microstep: 1894.52 | bwd_allreduce_microstep: 5.17 | step_microstep: 61.38
+[2024-06-18 23:16:31,706] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2849.63 | bwd: 2741.53 | bwd_inner: 2731.34 | bwd_allreduce: 10.09 | step: 61.47
+ 77%|███████▋  | 541/700 [1:09:41<17:44,  6.70s/it]                                                   {'loss': 0.092, 'learning_rate': 1.2930604325356794e-05, 'epoch': 5.41}
+ 77%|███████▋  | 541/700 [1:09:41<17:44,  6.70s/it][2024-06-18 23:16:34,972] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1566.93 | bwd_microstep: 1677.37 | bwd_inner_microstep: 1672.43 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.07
+[2024-06-18 23:16:38,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:16:38,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1629.84 | bwd_microstep: 1665.33 | bwd_inner_microstep: 1660.00 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.73
+[2024-06-18 23:16:38,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3196.74 | bwd: 3342.72 | bwd_inner: 3332.50 | bwd_allreduce: 10.03 | step: 61.81
+ 77%|███████▋  | 542/700 [1:09:47<17:35,  6.68s/it]                                                   {'loss': 0.478, 'learning_rate': 1.277575499151013e-05, 'epoch': 5.42}
+ 77%|███████▋  | 542/700 [1:09:47<17:35,  6.68s/it][2024-06-18 23:16:41,897] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1756.48 | bwd_microstep: 1771.65 | bwd_inner_microstep: 1766.75 | bwd_allreduce_microstep: 4.84 | step_microstep: 0.08
+[2024-06-18 23:16:45,938] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:16:45,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2000.26 | bwd_microstep: 1961.96 | bwd_inner_microstep: 1956.64 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.80
+[2024-06-18 23:16:45,939] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3756.72 | bwd: 3733.64 | bwd_inner: 3723.42 | bwd_allreduce: 10.08 | step: 61.89
+ 78%|███████▊  | 543/700 [1:09:55<18:11,  6.95s/it]                                                   {'loss': 0.3503, 'learning_rate': 1.2621702523705676e-05, 'epoch': 5.43}
+ 78%|███████▊  | 543/700 [1:09:55<18:11,  6.95s/it][2024-06-18 23:16:49,847] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.91 | bwd_microstep: 1912.98 | bwd_inner_microstep: 1908.03 | bwd_allreduce_microstep: 4.89 | step_microstep: 0.07
+[2024-06-18 23:16:53,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.92
+[2024-06-18 23:16:53,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1917.81 | bwd_microstep: 1810.27 | bwd_inner_microstep: 1804.57 | bwd_allreduce_microstep: 5.61 | step_microstep: 64.77
+[2024-06-18 23:16:53,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3890.69 | bwd: 3723.27 | bwd_inner: 3712.62 | bwd_allreduce: 10.50 | step: 64.85
+ 78%|███████▊  | 544/700 [1:10:03<18:40,  7.18s/it]                                                   {'loss': 0.1728, 'learning_rate': 1.2468450219772054e-05, 'epoch': 5.44}
+ 78%|███████▊  | 544/700 [1:10:03<18:40,  7.18s/it][2024-06-18 23:16:57,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1893.15 | bwd_microstep: 1739.59 | bwd_inner_microstep: 1734.83 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 23:17:01,116] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:17:01,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1915.63 | bwd_microstep: 1808.72 | bwd_inner_microstep: 1803.23 | bwd_allreduce_microstep: 5.35 | step_microstep: 61.73
+[2024-06-18 23:17:01,117] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3808.75 | bwd: 3548.33 | bwd_inner: 3538.11 | bwd_allreduce: 10.06 | step: 61.81
+ 78%|███████▊  | 545/700 [1:10:10<18:46,  7.27s/it]                                                   {'loss': 0.007, 'learning_rate': 1.2316001360408614e-05, 'epoch': 5.45}
+ 78%|███████▊  | 545/700 [1:10:10<18:46,  7.27s/it][2024-06-18 23:17:04,613] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1832.97 | bwd_microstep: 1640.86 | bwd_inner_microstep: 1635.99 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.07
+[2024-06-18 23:17:08,538] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 23:17:08,539] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1960.51 | bwd_microstep: 1885.82 | bwd_inner_microstep: 1880.42 | bwd_allreduce_microstep: 5.36 | step_microstep: 61.71
+[2024-06-18 23:17:08,539] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3793.45 | bwd: 3526.70 | bwd_inner: 3516.43 | bwd_allreduce: 10.17 | step: 61.79
+ 78%|███████▊  | 546/700 [1:10:18<18:46,  7.31s/it]                                                   {'loss': 0.0939, 'learning_rate': 1.2164359209115234e-05, 'epoch': 5.46}
+ 78%|███████▊  | 546/700 [1:10:18<18:46,  7.31s/it][2024-06-18 23:17:12,278] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1913.33 | bwd_microstep: 1803.76 | bwd_inner_microstep: 1799.02 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.07
+[2024-06-18 23:17:16,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:17:16,365] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2013.81 | bwd_microstep: 1993.70 | bwd_inner_microstep: 1988.37 | bwd_allreduce_microstep: 5.27 | step_microstep: 62.07
+[2024-06-18 23:17:16,366] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3927.11 | bwd: 3797.48 | bwd_inner: 3787.39 | bwd_allreduce: 9.98 | step: 62.15
+ 78%|███████▊  | 547/700 [1:10:25<19:02,  7.47s/it]                                                   {'loss': 0.5087, 'learning_rate': 1.2013527012122478e-05, 'epoch': 5.47}
+ 78%|███████▊  | 547/700 [1:10:25<19:02,  7.47s/it][2024-06-18 23:17:20,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1889.19 | bwd_microstep: 1744.75 | bwd_inner_microstep: 1739.77 | bwd_allreduce_microstep: 4.91 | step_microstep: 0.14
+[2024-06-18 23:17:24,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:17:24,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.90 | bwd_microstep: 1924.83 | bwd_inner_microstep: 1919.43 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.70
+[2024-06-18 23:17:24,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3864.07 | bwd: 3669.61 | bwd_inner: 3659.27 | bwd_allreduce: 10.16 | step: 61.84
+ 78%|███████▊  | 548/700 [1:10:33<19:02,  7.52s/it]                                                   {'loss': 0.2571, 'learning_rate': 1.186350799832202e-05, 'epoch': 5.48}
+ 78%|███████▊  | 548/700 [1:10:33<19:02,  7.52s/it][2024-06-18 23:17:27,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.20 | bwd_microstep: 1897.61 | bwd_inner_microstep: 1892.91 | bwd_allreduce_microstep: 4.65 | step_microstep: 0.07
+[2024-06-18 23:17:31,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:17:31,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1811.07 | bwd_microstep: 1868.46 | bwd_inner_microstep: 1863.11 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.87
+[2024-06-18 23:17:31,644] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3775.24 | bwd: 3766.09 | bwd_inner: 3756.02 | bwd_allreduce: 9.97 | step: 61.95
+ 78%|███████▊  | 549/700 [1:10:41<19:00,  7.56s/it]                                                   {'loss': 0.6024, 'learning_rate': 1.1714305379197615e-05, 'epoch': 5.49}
+ 78%|███████▊  | 549/700 [1:10:41<19:00,  7.56s/it][2024-06-18 23:17:34,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1425.57 | bwd_microstep: 1665.52 | bwd_inner_microstep: 1660.63 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.09
+[2024-06-18 23:17:38,689] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:17:38,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1965.52 | bwd_microstep: 1887.69 | bwd_inner_microstep: 1882.40 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.60
+[2024-06-18 23:17:38,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3391.06 | bwd: 3553.24 | bwd_inner: 3543.05 | bwd_allreduce: 10.08 | step: 61.70
+ 79%|███████▊  | 550/700 [1:10:48<18:30,  7.40s/it]                                                   {'loss': 0.3537, 'learning_rate': 1.1565922348756325e-05, 'epoch': 5.5}
+ 79%|███████▊  | 550/700 [1:10:48<18:30,  7.40s/it][2024-06-18 23:17:42,593] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.06 | bwd_microstep: 1914.53 | bwd_inner_microstep: 1909.79 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.08
+[2024-06-18 23:17:46,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:17:46,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2005.44 | bwd_microstep: 1979.39 | bwd_inner_microstep: 1973.83 | bwd_allreduce_microstep: 5.48 | step_microstep: 62.03
+[2024-06-18 23:17:46,659] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3972.47 | bwd: 3893.94 | bwd_inner: 3883.64 | bwd_allreduce: 10.19 | step: 62.11
+ 79%|███████▊  | 551/700 [1:10:56<18:48,  7.57s/it]                                                   {'loss': 0.3272, 'learning_rate': 1.1418362083460066e-05, 'epoch': 5.51}
+ 79%|███████▊  | 551/700 [1:10:56<18:48,  7.57s/it][2024-06-18 23:17:50,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1912.28 | bwd_microstep: 1802.22 | bwd_inner_microstep: 1797.28 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.07
+[2024-06-18 23:17:54,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:17:54,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1911.90 | bwd_microstep: 1804.45 | bwd_inner_microstep: 1799.02 | bwd_allreduce_microstep: 5.29 | step_microstep: 61.89
+[2024-06-18 23:17:54,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3824.14 | bwd: 3606.69 | bwd_inner: 3596.40 | bwd_allreduce: 10.08 | step: 61.97
+ 79%|███████▉  | 552/700 [1:11:03<18:38,  7.56s/it]                                                   {'loss': 0.0008, 'learning_rate': 1.1271627742157742e-05, 'epoch': 5.52}
+ 79%|███████▉  | 552/700 [1:11:03<18:38,  7.56s/it][2024-06-18 23:17:55,968] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 890.55 | bwd_microstep: 865.72 | bwd_inner_microstep: 860.65 | bwd_allreduce_microstep: 4.92 | step_microstep: 0.09
+[2024-06-18 23:17:59,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 23:17:59,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1493.17 | bwd_microstep: 1804.61 | bwd_inner_microstep: 1799.23 | bwd_allreduce_microstep: 5.30 | step_microstep: 62.46
+[2024-06-18 23:17:59,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2383.69 | bwd: 2670.36 | bwd_inner: 2659.96 | bwd_allreduce: 10.20 | step: 62.56
+ 79%|███████▉  | 553/700 [1:11:08<16:45,  6.84s/it]                                                   {'loss': 0.3474, 'learning_rate': 1.1125722466017547e-05, 'epoch': 5.53}
+ 79%|███████▉  | 553/700 [1:11:08<16:45,  6.84s/it][2024-06-18 23:18:03,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.18 | bwd_microstep: 1895.81 | bwd_inner_microstep: 1891.05 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.07
+[2024-06-18 23:18:07,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:18:07,282] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2000.83 | bwd_microstep: 1971.99 | bwd_inner_microstep: 1966.55 | bwd_allreduce_microstep: 5.36 | step_microstep: 62.33
+[2024-06-18 23:18:07,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3963.97 | bwd: 3867.82 | bwd_inner: 3857.63 | bwd_allreduce: 10.06 | step: 62.41
+ 79%|███████▉  | 554/700 [1:11:16<17:26,  7.17s/it]                                                   {'loss': 0.4607, 'learning_rate': 1.0980649378459668e-05, 'epoch': 5.54}
+ 79%|███████▉  | 554/700 [1:11:16<17:26,  7.17s/it][2024-06-18 23:18:11,205] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.45 | bwd_microstep: 1927.47 | bwd_inner_microstep: 1922.61 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.08
+[2024-06-18 23:18:15,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:18:15,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.08 | bwd_microstep: 1924.33 | bwd_inner_microstep: 1919.03 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.60
+[2024-06-18 23:18:15,184] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3947.50 | bwd: 3851.82 | bwd_inner: 3841.66 | bwd_allreduce: 10.04 | step: 61.68
+ 79%|███████▉  | 555/700 [1:11:24<17:51,  7.39s/it]                                                   {'loss': 0.3995, 'learning_rate': 1.083641158508955e-05, 'epoch': 5.55}
+ 79%|███████▉  | 555/700 [1:11:24<17:51,  7.39s/it][2024-06-18 23:18:19,061] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.99 | bwd_microstep: 1894.02 | bwd_inner_microstep: 1889.12 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 23:18:22,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:18:22,480] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1503.65 | bwd_microstep: 1836.45 | bwd_inner_microstep: 1831.12 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.79
+[2024-06-18 23:18:22,481] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3465.61 | bwd: 3730.49 | bwd_inner: 3720.32 | bwd_allreduce: 9.99 | step: 61.87
+ 79%|███████▉  | 556/700 [1:11:31<17:39,  7.36s/it]                                                   {'loss': 0.4744, 'learning_rate': 1.0693012173631344e-05, 'epoch': 5.56}
+ 79%|███████▉  | 556/700 [1:11:31<17:39,  7.36s/it][2024-06-18 23:18:26,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1998.16 | bwd_microstep: 1963.78 | bwd_inner_microstep: 1958.98 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.14
+[2024-06-18 23:18:30,406] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:18:30,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.57 | bwd_microstep: 1894.59 | bwd_inner_microstep: 1889.24 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.98
+[2024-06-18 23:18:30,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3965.70 | bwd: 3858.39 | bwd_inner: 3848.24 | bwd_allreduce: 10.04 | step: 62.13
+ 80%|███████▉  | 557/700 [1:11:39<17:56,  7.53s/it]                                                   {'loss': 0.3983, 'learning_rate': 1.055045421386175e-05, 'epoch': 5.57}
+ 80%|███████▉  | 557/700 [1:11:39<17:56,  7.53s/it][2024-06-18 23:18:34,283] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.74 | bwd_microstep: 1891.27 | bwd_inner_microstep: 1886.50 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 23:18:38,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:18:38,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2003.63 | bwd_microstep: 1971.17 | bwd_inner_microstep: 1965.72 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.78
+[2024-06-18 23:18:38,337] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3966.34 | bwd: 3862.46 | bwd_inner: 3852.28 | bwd_allreduce: 10.02 | step: 61.86
+ 80%|███████▉  | 558/700 [1:11:47<18:06,  7.65s/it]                                                   {'loss': 0.5281, 'learning_rate': 1.0408740757544416e-05, 'epoch': 5.58}
+ 80%|███████▉  | 558/700 [1:11:47<18:06,  7.65s/it][2024-06-18 23:18:42,324] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1998.77 | bwd_microstep: 1966.59 | bwd_inner_microstep: 1961.76 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.07
+[2024-06-18 23:18:46,288] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:18:46,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.23 | bwd_microstep: 1914.78 | bwd_inner_microstep: 1909.48 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.71
+[2024-06-18 23:18:46,289] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3968.97 | bwd: 3881.39 | bwd_inner: 3871.24 | bwd_allreduce: 10.06 | step: 61.79
+ 80%|███████▉  | 559/700 [1:11:55<18:11,  7.74s/it]                                                   {'loss': 0.5738, 'learning_rate': 1.026787483836456e-05, 'epoch': 5.59}
+ 80%|███████▉  | 559/700 [1:11:55<18:11,  7.74s/it][2024-06-18 23:18:50,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1993.28 | bwd_microstep: 1957.94 | bwd_inner_microstep: 1953.01 | bwd_allreduce_microstep: 4.85 | step_microstep: 0.08
+[2024-06-18 23:18:54,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:18:54,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2004.05 | bwd_microstep: 1968.00 | bwd_inner_microstep: 1962.53 | bwd_allreduce_microstep: 5.33 | step_microstep: 61.64
+[2024-06-18 23:18:54,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3997.30 | bwd: 3925.96 | bwd_inner: 3915.61 | bwd_allreduce: 10.16 | step: 61.72
+ 80%|████████  | 560/700 [1:12:03<18:15,  7.83s/it]                                                   {'loss': 0.6385, 'learning_rate': 1.012785947186397e-05, 'epoch': 5.6}
+ 80%|████████  | 560/700 [1:12:03<18:15,  7.83s/it][2024-06-18 23:18:57,639] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1492.90 | bwd_microstep: 1809.86 | bwd_inner_microstep: 1805.03 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 23:19:01,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:19:01,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1917.08 | bwd_microstep: 1810.35 | bwd_inner_microstep: 1805.08 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.98
+[2024-06-18 23:19:01,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3409.96 | bwd: 3620.23 | bwd_inner: 3610.12 | bwd_allreduce: 10.00 | step: 62.06
+ 80%|████████  | 561/700 [1:12:10<17:38,  7.62s/it]                                                   {'loss': 0.3275, 'learning_rate': 9.988697655376544e-06, 'epoch': 5.61}
+ 80%|████████  | 561/700 [1:12:10<17:38,  7.62s/it][2024-06-18 23:19:05,099] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1890.89 | bwd_microstep: 1740.02 | bwd_inner_microstep: 1734.92 | bwd_allreduce_microstep: 4.96 | step_microstep: 0.09
+[2024-06-18 23:19:09,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 23:19:09,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1999.67 | bwd_microstep: 1975.03 | bwd_inner_microstep: 1969.73 | bwd_allreduce_microstep: 5.26 | step_microstep: 62.51
+[2024-06-18 23:19:09,155] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3890.52 | bwd: 3715.08 | bwd_inner: 3704.70 | bwd_allreduce: 10.20 | step: 62.60
+ 80%|████████  | 562/700 [1:12:18<17:35,  7.64s/it]                                                   {'loss': 0.0976, 'learning_rate': 9.850392367964085e-06, 'epoch': 5.62}
+ 80%|████████  | 562/700 [1:12:18<17:35,  7.64s/it][2024-06-18 23:19:12,726] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1856.72 | bwd_microstep: 1693.45 | bwd_inner_microstep: 1688.74 | bwd_allreduce_microstep: 4.67 | step_microstep: 0.07
+[2024-06-18 23:19:15,969] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:19:15,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1444.81 | bwd_microstep: 1719.05 | bwd_inner_microstep: 1713.72 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.88
+[2024-06-18 23:19:15,970] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3301.50 | bwd: 3412.52 | bwd_inner: 3402.47 | bwd_allreduce: 9.94 | step: 61.96
+ 80%|████████  | 563/700 [1:12:25<16:53,  7.40s/it]                                                   {'loss': 0.0283, 'learning_rate': 9.712946570352472e-06, 'epoch': 5.63}
+ 80%|████████  | 563/700 [1:12:25<16:53,  7.40s/it][2024-06-18 23:19:19,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1913.13 | bwd_microstep: 1807.76 | bwd_inner_microstep: 1802.94 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.08
+[2024-06-18 23:19:23,707] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:19:23,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1980.24 | bwd_microstep: 1934.01 | bwd_inner_microstep: 1928.71 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.57
+[2024-06-18 23:19:23,708] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3893.33 | bwd: 3741.79 | bwd_inner: 3731.69 | bwd_allreduce: 9.97 | step: 61.66
+ 81%|████████  | 564/700 [1:12:33<16:59,  7.50s/it]                                                   {'loss': 0.138, 'learning_rate': 9.576363204868416e-06, 'epoch': 5.64}
+ 81%|████████  | 564/700 [1:12:33<16:59,  7.50s/it][2024-06-18 23:19:27,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1888.34 | bwd_microstep: 1740.75 | bwd_inner_microstep: 1735.93 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 23:19:31,313] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 23:19:31,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.00 | bwd_microstep: 1905.22 | bwd_inner_microstep: 1899.72 | bwd_allreduce_microstep: 5.43 | step_microstep: 62.15
+[2024-06-18 23:19:31,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3858.31 | bwd: 3645.99 | bwd_inner: 3635.71 | bwd_allreduce: 10.14 | step: 62.23
+ 81%|████████  | 565/700 [1:12:40<16:56,  7.53s/it]                                                   {'loss': 0.2264, 'learning_rate': 9.44064519537634e-06, 'epoch': 5.65}
+ 81%|████████  | 565/700 [1:12:40<16:56,  7.53s/it][2024-06-18 23:19:35,226] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.20 | bwd_microstep: 1918.90 | bwd_inner_microstep: 1914.14 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.08
+[2024-06-18 23:19:39,202] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:19:39,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.16 | bwd_microstep: 1915.64 | bwd_inner_microstep: 1910.10 | bwd_allreduce_microstep: 5.40 | step_microstep: 71.04
+[2024-06-18 23:19:39,203] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3943.33 | bwd: 3834.56 | bwd_inner: 3824.29 | bwd_allreduce: 10.11 | step: 71.12
+ 81%|████████  | 566/700 [1:12:48<17:03,  7.64s/it]                                                   {'loss': 0.3201, 'learning_rate': 9.305795447215827e-06, 'epoch': 5.66}
+ 81%|████████  | 566/700 [1:12:48<17:03,  7.64s/it][2024-06-18 23:19:42,588] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1701.99 | bwd_microstep: 1661.30 | bwd_inner_microstep: 1656.45 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.07
+[2024-06-18 23:19:46,397] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:19:46,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1917.15 | bwd_microstep: 1812.88 | bwd_inner_microstep: 1807.43 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.89
+[2024-06-18 23:19:46,398] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3619.11 | bwd: 3474.20 | bwd_inner: 3463.98 | bwd_allreduce: 10.04 | step: 61.97
+ 81%|████████  | 567/700 [1:12:55<16:38,  7.51s/it]                                                   {'loss': 0.0013, 'learning_rate': 9.171816847139448e-06, 'epoch': 5.67}
+ 81%|████████  | 567/700 [1:12:55<16:38,  7.51s/it][2024-06-18 23:19:50,260] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1959.07 | bwd_microstep: 1881.62 | bwd_inner_microstep: 1876.77 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.08
+[2024-06-18 23:19:54,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:19:54,220] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.90 | bwd_microstep: 1909.28 | bwd_inner_microstep: 1903.94 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.84
+[2024-06-18 23:19:54,221] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3928.93 | bwd: 3790.92 | bwd_inner: 3780.74 | bwd_allreduce: 10.04 | step: 61.92
+ 81%|████████  | 568/700 [1:13:03<16:43,  7.60s/it]                                                   {'loss': 0.4852, 'learning_rate': 9.03871226325098e-06, 'epoch': 5.68}
+ 81%|████████  | 568/700 [1:13:03<16:43,  7.60s/it][2024-06-18 23:19:55,301] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 517.17 | bwd_microstep: 543.05 | bwd_inner_microstep: 538.29 | bwd_allreduce_microstep: 4.69 | step_microstep: 0.07
+[2024-06-18 23:19:59,345] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:19:59,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.04 | bwd_microstep: 1969.97 | bwd_inner_microstep: 1964.66 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.90
+[2024-06-18 23:19:59,346] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2512.18 | bwd: 2513.04 | bwd_inner: 2502.97 | bwd_allreduce: 9.94 | step: 61.98
+ 81%|████████▏ | 569/700 [1:13:08<14:58,  6.86s/it]                                                   {'loss': 0.629, 'learning_rate': 8.906484544943932e-06, 'epoch': 5.69}
+ 81%|████████▏ | 569/700 [1:13:08<14:58,  6.86s/it][2024-06-18 23:20:03,090] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1915.21 | bwd_microstep: 1806.71 | bwd_inner_microstep: 1801.91 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.08
+[2024-06-18 23:20:06,893] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:20:06,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1915.60 | bwd_microstep: 1808.64 | bwd_inner_microstep: 1803.34 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.70
+[2024-06-18 23:20:06,894] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3830.78 | bwd: 3615.37 | bwd_inner: 3605.26 | bwd_allreduce: 9.99 | step: 61.79
+ 81%|████████▏ | 570/700 [1:13:16<15:18,  7.06s/it]                                                   {'loss': 0.0243, 'learning_rate': 8.775136522840621e-06, 'epoch': 5.7}
+ 81%|████████▏ | 570/700 [1:13:16<15:18,  7.06s/it][2024-06-18 23:20:10,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2009.19 | bwd_microstep: 1990.36 | bwd_inner_microstep: 1985.32 | bwd_allreduce_microstep: 4.95 | step_microstep: 0.09
+[2024-06-18 23:20:14,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:20:14,784] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1940.31 | bwd_microstep: 1846.97 | bwd_inner_microstep: 1841.65 | bwd_allreduce_microstep: 5.27 | step_microstep: 62.23
+[2024-06-18 23:20:14,785] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3949.47 | bwd: 3837.35 | bwd_inner: 3827.00 | bwd_allreduce: 10.23 | step: 62.33
+ 82%|████████▏ | 571/700 [1:13:24<15:43,  7.31s/it]                                                   {'loss': 0.6967, 'learning_rate': 8.644671008731569e-06, 'epoch': 5.71}
+ 82%|████████▏ | 571/700 [1:13:24<15:43,  7.31s/it][2024-06-18 23:20:18,108] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1494.60 | bwd_microstep: 1807.54 | bwd_inner_microstep: 1802.63 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 23:20:21,904] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 2.05
+[2024-06-18 23:20:21,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1819.23 | bwd_microstep: 1894.89 | bwd_inner_microstep: 1889.33 | bwd_allreduce_microstep: 5.48 | step_microstep: 64.37
+[2024-06-18 23:20:21,905] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3313.80 | bwd: 3702.45 | bwd_inner: 3692.03 | bwd_allreduce: 10.22 | step: 64.45
+ 82%|████████▏ | 572/700 [1:13:31<15:28,  7.26s/it]                                                   {'loss': 0.4464, 'learning_rate': 8.515090795515247e-06, 'epoch': 5.72}
+ 82%|████████▏ | 572/700 [1:13:31<15:28,  7.26s/it][2024-06-18 23:20:25,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1827.02 | bwd_microstep: 1640.07 | bwd_inner_microstep: 1635.18 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.07
+[2024-06-18 23:20:29,334] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:20:29,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1965.15 | bwd_microstep: 1895.04 | bwd_inner_microstep: 1889.76 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.60
+[2024-06-18 23:20:29,335] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3792.14 | bwd: 3535.13 | bwd_inner: 3524.96 | bwd_allreduce: 10.07 | step: 61.68
+ 82%|████████▏ | 573/700 [1:13:38<15:28,  7.31s/it]                                                   {'loss': 0.1574, 'learning_rate': 8.386398657138356e-06, 'epoch': 5.73}
+ 82%|████████▏ | 573/700 [1:13:38<15:28,  7.31s/it][2024-06-18 23:20:33,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1911.31 | bwd_microstep: 1801.71 | bwd_inner_microstep: 1796.81 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.07
+[2024-06-18 23:20:37,069] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 23:20:37,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1985.55 | bwd_microstep: 1934.11 | bwd_inner_microstep: 1928.59 | bwd_allreduce_microstep: 5.45 | step_microstep: 62.48
+[2024-06-18 23:20:37,070] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3896.83 | bwd: 3735.84 | bwd_inner: 3725.46 | bwd_allreduce: 10.23 | step: 62.56
+ 82%|████████▏ | 574/700 [1:13:46<15:36,  7.44s/it]                                                   {'loss': 0.2407, 'learning_rate': 8.25859734853645e-06, 'epoch': 5.74}
+ 82%|████████▏ | 574/700 [1:13:46<15:36,  7.44s/it][2024-06-18 23:20:40,989] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.05 | bwd_microstep: 1923.66 | bwd_inner_microstep: 1918.69 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.07
+[2024-06-18 23:20:44,974] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:20:44,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1977.76 | bwd_microstep: 1928.46 | bwd_inner_microstep: 1923.16 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.50
+[2024-06-18 23:20:44,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3950.78 | bwd: 3852.14 | bwd_inner: 3841.90 | bwd_allreduce: 10.08 | step: 61.59
+ 82%|████████▏ | 575/700 [1:13:54<15:47,  7.58s/it]                                                   {'loss': 0.3508, 'learning_rate': 8.131689605574866e-06, 'epoch': 5.75}
+ 82%|████████▏ | 575/700 [1:13:54<15:47,  7.58s/it][2024-06-18 23:20:48,549] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1858.60 | bwd_microstep: 1693.85 | bwd_inner_microstep: 1688.95 | bwd_allreduce_microstep: 4.84 | step_microstep: 0.09
+[2024-06-18 23:20:52,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:20:52,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.17 | bwd_microstep: 1916.66 | bwd_inner_microstep: 1911.37 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.64
+[2024-06-18 23:20:52,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3830.74 | bwd: 3610.53 | bwd_inner: 3600.33 | bwd_allreduce: 10.06 | step: 61.73
+ 82%|████████▏ | 576/700 [1:14:02<15:38,  7.57s/it]                                                   {'loss': 0.279, 'learning_rate': 8.00567814499028e-06, 'epoch': 5.76}
+ 82%|████████▏ | 576/700 [1:14:02<15:38,  7.57s/it][2024-06-18 23:20:56,032] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1652.74 | bwd_microstep: 1840.42 | bwd_inner_microstep: 1835.59 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.08
+[2024-06-18 23:21:00,028] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:21:00,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1981.47 | bwd_microstep: 1935.24 | bwd_inner_microstep: 1930.04 | bwd_allreduce_microstep: 5.15 | step_microstep: 61.50
+[2024-06-18 23:21:00,029] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3634.18 | bwd: 3775.68 | bwd_inner: 3765.65 | bwd_allreduce: 9.92 | step: 61.58
+ 82%|████████▏ | 577/700 [1:14:09<15:28,  7.55s/it]                                                   {'loss': 0.6613, 'learning_rate': 7.880565664332474e-06, 'epoch': 5.77}
+ 82%|████████▏ | 577/700 [1:14:09<15:28,  7.55s/it][2024-06-18 23:21:03,948] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1974.16 | bwd_microstep: 1923.31 | bwd_inner_microstep: 1918.48 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 23:21:07,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:21:07,747] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1913.32 | bwd_microstep: 1806.04 | bwd_inner_microstep: 1800.77 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.63
+[2024-06-18 23:21:07,748] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3887.45 | bwd: 3729.38 | bwd_inner: 3719.27 | bwd_allreduce: 10.00 | step: 61.71
+ 83%|████████▎ | 578/700 [1:14:17<15:27,  7.60s/it]                                                   {'loss': 0.276, 'learning_rate': 7.756354841906582e-06, 'epoch': 5.78}
+ 83%|████████▎ | 578/700 [1:14:17<15:27,  7.60s/it][2024-06-18 23:21:11,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.60 | bwd_microstep: 1918.37 | bwd_inner_microstep: 1913.55 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.08
+[2024-06-18 23:21:15,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:21:15,609] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.83 | bwd_microstep: 1901.47 | bwd_inner_microstep: 1895.81 | bwd_allreduce_microstep: 5.56 | step_microstep: 63.90
+[2024-06-18 23:21:15,610] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3938.40 | bwd: 3819.86 | bwd_inner: 3809.38 | bwd_allreduce: 10.36 | step: 63.99
+ 83%|████████▎ | 579/700 [1:14:25<15:29,  7.68s/it]                                                   {'loss': 0.4276, 'learning_rate': 7.633048336715814e-06, 'epoch': 5.79}
+ 83%|████████▎ | 579/700 [1:14:25<15:29,  7.68s/it][2024-06-18 23:21:19,477] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1958.13 | bwd_microstep: 1887.30 | bwd_inner_microstep: 1882.38 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.07
+[2024-06-18 23:21:22,567] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:21:22,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1395.55 | bwd_microstep: 1615.18 | bwd_inner_microstep: 1609.84 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.75
+[2024-06-18 23:21:22,568] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3353.65 | bwd: 3502.50 | bwd_inner: 3492.29 | bwd_allreduce: 10.02 | step: 61.83
+ 83%|████████▎ | 580/700 [1:14:32<14:55,  7.46s/it]                                                   {'loss': 0.7235, 'learning_rate': 7.510648788404501e-06, 'epoch': 5.8}
+ 83%|████████▎ | 580/700 [1:14:32<14:55,  7.46s/it][2024-06-18 23:21:26,453] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.09 | bwd_microstep: 1894.81 | bwd_inner_microstep: 1889.95 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.08
+[2024-06-18 23:21:29,942] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:21:29,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1527.18 | bwd_microstep: 1882.67 | bwd_inner_microstep: 1877.34 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.72
+[2024-06-18 23:21:29,943] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3495.24 | bwd: 3777.50 | bwd_inner: 3767.32 | bwd_allreduce: 10.05 | step: 61.81
+ 83%|████████▎ | 581/700 [1:14:39<14:44,  7.44s/it]                                                   {'loss': 0.5805, 'learning_rate': 7.389158817201542e-06, 'epoch': 5.81}
+ 83%|████████▎ | 581/700 [1:14:39<14:44,  7.44s/it][2024-06-18 23:21:33,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1914.32 | bwd_microstep: 1808.73 | bwd_inner_microstep: 1803.76 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.07
+[2024-06-18 23:21:37,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 23:21:37,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1893.20 | bwd_microstep: 1748.80 | bwd_inner_microstep: 1743.34 | bwd_allreduce_microstep: 5.41 | step_microstep: 62.09
+[2024-06-18 23:21:37,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3807.49 | bwd: 3557.55 | bwd_inner: 3547.16 | bwd_allreduce: 10.23 | step: 62.17
+ 83%|████████▎ | 582/700 [1:14:46<14:38,  7.45s/it]                                                   {'loss': 0.0223, 'learning_rate': 7.268581023864402e-06, 'epoch': 5.82}
+ 83%|████████▎ | 582/700 [1:14:46<14:38,  7.45s/it][2024-06-18 23:21:41,399] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1997.34 | bwd_microstep: 1969.53 | bwd_inner_microstep: 1964.79 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.08
+[2024-06-18 23:21:45,445] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:21:45,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2001.89 | bwd_microstep: 1965.82 | bwd_inner_microstep: 1960.40 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.62
+[2024-06-18 23:21:45,446] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3999.20 | bwd: 3935.38 | bwd_inner: 3925.24 | bwd_allreduce: 10.03 | step: 61.71
+ 83%|████████▎ | 583/700 [1:14:54<14:51,  7.62s/it]                                                   {'loss': 0.2231, 'learning_rate': 7.1489179896233885e-06, 'epoch': 5.83}
+ 83%|████���███▎ | 583/700 [1:14:54<14:51,  7.62s/it][2024-06-18 23:21:49,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1858.50 | bwd_microstep: 1694.31 | bwd_inner_microstep: 1689.55 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.09
+[2024-06-18 23:21:53,020] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:21:53,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.91 | bwd_microstep: 1939.68 | bwd_inner_microstep: 1934.39 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.79
+[2024-06-18 23:21:53,021] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3838.39 | bwd: 3634.01 | bwd_inner: 3623.94 | bwd_allreduce: 9.98 | step: 61.89
+ 83%|████████▎ | 584/700 [1:15:02<14:42,  7.61s/it]                                                   {'loss': 0.1097, 'learning_rate': 7.030172276126351e-06, 'epoch': 5.84}
+ 83%|████████▎ | 584/700 [1:15:02<14:42,  7.61s/it][2024-06-18 23:21:56,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1909.70 | bwd_microstep: 1803.12 | bwd_inner_microstep: 1798.31 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.08
+[2024-06-18 23:22:00,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:22:00,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1446.51 | bwd_microstep: 1726.85 | bwd_inner_microstep: 1721.47 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.65
+[2024-06-18 23:22:00,009] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3356.19 | bwd: 3530.00 | bwd_inner: 3519.83 | bwd_allreduce: 10.04 | step: 61.74
+ 84%|████████▎ | 585/700 [1:15:09<14:13,  7.42s/it]                                                   {'loss': 0.1279, 'learning_rate': 6.912346425383964e-06, 'epoch': 5.85}
+ 84%|████████▎ | 585/700 [1:15:09<14:13,  7.42s/it][2024-06-18 23:22:03,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.95 | bwd_microstep: 1894.83 | bwd_inner_microstep: 1890.09 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 23:22:07,888] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:22:07,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.88 | bwd_microstep: 1939.31 | bwd_inner_microstep: 1933.88 | bwd_allreduce_microstep: 5.38 | step_microstep: 62.88
+[2024-06-18 23:22:07,889] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3942.80 | bwd: 3834.17 | bwd_inner: 3823.97 | bwd_allreduce: 10.10 | step: 62.96
+ 84%|████████▎ | 586/700 [1:15:17<14:21,  7.56s/it]                                                   {'loss': 0.64, 'learning_rate': 6.795442959715192e-06, 'epoch': 5.86}
+ 84%|████████▎ | 586/700 [1:15:17<14:21,  7.56s/it][2024-06-18 23:22:11,248] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1501.56 | bwd_microstep: 1835.76 | bwd_inner_microstep: 1830.75 | bwd_allreduce_microstep: 4.87 | step_microstep: 0.08
+[2024-06-18 23:22:15,326] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:22:15,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2007.50 | bwd_microstep: 1991.44 | bwd_inner_microstep: 1986.15 | bwd_allreduce_microstep: 5.22 | step_microstep: 61.54
+[2024-06-18 23:22:15,327] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3509.03 | bwd: 3827.22 | bwd_inner: 3816.97 | bwd_allreduce: 10.07 | step: 61.62
+ 84%|████████▍ | 587/700 [1:15:24<14:10,  7.52s/it]                                                   {'loss': 0.5757, 'learning_rate': 6.679464381693323e-06, 'epoch': 5.87}
+ 84%|████████▍ | 587/700 [1:15:24<14:10,  7.52s/it][2024-06-18 23:22:18,979] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1889.80 | bwd_microstep: 1740.70 | bwd_inner_microstep: 1735.73 | bwd_allreduce_microstep: 4.89 | step_microstep: 0.09
+[2024-06-18 23:22:22,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:22:22,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.74 | bwd_microstep: 1899.41 | bwd_inner_microstep: 1893.97 | bwd_allreduce_microstep: 5.28 | step_microstep: 62.13
+[2024-06-18 23:22:22,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3859.51 | bwd: 3640.14 | bwd_inner: 3629.78 | bwd_allreduce: 10.17 | step: 62.22
+ 84%|████████▍ | 588/700 [1:15:32<14:05,  7.55s/it]                                                   {'loss': 0.2662, 'learning_rate': 6.564413174092443e-06, 'epoch': 5.88}
+ 84%|████████▍ | 588/700 [1:15:32<14:05,  7.55s/it][2024-06-18 23:22:26,909] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1998.42 | bwd_microstep: 1959.23 | bwd_inner_microstep: 1954.30 | bwd_allreduce_microstep: 4.85 | step_microstep: 0.08
+[2024-06-18 23:22:30,890] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.91
+[2024-06-18 23:22:30,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.23 | bwd_microstep: 1926.61 | bwd_inner_microstep: 1921.23 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.84
+[2024-06-18 23:22:30,891] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3973.61 | bwd: 3885.86 | bwd_inner: 3875.57 | bwd_allreduce: 10.16 | step: 61.92
+ 84%|████████▍ | 589/700 [1:15:40<14:11,  7.67s/it]                                                   {'loss': 0.4304, 'learning_rate': 6.4502917998342575e-06, 'epoch': 5.89}
+ 84%|████████▍ | 589/700 [1:15:40<14:11,  7.67s/it][2024-06-18 23:22:34,878] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1998.43 | bwd_microstep: 1966.37 | bwd_inner_microstep: 1961.60 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 23:22:38,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:22:38,953] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2014.17 | bwd_microstep: 1980.89 | bwd_inner_microstep: 1975.42 | bwd_allreduce_microstep: 5.33 | step_microstep: 61.70
+[2024-06-18 23:22:38,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 4012.57 | bwd: 3947.29 | bwd_inner: 3937.07 | bwd_allreduce: 10.05 | step: 61.78
+ 84%|████████▍ | 590/700 [1:15:48<14:16,  7.79s/it]                                                   {'loss': 0.6495, 'learning_rate': 6.337102701935321e-06, 'epoch': 5.9}
+ 84%|████████▍ | 590/700 [1:15:48<14:16,  7.79s/it][2024-06-18 23:22:42,828] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.68 | bwd_microstep: 1890.03 | bwd_inner_microstep: 1882.87 | bwd_allreduce_microstep: 7.01 | step_microstep: 0.07
+[2024-06-18 23:22:46,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:22:46,885] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2004.52 | bwd_microstep: 1972.56 | bwd_inner_microstep: 1966.99 | bwd_allreduce_microstep: 5.43 | step_microstep: 61.85
+[2024-06-18 23:22:46,886] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3967.17 | bwd: 3862.61 | bwd_inner: 3849.97 | bwd_allreduce: 12.40 | step: 61.93
+ 84%|████████▍ | 591/700 [1:15:56<14:13,  7.83s/it]                                                   {'loss': 0.5168, 'learning_rate': 6.2248483034548276e-06, 'epoch': 5.91}
+ 84%|████████▍ | 591/700 [1:15:56<14:13,  7.83s/it][2024-06-18 23:22:50,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.75 | bwd_microstep: 1895.10 | bwd_inner_microstep: 1890.19 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.08
+[2024-06-18 23:22:54,713] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:22:54,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.66 | bwd_microstep: 1901.17 | bwd_inner_microstep: 1895.84 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.61
+[2024-06-18 23:22:54,714] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3930.38 | bwd: 3796.29 | bwd_inner: 3786.09 | bwd_allreduce: 10.01 | step: 61.70
+ 85%|████████▍ | 592/700 [1:16:04<14:05,  7.83s/it]                                                   {'loss': 0.2505, 'learning_rate': 6.11353100744268e-06, 'epoch': 5.92}
+ 85%|████████▍ | 592/700 [1:16:04<14:05,  7.83s/it][2024-06-18 23:22:57,925] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1545.48 | bwd_microstep: 1644.17 | bwd_inner_microstep: 1639.34 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.09
+[2024-06-18 23:23:01,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:23:01,533] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1660.97 | bwd_microstep: 1867.83 | bwd_inner_microstep: 1862.49 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.71
+[2024-06-18 23:23:01,534] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3206.42 | bwd: 3512.02 | bwd_inner: 3501.86 | bwd_allreduce: 10.02 | step: 61.81
+ 85%|████████▍ | 593/700 [1:16:11<13:25,  7.53s/it]                                                   {'loss': 0.3987, 'learning_rate': 6.003153196888045e-06, 'epoch': 5.93}
+ 85%|████████▍ | 593/700 [1:16:11<13:25,  7.53s/it][2024-06-18 23:23:05,407] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.58 | bwd_microstep: 1889.78 | bwd_inner_microstep: 1884.98 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.08
+[2024-06-18 23:23:09,489] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:23:09,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2011.10 | bwd_microstep: 1990.79 | bwd_inner_microstep: 1985.38 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.81
+[2024-06-18 23:23:09,490] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3973.64 | bwd: 3880.59 | bwd_inner: 3870.44 | bwd_allreduce: 9.97 | step: 61.89
+ 85%|████████▍ | 594/700 [1:16:18<13:31,  7.66s/it]                                                   {'loss': 0.556, 'learning_rate': 5.893717234668383e-06, 'epoch': 5.94}
+ 85%|████████▍ | 594/700 [1:16:18<13:31,  7.66s/it][2024-06-18 23:23:13,065] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1857.51 | bwd_microstep: 1695.63 | bwd_inner_microstep: 1690.84 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 23:23:17,047] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:23:17,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1982.69 | bwd_microstep: 1919.28 | bwd_inner_microstep: 1914.01 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.85
+[2024-06-18 23:23:17,048] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3840.17 | bwd: 3614.93 | bwd_inner: 3604.86 | bwd_allreduce: 9.96 | step: 61.93
+ 85%|████████▌ | 595/700 [1:16:26<13:20,  7.63s/it]                                                   {'loss': 0.3442, 'learning_rate': 5.785225463498828e-06, 'epoch': 5.95}
+ 85%|████████▌ | 595/700 [1:16:26<13:20,  7.63s/it][2024-06-18 23:23:20,954] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.88 | bwd_microstep: 1911.76 | bwd_inner_microstep: 1906.82 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.07
+[2024-06-18 23:23:24,409] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 23:23:24,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1662.92 | bwd_microstep: 1712.16 | bwd_inner_microstep: 1706.49 | bwd_allreduce_microstep: 5.53 | step_microstep: 62.72
+[2024-06-18 23:23:24,410] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3635.77 | bwd: 3623.95 | bwd_inner: 3613.42 | bwd_allreduce: 10.28 | step: 62.80
+ 85%|████████▌ | 596/700 [1:16:33<13:04,  7.55s/it]                                                   {'loss': 0.5206, 'learning_rate': 5.67768020588203e-06, 'epoch': 5.96}
+ 85%|████████▌ | 596/700 [1:16:33<13:04,  7.55s/it][2024-06-18 23:23:27,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1860.28 | bwd_microstep: 1693.27 | bwd_inner_microstep: 1688.51 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 23:23:31,792] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:23:31,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1916.01 | bwd_microstep: 1811.73 | bwd_inner_microstep: 1806.44 | bwd_allreduce_microstep: 5.21 | step_microstep: 61.57
+[2024-06-18 23:23:31,793] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3776.26 | bwd: 3505.02 | bwd_inner: 3494.97 | bwd_allreduce: 9.94 | step: 61.65
+ 85%|████████▌ | 597/700 [1:16:41<12:52,  7.50s/it]                                                   {'loss': 0.0007, 'learning_rate': 5.571083764058482e-06, 'epoch': 5.97}
+ 85%|████████▌ | 597/700 [1:16:41<12:52,  7.50s/it][2024-06-18 23:23:35,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1523.92 | bwd_microstep: 1871.95 | bwd_inner_microstep: 1867.15 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 23:23:39,180] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 23:23:39,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.36 | bwd_microstep: 1918.96 | bwd_inner_microstep: 1913.54 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.98
+[2024-06-18 23:23:39,181] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3496.25 | bwd: 3790.94 | bwd_inner: 3780.76 | bwd_allreduce: 10.05 | step: 62.06
+ 85%|████████▌ | 598/700 [1:16:48<12:41,  7.47s/it]                                                   {'loss': 0.4368, 'learning_rate': 5.465438419957208e-06, 'epoch': 5.98}
+ 85%|████████▌ | 598/700 [1:16:48<12:41,  7.47s/it][2024-06-18 23:23:42,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1892.02 | bwd_microstep: 1740.97 | bwd_inner_microstep: 1736.05 | bwd_allreduce_microstep: 4.85 | step_microstep: 0.08
+[2024-06-18 23:23:46,819] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:23:46,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.29 | bwd_microstep: 1925.32 | bwd_inner_microstep: 1919.85 | bwd_allreduce_microstep: 5.33 | step_microstep: 61.75
+[2024-06-18 23:23:46,820] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3871.28 | bwd: 3666.32 | bwd_inner: 3655.97 | bwd_allreduce: 10.16 | step: 61.84
+ 86%|████████▌ | 599/700 [1:16:56<12:39,  7.52s/it]                                                   {'loss': 0.0726, 'learning_rate': 5.360746435146885e-06, 'epoch': 5.99}
+ 86%|████████▌ | 599/700 [1:16:56<12:39,  7.52s/it][2024-06-18 23:23:49,929] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1423.72 | bwd_microstep: 1664.21 | bwd_inner_microstep: 1659.21 | bwd_allreduce_microstep: 4.92 | step_microstep: 0.14
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+[2024-06-18 23:23:54,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:23:54,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1959.24 | bwd_microstep: 1896.20 | bwd_inner_microstep: 1890.76 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.71
+[2024-06-18 23:23:54,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3382.89 | bwd: 3560.43 | bwd_inner: 3550.05 | bwd_allreduce: 10.18 | step: 61.86
+ 86%|████████▌ | 600/700 [1:17:04<12:41,  7.61s/it]                                                   {'loss': 0.5461, 'learning_rate': 5.257010050787486e-06, 'epoch': 6.0}
+ 86%|████████▌ | 600/700 [1:17:04<12:41,  7.61s/it][2024-06-18 23:23:57,352] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 23:24:03,213] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 23:24:08,985] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 23:24:14,803] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)
+[2024-06-18 23:24:22,189] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.52 | bwd_microstep: 1905.33 | bwd_inner_microstep: 1900.48 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.08
+[2024-06-18 23:24:25,600] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:24:25,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1495.50 | bwd_microstep: 1836.78 | bwd_inner_microstep: 1831.43 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.77
+[2024-06-18 23:24:25,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3459.97 | bwd: 3742.13 | bwd_inner: 3731.99 | bwd_allreduce: 9.97 | step: 61.85
+ 86%|████████▌ | 601/700 [1:17:35<24:06, 14.61s/it]                                                   {'loss': 0.434, 'learning_rate': 5.154231487582273e-06, 'epoch': 6.01}
+ 86%|████████▌ | 601/700 [1:17:35<24:06, 14.61s/it][2024-06-18 23:24:29,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1902.79 | bwd_microstep: 1806.37 | bwd_inner_microstep: 1801.49 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.09
+[2024-06-18 23:24:33,273] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:24:33,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1960.30 | bwd_microstep: 1901.28 | bwd_inner_microstep: 1895.96 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.60
+[2024-06-18 23:24:33,274] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3863.06 | bwd: 3707.67 | bwd_inner: 3697.47 | bwd_allreduce: 10.08 | step: 61.69
+ 86%|████████▌ | 602/700 [1:17:42<20:27, 12.53s/it]                                                   {'loss': 0.09, 'learning_rate': 5.05241294573024e-06, 'epoch': 6.02}
+ 86%|████████▌ | 602/700 [1:17:42<20:27, 12.53s/it][2024-06-18 23:24:36,915] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1881.89 | bwd_microstep: 1738.16 | bwd_inner_microstep: 1733.35 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.08
+[2024-06-18 23:24:40,975] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:24:40,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1999.81 | bwd_microstep: 1980.37 | bwd_inner_microstep: 1974.87 | bwd_allreduce_microstep: 5.42 | step_microstep: 61.75
+[2024-06-18 23:24:40,976] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3881.67 | bwd: 3718.56 | bwd_inner: 3708.26 | bwd_allreduce: 10.17 | step: 61.83
+ 86%|████████▌ | 603/700 [1:17:50<17:54, 11.08s/it]                                                   {'loss': 0.1361, 'learning_rate': 4.951556604879048e-06, 'epoch': 6.03}
+ 86%|████████▌ | 603/700 [1:17:50<17:54, 11.08s/it][2024-06-18 23:24:44,715] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1909.38 | bwd_microstep: 1807.24 | bwd_inner_microstep: 1802.26 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.08
+[2024-06-18 23:24:48,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:24:48,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.73 | bwd_microstep: 1921.41 | bwd_inner_microstep: 1916.09 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.58
+[2024-06-18 23:24:48,684] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3877.08 | bwd: 3728.67 | bwd_inner: 3718.41 | bwd_allreduce: 10.07 | step: 61.66
+ 86%|████████▋ | 604/700 [1:17:58<16:06, 10.07s/it]                                                   {'loss': 0.204, 'learning_rate': 4.851664624078356e-06, 'epoch': 6.04}
+ 86%|████████▋ | 604/700 [1:17:58<16:06, 10.07s/it][2024-06-18 23:24:52,703] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2006.61 | bwd_microstep: 1990.32 | bwd_inner_microstep: 1985.57 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 23:24:56,699] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 23:24:56,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.20 | bwd_microstep: 1936.58 | bwd_inner_microstep: 1931.06 | bwd_allreduce_microstep: 5.45 | step_microstep: 62.66
+[2024-06-18 23:24:56,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3985.77 | bwd: 3926.92 | bwd_inner: 3916.64 | bwd_allreduce: 10.16 | step: 62.74
+ 86%|████████▋ | 605/700 [1:18:06<14:58,  9.45s/it]                                                   {'loss': 0.5965, 'learning_rate': 4.752739141733564e-06, 'epoch': 6.05}
+ 86%|████████▋ | 605/700 [1:18:06<14:58,  9.45s/it][2024-06-18 23:25:00,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1890.74 | bwd_microstep: 1746.39 | bwd_inner_microstep: 1741.46 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.07
+[2024-06-18 23:25:04,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:25:04,304] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1965.57 | bwd_microstep: 1899.70 | bwd_inner_microstep: 1894.26 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.55
+[2024-06-18 23:25:04,305] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3856.28 | bwd: 3646.11 | bwd_inner: 3635.83 | bwd_allreduce: 10.03 | step: 61.63
+ 87%|████████▋ | 606/700 [1:18:13<13:56,  8.90s/it]                                                   {'loss': 0.1991, 'learning_rate': 4.654782275560127e-06, 'epoch': 6.06}
+ 87%|████████▋ | 606/700 [1:18:13<13:56,  8.90s/it][2024-06-18 23:25:08,182] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.01 | bwd_microstep: 1893.70 | bwd_inner_microstep: 1888.82 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.08
+[2024-06-18 23:25:11,657] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:25:11,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1528.94 | bwd_microstep: 1867.70 | bwd_inner_microstep: 1862.36 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.73
+[2024-06-18 23:25:11,658] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3490.92 | bwd: 3761.42 | bwd_inner: 3751.21 | bwd_allreduce: 10.08 | step: 61.82
+ 87%|████████▋ | 607/700 [1:18:21<13:04,  8.44s/it]                                                   {'loss': 0.4655, 'learning_rate': 4.557796122538089e-06, 'epoch': 6.07}
+ 87%|████████▋ | 607/700 [1:18:21<13:04,  8.44s/it][2024-06-18 23:25:15,382] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1810.94 | bwd_microstep: 1891.63 | bwd_inner_microstep: 1886.78 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 23:25:19,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:25:19,353] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.14 | bwd_microstep: 1922.72 | bwd_inner_microstep: 1917.16 | bwd_allreduce_microstep: 5.48 | step_microstep: 62.48
+[2024-06-18 23:25:19,354] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3778.04 | bwd: 3814.37 | bwd_inner: 3804.01 | bwd_allreduce: 10.23 | step: 62.56
+ 87%|████████▋ | 608/700 [1:18:28<12:35,  8.21s/it]                                                   {'loss': 0.2066, 'learning_rate': 4.4617827588673166e-06, 'epoch': 6.08}
+ 87%|████████▋ | 608/700 [1:18:28<12:35,  8.21s/it][2024-06-18 23:25:23,351] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2000.44 | bwd_microstep: 1974.94 | bwd_inner_microstep: 1970.03 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 23:25:27,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:25:27,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1894.26 | bwd_microstep: 1740.90 | bwd_inner_microstep: 1735.59 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.80
+[2024-06-18 23:25:27,066] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3894.67 | bwd: 3715.86 | bwd_inner: 3705.70 | bwd_allreduce: 9.97 | step: 61.88
+ 87%|████████▋ | 609/700 [1:18:36<12:13,  8.06s/it]                                                   {'loss': 0.1753, 'learning_rate': 4.366744239922998e-06, 'epoch': 6.09}
+ 87%|████████▋ | 609/700 [1:18:36<12:13,  8.06s/it][2024-06-18 23:25:30,722] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1890.13 | bwd_microstep: 1743.78 | bwd_inner_microstep: 1738.95 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.08
+[2024-06-18 23:25:34,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:25:34,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.40 | bwd_microstep: 1921.14 | bwd_inner_microstep: 1915.72 | bwd_allreduce_microstep: 5.34 | step_microstep: 62.04
+[2024-06-18 23:25:34,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3862.50 | bwd: 3664.95 | bwd_inner: 3654.72 | bwd_allreduce: 10.10 | step: 62.13
+ 87%|████████▋ | 610/700 [1:18:44<11:53,  7.93s/it]                                                   {'loss': 0.1258, 'learning_rate': 4.2726826002116085e-06, 'epoch': 6.1}
+ 87%|████████▋ | 610/700 [1:18:44<11:53,  7.93s/it][2024-06-18 23:25:38,670] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1994.76 | bwd_microstep: 1958.01 | bwd_inner_microstep: 1953.24 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 23:25:42,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:25:42,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.94 | bwd_microstep: 1927.86 | bwd_inner_microstep: 1922.54 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.72
+[2024-06-18 23:25:42,656] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3971.66 | bwd: 3885.90 | bwd_inner: 3875.78 | bwd_allreduce: 10.00 | step: 61.80
+ 87%|████████▋ | 611/700 [1:18:52<11:46,  7.94s/it]                                                   {'loss': 0.247, 'learning_rate': 4.1795998533274265e-06, 'epoch': 6.11}
+ 87%|████████▋ | 611/700 [1:18:52<11:46,  7.94s/it][2024-06-18 23:25:45,762] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1421.64 | bwd_microstep: 1664.11 | bwd_inner_microstep: 1659.14 | bwd_allreduce_microstep: 4.89 | step_microstep: 0.08
+[2024-06-18 23:25:49,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:25:49,709] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.44 | bwd_microstep: 1900.82 | bwd_inner_microstep: 1895.33 | bwd_allreduce_microstep: 5.34 | step_microstep: 61.73
+[2024-06-18 23:25:49,710] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3388.06 | bwd: 3564.95 | bwd_inner: 3554.54 | bwd_allreduce: 10.22 | step: 61.81
+ 87%|████████▋ | 612/700 [1:18:59<11:15,  7.67s/it]                                                   {'loss': 0.1362, 'learning_rate': 4.0874979919094e-06, 'epoch': 6.12}
+ 87%|████████▋ | 612/700 [1:18:59<11:15,  7.67s/it][2024-06-18 23:25:53,183] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1682.66 | bwd_microstep: 1769.06 | bwd_inner_microstep: 1764.14 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.07
+[2024-06-18 23:25:57,164] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:25:57,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.36 | bwd_microstep: 1929.46 | bwd_inner_microstep: 1924.00 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.65
+[2024-06-18 23:25:57,165] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3655.99 | bwd: 3698.54 | bwd_inner: 3688.24 | bwd_allreduce: 10.05 | step: 61.73
+ 88%|████████▊ | 613/700 [1:19:06<11:01,  7.61s/it]                                                   {'loss': 0.3731, 'learning_rate': 3.996378987598487e-06, 'epoch': 6.13}
+ 88%|████████▊ | 613/700 [1:19:06<11:01,  7.61s/it][2024-06-18 23:26:00,743] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1862.26 | bwd_microstep: 1693.58 | bwd_inner_microstep: 1688.56 | bwd_allreduce_microstep: 4.93 | step_microstep: 0.09
+[2024-06-18 23:26:04,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:26:04,717] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.71 | bwd_microstep: 1917.15 | bwd_inner_microstep: 1911.85 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.88
+[2024-06-18 23:26:04,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3837.94 | bwd: 3610.75 | bwd_inner: 3600.45 | bwd_allreduce: 10.16 | step: 61.98
+ 88%|████████▊ | 614/700 [1:19:14<10:52,  7.59s/it]                                                   {'loss': 0.2558, 'learning_rate': 3.906244790995422e-06, 'epoch': 6.14}
+ 88%|████████▊ | 614/700 [1:19:14<10:52,  7.59s/it][2024-06-18 23:26:08,452] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1910.04 | bwd_microstep: 1802.86 | bwd_inner_microstep: 1797.95 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.07
+[2024-06-18 23:26:11,931] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.95
+[2024-06-18 23:26:11,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1528.77 | bwd_microstep: 1869.91 | bwd_inner_microstep: 1864.37 | bwd_allreduce_microstep: 5.41 | step_microstep: 62.92
+[2024-06-18 23:26:11,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3438.78 | bwd: 3672.79 | bwd_inner: 3662.42 | bwd_allreduce: 10.16 | step: 63.00
+ 88%|████████▊ | 615/700 [1:19:21<10:35,  7.48s/it]                                                   {'loss': 0.1931, 'learning_rate': 3.8170973316190074e-06, 'epoch': 6.15}
+ 88%|████████▊ | 615/700 [1:19:21<10:35,  7.48s/it][2024-06-18 23:26:15,557] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1879.51 | bwd_microstep: 1723.57 | bwd_inner_microstep: 1718.77 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 23:26:19,517] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:26:19,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.01 | bwd_microstep: 1910.07 | bwd_inner_microstep: 1904.65 | bwd_allreduce_microstep: 5.29 | step_microstep: 61.52
+[2024-06-18 23:26:19,518] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3850.48 | bwd: 3633.67 | bwd_inner: 3623.49 | bwd_allreduce: 9.99 | step: 61.60
+ 88%|████████▊ | 616/700 [1:19:29<10:30,  7.51s/it]                                                   {'loss': 0.2865, 'learning_rate': 3.728938517864794e-06, 'epoch': 6.16}
+ 88%|████████▊ | 616/700 [1:19:29<10:30,  7.51s/it][2024-06-18 23:26:23,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.38 | bwd_microstep: 1890.90 | bwd_inner_microstep: 1886.14 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 23:26:27,442] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.93
+[2024-06-18 23:26:27,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2001.27 | bwd_microstep: 1965.42 | bwd_inner_microstep: 1959.82 | bwd_allreduce_microstep: 5.46 | step_microstep: 64.18
+[2024-06-18 23:26:27,443] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3964.62 | bwd: 3856.34 | bwd_inner: 3846.01 | bwd_allreduce: 10.20 | step: 64.26
+ 88%|████████▊ | 617/700 [1:19:36<10:33,  7.64s/it]                                                   {'loss': 0.4237, 'learning_rate': 3.6417702369641925e-06, 'epoch': 6.17}
+ 88%|████████▊ | 617/700 [1:19:36<10:33,  7.64s/it][2024-06-18 23:26:31,109] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1895.88 | bwd_microstep: 1747.96 | bwd_inner_microstep: 1743.14 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 23:26:34,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:26:34,390] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1549.20 | bwd_microstep: 1653.34 | bwd_inner_microstep: 1648.02 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.45
+[2024-06-18 23:26:34,391] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3445.05 | bwd: 3401.33 | bwd_inner: 3391.20 | bwd_allreduce: 9.99 | step: 61.53
+ 88%|████████▊ | 618/700 [1:19:43<10:09,  7.43s/it]                                                   {'loss': 0.2245, 'learning_rate': 3.555594354944125e-06, 'epoch': 6.18}
+ 88%|████████▊ | 618/700 [1:19:43<10:09,  7.43s/it][2024-06-18 23:26:37,805] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1524.14 | bwd_microstep: 1869.21 | bwd_inner_microstep: 1864.23 | bwd_allreduce_microstep: 4.90 | step_microstep: 0.08
+[2024-06-18 23:26:41,875] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:26:41,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2004.82 | bwd_microstep: 1985.32 | bwd_inner_microstep: 1980.02 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.61
+[2024-06-18 23:26:41,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3528.93 | bwd: 3854.55 | bwd_inner: 3844.28 | bwd_allreduce: 10.13 | step: 61.69
+ 88%|████████▊ | 619/700 [1:19:51<10:03,  7.45s/it]                                                   {'loss': 0.5589, 'learning_rate': 3.4704127165870517e-06, 'epoch': 6.19}
+ 88%|████████▊ | 619/700 [1:19:51<10:03,  7.45s/it][2024-06-18 23:26:45,363] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1739.19 | bwd_microstep: 1726.23 | bwd_inner_microstep: 1721.28 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.07
+[2024-06-18 23:26:49,359] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:26:49,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1980.21 | bwd_microstep: 1937.19 | bwd_inner_microstep: 1931.87 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.49
+[2024-06-18 23:26:49,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3719.37 | bwd: 3663.44 | bwd_inner: 3653.22 | bwd_allreduce: 10.03 | step: 61.57
+ 89%|████████▊ | 620/700 [1:19:58<09:56,  7.46s/it]                                                   {'loss': 0.4759, 'learning_rate': 3.386227145391463e-06, 'epoch': 6.2}
+ 89%|████████▊ | 620/700 [1:19:58<09:56,  7.46s/it][2024-06-18 23:26:52,457] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1420.40 | bwd_microstep: 1655.22 | bwd_inner_microstep: 1650.40 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 23:26:56,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:26:56,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.01 | bwd_microstep: 1916.37 | bwd_inner_microstep: 1911.02 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.59
+[2024-06-18 23:26:56,424] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3391.37 | bwd: 3571.60 | bwd_inner: 3561.48 | bwd_allreduce: 10.00 | step: 61.67
+ 89%|████████▊ | 621/700 [1:20:05<09:39,  7.34s/it]                                                   {'loss': 0.0602, 'learning_rate': 3.303039443532874e-06, 'epoch': 6.21}
+ 89%|████████▊ | 621/700 [1:20:05<09:39,  7.34s/it][2024-06-18 23:27:00,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1917.48 | bwd_microstep: 1808.10 | bwd_inner_microstep: 1803.11 | bwd_allreduce_microstep: 4.91 | step_microstep: 0.09
+[2024-06-18 23:27:03,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:27:03,876] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1740.24 | bwd_microstep: 1883.69 | bwd_inner_microstep: 1878.32 | bwd_allreduce_microstep: 5.29 | step_microstep: 62.23
+[2024-06-18 23:27:03,877] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3657.68 | bwd: 3691.81 | bwd_inner: 3681.46 | bwd_allreduce: 10.21 | step: 62.32
+ 89%|████████▉ | 622/700 [1:20:13<09:35,  7.37s/it]                                                   {'loss': 0.1631, 'learning_rate': 3.220851391825247e-06, 'epoch': 6.22}
+ 89%|████████▉ | 622/700 [1:20:13<09:35,  7.37s/it][2024-06-18 23:27:07,369] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1830.43 | bwd_microstep: 1640.18 | bwd_inner_microstep: 1635.14 | bwd_allreduce_microstep: 4.90 | step_microstep: 0.07
+[2024-06-18 23:27:11,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:27:11,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.79 | bwd_microstep: 1898.89 | bwd_inner_microstep: 1893.54 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.58
+[2024-06-18 23:27:11,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3797.20 | bwd: 3539.10 | bwd_inner: 3528.73 | bwd_allreduce: 10.18 | step: 61.66
+ 89%|████████▉ | 623/700 [1:20:20<09:29,  7.39s/it]                                                   {'loss': 0.239, 'learning_rate': 3.1396647496828247e-06, 'epoch': 6.23}
+ 89%|████████▉ | 623/700 [1:20:20<09:29,  7.39s/it][2024-06-18 23:27:15,195] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.91 | bwd_microstep: 1895.25 | bwd_inner_microstep: 1890.30 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.08
+[2024-06-18 23:27:19,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:27:19,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2000.09 | bwd_microstep: 1970.94 | bwd_inner_microstep: 1965.49 | bwd_allreduce_microstep: 5.34 | step_microstep: 61.63
+[2024-06-18 23:27:19,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3963.97 | bwd: 3866.21 | bwd_inner: 3855.89 | bwd_allreduce: 10.13 | step: 61.72
+ 89%|████████▉ | 624/700 [1:20:28<09:34,  7.55s/it]                                                   {'loss': 0.3236, 'learning_rate': 3.059481255082519e-06, 'epoch': 6.24}
+ 89%|████████▉ | 624/700 [1:20:28<09:34,  7.55s/it][2024-06-18 23:27:23,224] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.60 | bwd_microstep: 1959.44 | bwd_inner_microstep: 1954.62 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.07
+[2024-06-18 23:27:27,308] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:27:27,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2014.47 | bwd_microstep: 1990.99 | bwd_inner_microstep: 1985.63 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.68
+[2024-06-18 23:27:27,309] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 4010.04 | bwd: 3950.45 | bwd_inner: 3940.31 | bwd_allreduce: 10.00 | step: 61.76
+ 89%|████████▉ | 625/700 [1:20:36<09:38,  7.71s/it]                                                   {'loss': 0.5941, 'learning_rate': 2.980302624526693e-06, 'epoch': 6.25}
+ 89%|████████▉ | 625/700 [1:20:36<09:38,  7.71s/it][2024-06-18 23:27:31,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1969.08 | bwd_microstep: 1916.31 | bwd_inner_microstep: 1911.36 | bwd_allreduce_microstep: 4.83 | step_microstep: 0.14
+[2024-06-18 23:27:34,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:27:34,932] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1891.77 | bwd_microstep: 1743.81 | bwd_inner_microstep: 1738.32 | bwd_allreduce_microstep: 5.35 | step_microstep: 61.89
+[2024-06-18 23:27:34,933] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3860.82 | bwd: 3660.15 | bwd_inner: 3649.79 | bwd_allreduce: 10.16 | step: 62.03
+ 89%|████████▉ | 626/700 [1:20:44<09:28,  7.68s/it]                                                   {'loss': 0.0469, 'learning_rate': 2.9021305530063658e-06, 'epoch': 6.26}
+ 89%|████████▉ | 626/700 [1:20:44<09:28,  7.68s/it][2024-06-18 23:27:36,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 755.95 | bwd_microstep: 891.33 | bwd_inner_microstep: 886.45 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 23:27:40,654] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:27:40,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2002.42 | bwd_microstep: 1972.24 | bwd_inner_microstep: 1966.78 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.70
+[2024-06-18 23:27:40,655] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2758.34 | bwd: 2863.59 | bwd_inner: 2853.35 | bwd_allreduce: 9.99 | step: 61.77
+ 90%|████████▉ | 627/700 [1:20:50<08:37,  7.09s/it]                                                   {'loss': 0.1646, 'learning_rate': 2.8249667139650216e-06, 'epoch': 6.27}
+ 90%|████████▉ | 627/700 [1:20:50<08:37,  7.09s/it][2024-06-18 23:27:44,564] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.79 | bwd_microstep: 1914.92 | bwd_inner_microstep: 1907.72 | bwd_allreduce_microstep: 7.05 | step_microstep: 0.09
+[2024-06-18 23:27:48,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:27:48,514] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.16 | bwd_microstep: 1897.12 | bwd_inner_microstep: 1891.54 | bwd_allreduce_microstep: 5.50 | step_microstep: 63.05
+[2024-06-18 23:27:48,515] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3943.92 | bwd: 3812.07 | bwd_inner: 3799.34 | bwd_allreduce: 12.54 | step: 63.15
+ 90%|████████▉ | 628/700 [1:20:58<08:47,  7.32s/it]                                                   {'loss': 0.4639, 'learning_rate': 2.7488127592626866e-06, 'epoch': 6.28}
+ 90%|████████▉ | 628/700 [1:20:58<08:47,  7.32s/it][2024-06-18 23:27:52,082] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1760.74 | bwd_microstep: 1785.03 | bwd_inner_microstep: 1780.23 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 23:27:55,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:27:55,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1652.01 | bwd_microstep: 1837.81 | bwd_inner_microstep: 1832.38 | bwd_allreduce_microstep: 5.34 | step_microstep: 62.58
+[2024-06-18 23:27:55,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3412.72 | bwd: 3622.86 | bwd_inner: 3612.66 | bwd_allreduce: 10.07 | step: 62.66
+ 90%|████████▉ | 629/700 [1:21:05<08:36,  7.27s/it]                                                   {'loss': 0.6154, 'learning_rate': 2.6736703191406366e-06, 'epoch': 6.29}
+ 90%|████████▉ | 629/700 [1:21:05<08:36,  7.27s/it][2024-06-18 23:27:59,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.21 | bwd_microstep: 1928.92 | bwd_inner_microstep: 1924.07 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.08
+[2024-06-18 23:28:03,577] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:28:03,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1980.89 | bwd_microstep: 1936.99 | bwd_inner_microstep: 1931.49 | bwd_allreduce_microstep: 5.36 | step_microstep: 61.82
+[2024-06-18 23:28:03,578] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3957.07 | bwd: 3865.93 | bwd_inner: 3855.63 | bwd_allreduce: 10.14 | step: 61.90
+ 90%|█████████ | 630/700 [1:21:13<08:42,  7.47s/it]                                                   {'loss': 0.5069, 'learning_rate': 2.5995410021864787e-06, 'epoch': 6.3}
+ 90%|█████████ | 630/700 [1:21:13<08:42,  7.47s/it][2024-06-18 23:28:06,928] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1498.71 | bwd_microstep: 1830.80 | bwd_inner_microstep: 1825.72 | bwd_allreduce_microstep: 4.96 | step_microstep: 0.08
+[2024-06-18 23:28:10,071] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 23:28:10,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1417.65 | bwd_microstep: 1643.18 | bwd_inner_microstep: 1637.54 | bwd_allreduce_microstep: 5.50 | step_microstep: 62.89
+[2024-06-18 23:28:10,072] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2916.31 | bwd: 3474.00 | bwd_inner: 3463.36 | bwd_allreduce: 10.45 | step: 62.98
+ 90%|█████████ | 631/700 [1:21:19<08:15,  7.17s/it]                                                   {'loss': 0.2421, 'learning_rate': 2.5264263952996915e-06, 'epoch': 6.31}
+ 90%|█████████ | 631/700 [1:21:19<08:15,  7.17s/it][2024-06-18 23:28:13,994] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.46 | bwd_microstep: 1924.21 | bwd_inner_microstep: 1919.29 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.08
+[2024-06-18 23:28:17,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:28:17,801] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1918.83 | bwd_microstep: 1808.74 | bwd_inner_microstep: 1803.34 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.64
+[2024-06-18 23:28:17,802] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3894.26 | bwd: 3732.97 | bwd_inner: 3722.71 | bwd_allreduce: 10.07 | step: 61.72
+ 90%|█████████ | 632/700 [1:21:27<08:19,  7.34s/it]                                                   {'loss': 0.2286, 'learning_rate': 2.4543280636576794e-06, 'epoch': 6.32}
+ 90%|█████████ | 632/700 [1:21:27<08:19,  7.34s/it][2024-06-18 23:28:21,214] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1520.93 | bwd_microstep: 1870.34 | bwd_inner_microstep: 1865.42 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.08
+[2024-06-18 23:28:25,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:28:25,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.01 | bwd_microstep: 1890.79 | bwd_inner_microstep: 1885.33 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.77
+[2024-06-18 23:28:25,152] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3487.91 | bwd: 3761.16 | bwd_inner: 3750.85 | bwd_allreduce: 10.07 | step: 61.85
+ 90%|█████████ | 633/700 [1:21:34<08:12,  7.34s/it]                                                   {'loss': 0.2885, 'learning_rate': 2.3832475506822937e-06, 'epoch': 6.33}
+ 90%|█████████ | 633/700 [1:21:34<08:12,  7.34s/it][2024-06-18 23:28:29,075] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.96 | bwd_microstep: 1925.80 | bwd_inner_microstep: 1920.89 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.07
+[2024-06-18 23:28:33,148] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.94
+[2024-06-18 23:28:33,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2007.38 | bwd_microstep: 1984.66 | bwd_inner_microstep: 1978.94 | bwd_allreduce_microstep: 5.61 | step_microstep: 63.39
+[2024-06-18 23:28:33,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3983.31 | bwd: 3910.48 | bwd_inner: 3899.93 | bwd_allreduce: 10.36 | step: 63.47
+ 91%|█████████ | 634/700 [1:21:42<08:17,  7.54s/it]                                                   {'loss': 0.4921, 'learning_rate': 2.3131863780067043e-06, 'epoch': 6.34}
+ 91%|█████████ | 634/700 [1:21:42<08:17,  7.54s/it][2024-06-18 23:28:36,519] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1505.29 | bwd_microstep: 1839.71 | bwd_inner_microstep: 1834.70 | bwd_allreduce_microstep: 4.87 | step_microstep: 0.07
+[2024-06-18 23:28:39,782] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:28:39,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1548.27 | bwd_microstep: 1636.04 | bwd_inner_microstep: 1630.59 | bwd_allreduce_microstep: 5.34 | step_microstep: 61.85
+[2024-06-18 23:28:39,783] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3053.53 | bwd: 3475.77 | bwd_inner: 3465.37 | bwd_allreduce: 10.18 | step: 61.93
+ 91%|█████████ | 635/700 [1:21:49<07:52,  7.27s/it]                                                   {'loss': 0.1256, 'learning_rate': 2.24414604544293e-06, 'epoch': 6.35}
+ 91%|█████████ | 635/700 [1:21:49<07:52,  7.27s/it][2024-06-18 23:28:41,362] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 728.19 | bwd_microstep: 830.28 | bwd_inner_microstep: 825.49 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 23:28:45,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:28:45,315] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.40 | bwd_microstep: 1902.26 | bwd_inner_microstep: 1896.90 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.77
+[2024-06-18 23:28:45,316] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2698.56 | bwd: 2732.57 | bwd_inner: 2722.43 | bwd_allreduce: 10.00 | step: 61.85
+ 91%|█████████ | 636/700 [1:21:54<07:11,  6.75s/it]                                                   {'loss': 0.2475, 'learning_rate': 2.1761280309496646e-06, 'epoch': 6.36}
+ 91%|█████████ | 636/700 [1:21:54<07:11,  6.75s/it][2024-06-18 23:28:49,210] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.43 | bwd_microstep: 1899.23 | bwd_inner_microstep: 1894.26 | bwd_allreduce_microstep: 4.90 | step_microstep: 0.09
+[2024-06-18 23:28:53,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:28:53,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.74 | bwd_microstep: 1920.10 | bwd_inner_microstep: 1914.60 | bwd_allreduce_microstep: 5.36 | step_microstep: 61.81
+[2024-06-18 23:28:53,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3949.13 | bwd: 3819.36 | bwd_inner: 3808.93 | bwd_allreduce: 10.24 | step: 61.90
+ 91%|█████████ | 637/700 [1:22:02<07:26,  7.08s/it]                                                   {'loss': 0.2944, 'learning_rate': 2.1091337906006482e-06, 'epoch': 6.37}
+ 91%|█████████ | 637/700 [1:22:02<07:26,  7.08s/it][2024-06-18 23:28:56,767] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1864.31 | bwd_microstep: 1694.06 | bwd_inner_microstep: 1689.13 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.07
+[2024-06-18 23:29:00,864] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:29:00,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2015.87 | bwd_microstep: 2001.55 | bwd_inner_microstep: 1996.12 | bwd_allreduce_microstep: 5.35 | step_microstep: 61.47
+[2024-06-18 23:29:00,865] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3880.15 | bwd: 3695.64 | bwd_inner: 3685.32 | bwd_allreduce: 10.16 | step: 61.55
+ 91%|█████████ | 638/700 [1:22:10<07:30,  7.26s/it]                                                   {'loss': 0.3756, 'learning_rate': 2.043164758553523e-06, 'epoch': 6.38}
+ 91%|█████████ | 638/700 [1:22:10<07:30,  7.26s/it][2024-06-18 23:29:03,887] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1393.56 | bwd_microstep: 1607.32 | bwd_inner_microstep: 1602.48 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 23:29:07,845] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:29:07,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.95 | bwd_microstep: 1910.37 | bwd_inner_microstep: 1904.90 | bwd_allreduce_microstep: 5.34 | step_microstep: 61.62
+[2024-06-18 23:29:07,846] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3362.47 | bwd: 3517.72 | bwd_inner: 3507.48 | bwd_allreduce: 10.04 | step: 61.69
+ 91%|█████████▏| 639/700 [1:22:17<07:17,  7.18s/it]                                                   {'loss': 0.1635, 'learning_rate': 1.9782223470191042e-06, 'epoch': 6.39}
+ 91%|█████████▏| 639/700 [1:22:17<07:17,  7.18s/it][2024-06-18 23:29:11,836] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2002.63 | bwd_microstep: 1965.27 | bwd_inner_microstep: 1960.19 | bwd_allreduce_microstep: 4.98 | step_microstep: 0.10
+[2024-06-18 23:29:15,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:29:15,777] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.21 | bwd_microstep: 1891.44 | bwd_inner_microstep: 1885.97 | bwd_allreduce_microstep: 5.33 | step_microstep: 62.07
+[2024-06-18 23:29:15,778] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3970.80 | bwd: 3856.73 | bwd_inner: 3846.23 | bwd_allreduce: 10.30 | step: 62.17
+ 91%|█████████▏| 640/700 [1:22:25<07:24,  7.40s/it]                                                   {'loss': 0.3118, 'learning_rate': 1.914307946231164e-06, 'epoch': 6.4}
+ 91%|█████████▏| 640/700 [1:22:25<07:24,  7.40s/it][2024-06-18 23:29:18,873] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1419.54 | bwd_microstep: 1655.01 | bwd_inner_microstep: 1650.20 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 23:29:22,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 23:29:22,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1895.85 | bwd_microstep: 1746.09 | bwd_inner_microstep: 1740.65 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.67
+[2024-06-18 23:29:22,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3315.36 | bwd: 3401.13 | bwd_inner: 3390.92 | bwd_allreduce: 10.01 | step: 61.75
+ 92%|█████████▏| 641/700 [1:22:32<07:06,  7.23s/it]                                                   {'loss': 0.0041, 'learning_rate': 1.8514229244166569e-06, 'epoch': 6.41}
+ 92%|█████████▏| 641/700 [1:22:32<07:06,  7.23s/it][2024-06-18 23:29:26,574] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.96 | bwd_microstep: 1960.54 | bwd_inner_microstep: 1955.60 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.07
+[2024-06-18 23:29:30,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.81
+[2024-06-18 23:29:30,558] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1978.57 | bwd_microstep: 1925.94 | bwd_inner_microstep: 1920.65 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.69
+[2024-06-18 23:29:30,559] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3974.50 | bwd: 3886.50 | bwd_inner: 3876.30 | bwd_allreduce: 10.03 | step: 61.76
+ 92%|█████████▏| 642/700 [1:22:40<07:12,  7.45s/it]                                                   {'loss': 0.6769, 'learning_rate': 1.7895686277664469e-06, 'epoch': 6.42}
+ 92%|█████████▏| 642/700 [1:22:40<07:12,  7.45s/it][2024-06-18 23:29:34,451] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.68 | bwd_microstep: 1897.20 | bwd_inner_microstep: 1892.25 | bwd_allreduce_microstep: 4.88 | step_microstep: 0.14
+[2024-06-18 23:29:38,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:29:38,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1919.35 | bwd_microstep: 1816.89 | bwd_inner_microstep: 1811.49 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.62
+[2024-06-18 23:29:38,268] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3892.99 | bwd: 3714.12 | bwd_inner: 3703.81 | bwd_allreduce: 10.12 | step: 61.77
+ 92%|█████████▏| 643/700 [1:22:47<07:09,  7.53s/it]                                                   {'loss': 0.1385, 'learning_rate': 1.7287463804064875e-06, 'epoch': 6.43}
+ 92%|█████████▏| 643/700 [1:22:47<07:09,  7.53s/it][2024-06-18 23:29:42,263] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2002.47 | bwd_microstep: 1970.18 | bwd_inner_microstep: 1965.39 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 23:29:46,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 23:29:46,249] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.55 | bwd_microstep: 1925.55 | bwd_inner_microstep: 1920.20 | bwd_allreduce_microstep: 5.28 | step_microstep: 63.45
+[2024-06-18 23:29:46,250] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3981.99 | bwd: 3895.76 | bwd_inner: 3885.62 | bwd_allreduce: 10.01 | step: 63.53
+ 92%|█████████▏| 644/700 [1:22:55<07:09,  7.66s/it]                                                   {'loss': 0.4119, 'learning_rate': 1.6689574843694433e-06, 'epoch': 6.44}
+ 92%|█████████▏| 644/700 [1:22:55<07:09,  7.66s/it][2024-06-18 23:29:47,462] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 556.37 | bwd_microstep: 635.44 | bwd_inner_microstep: 630.58 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.07
+[2024-06-18 23:29:51,495] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:29:51,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1997.61 | bwd_microstep: 1956.37 | bwd_inner_microstep: 1950.94 | bwd_allreduce_microstep: 5.29 | step_microstep: 62.01
+[2024-06-18 23:29:51,496] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 2553.95 | bwd: 2591.83 | bwd_inner: 2581.60 | bwd_allreduce: 10.04 | step: 62.09
+ 92%|█████████▏| 645/700 [1:23:00<06:21,  6.94s/it]                                                   {'loss': 0.5452, 'learning_rate': 1.6102032195668637e-06, 'epoch': 6.45}
+ 92%|█████████▏| 645/700 [1:23:00<06:21,  6.94s/it][2024-06-18 23:29:55,126] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1882.64 | bwd_microstep: 1725.22 | bwd_inner_microstep: 1720.40 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.07
+[2024-06-18 23:29:58,508] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:29:58,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1499.84 | bwd_microstep: 1804.27 | bwd_inner_microstep: 1798.86 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.75
+[2024-06-18 23:29:58,509] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3382.45 | bwd: 3529.51 | bwd_inner: 3519.30 | bwd_allreduce: 10.08 | step: 61.84
+ 92%|█████████▏| 646/700 [1:23:08<06:15,  6.96s/it]                                                   {'loss': 0.1845, 'learning_rate': 1.5524848437617756e-06, 'epoch': 6.46}
+ 92%|█████████▏| 646/700 [1:23:08<06:15,  6.96s/it][2024-06-18 23:30:02,378] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1960.68 | bwd_microstep: 1886.43 | bwd_inner_microstep: 1881.56 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.07
+[2024-06-18 23:30:06,332] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:30:06,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.44 | bwd_microstep: 1907.74 | bwd_inner_microstep: 1902.41 | bwd_allreduce_microstep: 5.26 | step_microstep: 61.71
+[2024-06-18 23:30:06,333] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3928.09 | bwd: 3794.20 | bwd_inner: 3784.02 | bwd_allreduce: 10.05 | step: 61.79
+ 92%|█████████▏| 647/700 [1:23:15<06:22,  7.22s/it]                                                   {'loss': 0.5809, 'learning_rate': 1.4958035925417003e-06, 'epoch': 6.47}
+ 92%|█████████▏| 647/700 [1:23:15<06:22,  7.22s/it][2024-06-18 23:30:09,696] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1598.47 | bwd_microstep: 1743.80 | bwd_inner_microstep: 1738.95 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.07
+[2024-06-18 23:30:13,682] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:30:13,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1982.29 | bwd_microstep: 1923.32 | bwd_inner_microstep: 1917.94 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.65
+[2024-06-18 23:30:13,683] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3580.73 | bwd: 3667.14 | bwd_inner: 3656.93 | bwd_allreduce: 10.08 | step: 61.73
+ 93%|█████████▎| 648/700 [1:23:23<06:17,  7.26s/it]                                                   {'loss': 0.0547, 'learning_rate': 1.4401606792923017e-06, 'epoch': 6.48}
+ 93%|█████████▎| 648/700 [1:23:23<06:17,  7.26s/it][2024-06-18 23:30:17,563] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.98 | bwd_microstep: 1893.17 | bwd_inner_microstep: 1888.03 | bwd_allreduce_microstep: 5.00 | step_microstep: 0.09
+[2024-06-18 23:30:20,996] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:30:20,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1508.49 | bwd_microstep: 1846.22 | bwd_inner_microstep: 1840.87 | bwd_allreduce_microstep: 5.26 | step_microstep: 62.02
+[2024-06-18 23:30:20,997] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3473.44 | bwd: 3739.41 | bwd_inner: 3728.97 | bwd_allreduce: 10.25 | step: 62.12
+ 93%|█████████▎| 649/700 [1:23:30<06:11,  7.28s/it]                                                   {'loss': 0.7891, 'learning_rate': 1.3855572951713248e-06, 'epoch': 6.49}
+ 93%|█████████▎| 649/700 [1:23:30<06:11,  7.28s/it][2024-06-18 23:30:24,103] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1424.29 | bwd_microstep: 1660.33 | bwd_inner_microstep: 1655.39 | bwd_allreduce_microstep: 4.86 | step_microstep: 0.07
+[2024-06-18 23:30:28,052] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.92
+[2024-06-18 23:30:28,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.27 | bwd_microstep: 1897.04 | bwd_inner_microstep: 1891.61 | bwd_allreduce_microstep: 5.28 | step_microstep: 62.93
+[2024-06-18 23:30:28,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3395.53 | bwd: 3557.39 | bwd_inner: 3547.07 | bwd_allreduce: 10.13 | step: 63.02
+ 93%|█████████▎| 650/700 [1:23:37<06:00,  7.21s/it]                                                   {'loss': 0.4184, 'learning_rate': 1.331994609083137e-06, 'epoch': 6.5}
+ 93%|█████████▎| 650/700 [1:23:37<06:00,  7.21s/it][2024-06-18 23:30:31,688] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1886.36 | bwd_microstep: 1727.20 | bwd_inner_microstep: 1722.25 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.07
+[2024-06-18 23:30:35,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:30:35,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.10 | bwd_microstep: 1932.01 | bwd_inner_microstep: 1926.62 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.56
+[2024-06-18 23:30:35,676] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3862.44 | bwd: 3659.23 | bwd_inner: 3648.97 | bwd_allreduce: 10.06 | step: 61.65
+ 93%|█████████▎| 651/700 [1:23:45<05:59,  7.33s/it]                                                   {'loss': 0.1763, 'learning_rate': 1.2794737676536994e-06, 'epoch': 6.51}
+ 93%|█████████▎| 651/700 [1:23:45<05:59,  7.33s/it][2024-06-18 23:30:39,144] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1735.79 | bwd_microstep: 1710.65 | bwd_inner_microstep: 1705.88 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.08
+[2024-06-18 23:30:43,084] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 23:30:43,085] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1966.77 | bwd_microstep: 1893.44 | bwd_inner_microstep: 1887.82 | bwd_allreduce_microstep: 5.48 | step_microstep: 62.60
+[2024-06-18 23:30:43,085] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3702.54 | bwd: 3604.12 | bwd_inner: 3593.75 | bwd_allreduce: 10.19 | step: 62.68
+ 93%|█████████▎| 652/700 [1:23:52<05:53,  7.36s/it]                                                   {'loss': 0.2703, 'learning_rate': 1.2279958952060134e-06, 'epoch': 6.52}
+ 93%|█████████▎| 652/700 [1:23:52<05:53,  7.36s/it][2024-06-18 23:30:47,053] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1991.91 | bwd_microstep: 1953.92 | bwd_inner_microstep: 1949.15 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 23:30:51,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.80
+[2024-06-18 23:30:51,001] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.34 | bwd_microstep: 1898.28 | bwd_inner_microstep: 1892.97 | bwd_allreduce_microstep: 5.23 | step_microstep: 61.57
+[2024-06-18 23:30:51,002] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3962.21 | bwd: 3852.22 | bwd_inner: 3842.15 | bwd_allreduce: 9.96 | step: 61.65
+ 93%|█████████▎| 653/700 [1:24:00<05:53,  7.52s/it]                                                   {'loss': 0.1685, 'learning_rate': 1.1775620937360676e-06, 'epoch': 6.53}
+ 93%|█████████▎| 653/700 [1:24:00<05:53,  7.52s/it][2024-06-18 23:30:54,870] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.54 | bwd_microstep: 1883.19 | bwd_inner_microstep: 1878.22 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.07
+[2024-06-18 23:30:58,871] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:30:58,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1983.99 | bwd_microstep: 1938.12 | bwd_inner_microstep: 1932.83 | bwd_allreduce_microstep: 5.24 | step_microstep: 62.04
+[2024-06-18 23:30:58,872] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3947.50 | bwd: 3821.33 | bwd_inner: 3811.11 | bwd_allreduce: 10.06 | step: 62.12
+ 93%|█████████▎| 654/700 [1:24:08<05:50,  7.63s/it]                                                   {'loss': 0.4687, 'learning_rate': 1.1281734428892409e-06, 'epoch': 6.54}
+ 93%|█████████▎| 654/700 [1:24:08<05:50,  7.63s/it][2024-06-18 23:31:02,619] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1916.82 | bwd_microstep: 1808.42 | bwd_inner_microstep: 1803.50 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.08
+[2024-06-18 23:31:06,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:31:06,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1655.95 | bwd_microstep: 1835.81 | bwd_inner_microstep: 1830.40 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.74
+[2024-06-18 23:31:06,191] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3572.73 | bwd: 3644.25 | bwd_inner: 3634.01 | bwd_allreduce: 10.01 | step: 61.83
+ 94%|█████████▎| 655/700 [1:24:15<05:39,  7.54s/it]                                                   {'loss': 0.3129, 'learning_rate': 1.0798309999371536e-06, 'epoch': 6.55}
+ 94%|█████████▎| 655/700 [1:24:15<05:39,  7.54s/it][2024-06-18 23:31:10,168] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1997.54 | bwd_microstep: 1958.61 | bwd_inner_microstep: 1953.76 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 23:31:13,983] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:31:13,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1921.83 | bwd_microstep: 1813.27 | bwd_inner_microstep: 1807.88 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.85
+[2024-06-18 23:31:13,984] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3919.33 | bwd: 3771.90 | bwd_inner: 3761.70 | bwd_allreduce: 10.04 | step: 61.93
+ 94%|█████████▎| 656/700 [1:24:23<05:34,  7.61s/it]                                                   {'loss': 0.3212, 'learning_rate': 1.0325357997551134e-06, 'epoch': 6.56}
+ 94%|█████████▎| 656/700 [1:24:23<05:34,  7.61s/it][2024-06-18 23:31:17,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1883.63 | bwd_microstep: 1726.04 | bwd_inner_microstep: 1721.22 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.08
+[2024-06-18 23:31:21,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:31:21,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1981.92 | bwd_microstep: 1934.26 | bwd_inner_microstep: 1928.58 | bwd_allreduce_microstep: 5.59 | step_microstep: 64.52
+[2024-06-18 23:31:21,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3865.52 | bwd: 3660.31 | bwd_inner: 3649.83 | bwd_allreduce: 10.34 | step: 64.60
+ 94%|█████████▍| 657/700 [1:24:31<05:27,  7.62s/it]                                                   {'loss': 0.1715, 'learning_rate': 9.862888547998829e-07, 'epoch': 6.57}
+ 94%|█████████▍| 657/700 [1:24:31<05:27,  7.62s/it][2024-06-18 23:31:25,584] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1993.86 | bwd_microstep: 1953.42 | bwd_inner_microstep: 1948.57 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.07
+[2024-06-18 23:31:29,392] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.80
+[2024-06-18 23:31:29,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1919.10 | bwd_microstep: 1809.05 | bwd_inner_microstep: 1803.71 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.50
+[2024-06-18 23:31:29,393] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3912.93 | bwd: 3762.49 | bwd_inner: 3752.32 | bwd_allreduce: 10.04 | step: 61.59
+ 94%|█████████▍| 658/700 [1:24:38<05:21,  7.67s/it]                                                   {'loss': 0.0958, 'learning_rate': 9.410911550880475e-07, 'epoch': 6.58}
+ 94%|█████████▍| 658/700 [1:24:38<05:21,  7.67s/it][2024-06-18 23:31:33,314] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.98 | bwd_microstep: 1923.57 | bwd_inner_microstep: 1918.65 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 23:31:37,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 2.00
+[2024-06-18 23:31:37,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1982.07 | bwd_microstep: 1928.41 | bwd_inner_microstep: 1922.89 | bwd_allreduce_microstep: 5.44 | step_microstep: 64.01
+[2024-06-18 23:31:37,307] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3958.02 | bwd: 3852.00 | bwd_inner: 3841.62 | bwd_allreduce: 10.19 | step: 64.09
+ 94%|█████████▍| 659/700 [1:24:46<05:17,  7.74s/it]                                                   {'loss': 0.4998, 'learning_rate': 8.969436681748211e-07, 'epoch': 6.59}
+ 94%|█████████▍| 659/700 [1:24:46<05:17,  7.74s/it][2024-06-18 23:31:41,190] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.89 | bwd_microstep: 1894.65 | bwd_inner_microstep: 1889.70 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.08
+[2024-06-18 23:31:45,269] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:31:45,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2013.32 | bwd_microstep: 1985.38 | bwd_inner_microstep: 1980.00 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.73
+[2024-06-18 23:31:45,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3978.17 | bwd: 3880.04 | bwd_inner: 3869.79 | bwd_allreduce: 10.06 | step: 61.81
+ 94%|█████████▍| 660/700 [1:24:54<05:12,  7.81s/it]                                                   {'loss': 0.5603, 'learning_rate': 8.53847339133318e-07, 'epoch': 6.6}
+ 94%|█████████▍| 660/700 [1:24:54<05:12,  7.81s/it][2024-06-18 23:31:48,761] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1830.21 | bwd_microstep: 1639.36 | bwd_inner_microstep: 1634.37 | bwd_allreduce_microstep: 4.93 | step_microstep: 0.14
+[2024-06-18 23:31:52,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:31:52,751] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1982.58 | bwd_microstep: 1927.19 | bwd_inner_microstep: 1921.72 | bwd_allreduce_microstep: 5.31 | step_microstep: 62.00
+[2024-06-18 23:31:52,752] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3812.77 | bwd: 3566.57 | bwd_inner: 3556.14 | bwd_allreduce: 10.22 | step: 62.15
+ 94%|█████████▍| 661/700 [1:25:02<05:00,  7.71s/it]                                                   {'loss': 0.2253, 'learning_rate': 8.118030905343244e-07, 'epoch': 6.61}
+ 94%|█████████▍| 661/700 [1:25:02<05:00,  7.71s/it][2024-06-18 23:31:56,718] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1990.31 | bwd_microstep: 1954.35 | bwd_inner_microstep: 1949.41 | bwd_allreduce_microstep: 4.80 | step_microstep: 0.07
+[2024-06-18 23:32:00,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:32:00,666] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.32 | bwd_microstep: 1894.45 | bwd_inner_microstep: 1888.93 | bwd_allreduce_microstep: 5.44 | step_microstep: 62.65
+[2024-06-18 23:32:00,667] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3962.59 | bwd: 3848.81 | bwd_inner: 3838.40 | bwd_allreduce: 10.22 | step: 62.72
+ 95%|█████████▍| 662/700 [1:25:10<04:55,  7.77s/it]                                                   {'loss': 0.4123, 'learning_rate': 7.708118224265537e-07, 'epoch': 6.62}
+ 95%|█████████▍| 662/700 [1:25:10<04:55,  7.77s/it][2024-06-18 23:32:03,867] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1542.18 | bwd_microstep: 1636.62 | bwd_inner_microstep: 1631.77 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 23:32:07,773] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:32:07,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1946.91 | bwd_microstep: 1879.46 | bwd_inner_microstep: 1874.19 | bwd_allreduce_microstep: 5.23 | step_microstep: 62.06
+[2024-06-18 23:32:07,774] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3489.06 | bwd: 3516.10 | bwd_inner: 3505.99 | bwd_allreduce: 10.00 | step: 62.14
+ 95%|█████████▍| 663/700 [1:25:17<04:40,  7.57s/it]                                                   {'loss': 0.1991, 'learning_rate': 7.308744123174005e-07, 'epoch': 6.63}
+ 95%|█████████▍| 663/700 [1:25:17<04:40,  7.57s/it][2024-06-18 23:32:11,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.96 | bwd_microstep: 1914.25 | bwd_inner_microstep: 1909.28 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.08
+[2024-06-18 23:32:15,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:32:15,402] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1894.68 | bwd_microstep: 1745.63 | bwd_inner_microstep: 1740.18 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.90
+[2024-06-18 23:32:15,403] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3865.61 | bwd: 3659.90 | bwd_inner: 3649.57 | bwd_allreduce: 10.11 | step: 61.98
+ 95%|█████████▍| 664/700 [1:25:24<04:33,  7.59s/it]                                                   {'loss': 0.2339, 'learning_rate': 6.919917151540945e-07, 'epoch': 6.64}
+ 95%|█████████▍| 664/700 [1:25:24<04:33,  7.59s/it][2024-06-18 23:32:18,765] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1598.05 | bwd_microstep: 1743.00 | bwd_inner_microstep: 1738.06 | bwd_allreduce_microstep: 4.81 | step_microstep: 0.07
+[2024-06-18 23:32:22,824] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:32:22,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2006.63 | bwd_microstep: 1973.85 | bwd_inner_microstep: 1968.49 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.71
+[2024-06-18 23:32:22,825] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3604.64 | bwd: 3716.87 | bwd_inner: 3706.61 | bwd_allreduce: 10.07 | step: 61.79
+ 95%|█████████▌| 665/700 [1:25:32<04:23,  7.54s/it]                                                   {'loss': 0.1869, 'learning_rate': 6.54164563305465e-07, 'epoch': 6.65}
+ 95%|█████████▌| 665/700 [1:25:32<04:23,  7.54s/it][2024-06-18 23:32:26,150] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1496.10 | bwd_microstep: 1807.75 | bwd_inner_microstep: 1802.69 | bwd_allreduce_microstep: 4.97 | step_microstep: 0.10
+[2024-06-18 23:32:29,756] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:32:29,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1658.99 | bwd_microstep: 1866.19 | bwd_inner_microstep: 1860.88 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.90
+[2024-06-18 23:32:29,757] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3155.06 | bwd: 3673.97 | bwd_inner: 3663.60 | bwd_allreduce: 10.22 | step: 62.00
+ 95%|█████████▌| 666/700 [1:25:39<04:10,  7.36s/it]                                                   {'loss': 0.4296, 'learning_rate': 6.173937665440943e-07, 'epoch': 6.66}
+ 95%|█████████▌| 666/700 [1:25:39<04:10,  7.36s/it][2024-06-18 23:32:33,270] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1650.05 | bwd_microstep: 1841.96 | bwd_inner_microstep: 1837.15 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 23:32:37,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:32:37,297] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1993.69 | bwd_microstep: 1953.52 | bwd_inner_microstep: 1948.16 | bwd_allreduce_microstep: 5.24 | step_microstep: 61.61
+[2024-06-18 23:32:37,298] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3643.70 | bwd: 3795.50 | bwd_inner: 3785.39 | bwd_allreduce: 9.97 | step: 61.69
+ 95%|█████████▌| 667/700 [1:25:46<04:04,  7.41s/it]                                                   {'loss': 0.6248, 'learning_rate': 5.816801120289761e-07, 'epoch': 6.67}
+ 95%|█████████▌| 667/700 [1:25:46<04:04,  7.41s/it][2024-06-18 23:32:41,172] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.79 | bwd_microstep: 1888.17 | bwd_inner_microstep: 1883.32 | bwd_allreduce_microstep: 4.77 | step_microstep: 0.11
+[2024-06-18 23:32:45,227] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:32:45,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2003.12 | bwd_microstep: 1971.30 | bwd_inner_microstep: 1966.02 | bwd_allreduce_microstep: 5.20 | step_microstep: 62.04
+[2024-06-18 23:32:45,228] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3967.88 | bwd: 3859.49 | bwd_inner: 3849.37 | bwd_allreduce: 9.98 | step: 62.16
+ 95%|█████████▌| 668/700 [1:25:54<04:02,  7.57s/it]                                                   {'loss': 0.4564, 'learning_rate': 5.470243642886729e-07, 'epoch': 6.68}
+ 95%|█████████▌| 668/700 [1:25:54<04:02,  7.57s/it][2024-06-18 23:32:49,149] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1973.44 | bwd_microstep: 1925.47 | bwd_inner_microstep: 1920.75 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 23:32:52,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 23:32:52,458] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1567.04 | bwd_microstep: 1662.61 | bwd_inner_microstep: 1657.14 | bwd_allreduce_microstep: 5.40 | step_microstep: 62.54
+[2024-06-18 23:32:52,459] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3540.45 | bwd: 3588.11 | bwd_inner: 3577.90 | bwd_allreduce: 10.10 | step: 62.62
+ 96%|█████████▌| 669/700 [1:26:01<03:51,  7.47s/it]                                                   {'loss': 0.0565, 'learning_rate': 5.13427265204941e-07, 'epoch': 6.69}
+ 96%|█████████▌| 669/700 [1:26:01<03:51,  7.47s/it][2024-06-18 23:32:56,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.33 | bwd_microstep: 1911.13 | bwd_inner_microstep: 1906.46 | bwd_allreduce_microstep: 4.62 | step_microstep: 0.07
+[2024-06-18 23:33:00,394] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:33:00,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1994.72 | bwd_microstep: 1960.58 | bwd_inner_microstep: 1955.11 | bwd_allreduce_microstep: 5.33 | step_microstep: 61.80
+[2024-06-18 23:33:00,395] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3963.02 | bwd: 3871.74 | bwd_inner: 3861.63 | bwd_allreduce: 9.95 | step: 61.88
+ 96%|█████████▌| 670/700 [1:26:09<03:48,  7.61s/it]                                                   {'loss': 0.6664, 'learning_rate': 4.808895339968645e-07, 'epoch': 6.7}
+ 96%|█████████▌| 670/700 [1:26:09<03:48,  7.61s/it][2024-06-18 23:33:04,106] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1809.96 | bwd_microstep: 1879.71 | bwd_inner_microstep: 1874.65 | bwd_allreduce_microstep: 4.97 | step_microstep: 0.08
+[2024-06-18 23:33:07,595] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:33:07,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1529.89 | bwd_microstep: 1880.36 | bwd_inner_microstep: 1875.04 | bwd_allreduce_microstep: 5.25 | step_microstep: 61.76
+[2024-06-18 23:33:07,596] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3339.82 | bwd: 3760.09 | bwd_inner: 3749.74 | bwd_allreduce: 10.22 | step: 61.86
+ 96%|█████████▌| 671/700 [1:26:17<03:37,  7.49s/it]                                                   {'loss': 0.6173, 'learning_rate': 4.4941186720546255e-07, 'epoch': 6.71}
+ 96%|█████████▌| 671/700 [1:26:17<03:37,  7.49s/it][2024-06-18 23:33:11,256] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1894.54 | bwd_microstep: 1743.41 | bwd_inner_microstep: 1738.62 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 23:33:14,651] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:33:14,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1499.26 | bwd_microstep: 1817.57 | bwd_inner_microstep: 1812.05 | bwd_allreduce_microstep: 5.38 | step_microstep: 61.80
+[2024-06-18 23:33:14,652] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3393.77 | bwd: 3561.00 | bwd_inner: 3550.73 | bwd_allreduce: 10.09 | step: 61.88
+ 96%|█████████▌| 672/700 [1:26:24<03:25,  7.36s/it]                                                   {'loss': 0.1852, 'learning_rate': 4.189949386787462e-07, 'epoch': 6.72}
+ 96%|█████████▌| 672/700 [1:26:24<03:25,  7.36s/it][2024-06-18 23:33:17,940] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1576.17 | bwd_microstep: 1690.71 | bwd_inner_microstep: 1685.91 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 23:33:21,985] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:33:21,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1996.91 | bwd_microstep: 1968.45 | bwd_inner_microstep: 1963.12 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.89
+[2024-06-18 23:33:21,986] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3573.04 | bwd: 3659.18 | bwd_inner: 3649.04 | bwd_allreduce: 10.02 | step: 61.97
+ 96%|█████████▌| 673/700 [1:26:31<03:18,  7.35s/it]                                                   {'loss': 0.1576, 'learning_rate': 3.8963939955731775e-07, 'epoch': 6.73}
+ 96%|█████████▌| 673/700 [1:26:31<03:18,  7.35s/it][2024-06-18 23:33:25,861] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.86 | bwd_microstep: 1891.02 | bwd_inner_microstep: 1886.26 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+[2024-06-18 23:33:29,264] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:33:29,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1505.19 | bwd_microstep: 1817.73 | bwd_inner_microstep: 1812.19 | bwd_allreduce_microstep: 5.47 | step_microstep: 62.86
+[2024-06-18 23:33:29,265] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3468.02 | bwd: 3708.77 | bwd_inner: 3698.46 | bwd_allreduce: 10.20 | step: 62.94
+ 96%|█████████▋| 674/700 [1:26:38<03:10,  7.33s/it]                                                   {'loss': 0.5002, 'learning_rate': 3.6134587826043285e-07, 'epoch': 6.74}
+ 96%|█████████▋| 674/700 [1:26:38<03:10,  7.33s/it][2024-06-18 23:33:33,140] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1962.29 | bwd_microstep: 1890.20 | bwd_inner_microstep: 1885.39 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 23:33:36,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:33:36,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1578.50 | bwd_microstep: 1686.14 | bwd_inner_microstep: 1680.77 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.94
+[2024-06-18 23:33:36,484] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3540.77 | bwd: 3576.36 | bwd_inner: 3566.20 | bwd_allreduce: 10.03 | step: 62.03
+ 96%|█████████▋| 675/700 [1:26:45<03:02,  7.30s/it]                                                   {'loss': 0.2256, 'learning_rate': 3.3411498047254965e-07, 'epoch': 6.75}
+ 96%|█████████▋| 675/700 [1:26:45<03:02,  7.30s/it][2024-06-18 23:33:40,640] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2049.28 | bwd_microstep: 2084.97 | bwd_inner_microstep: 2080.16 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 23:33:44,571] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.92
+[2024-06-18 23:33:44,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.45 | bwd_microstep: 1888.08 | bwd_inner_microstep: 1882.66 | bwd_allreduce_microstep: 5.35 | step_microstep: 62.02
+[2024-06-18 23:33:44,572] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 4012.70 | bwd: 3973.07 | bwd_inner: 3962.85 | bwd_allreduce: 10.08 | step: 62.11
+ 97%|█████████▋| 676/700 [1:26:54<03:00,  7.53s/it]                                                   {'loss': 0.7607, 'learning_rate': 3.0794728913033366e-07, 'epoch': 6.76}
+ 97%|█████████▋| 676/700 [1:26:54<03:00,  7.53s/it][2024-06-18 23:33:48,470] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1970.46 | bwd_microstep: 1905.48 | bwd_inner_microstep: 1900.64 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 23:33:52,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:33:52,186] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1893.90 | bwd_microstep: 1741.64 | bwd_inner_microstep: 1735.88 | bwd_allreduce_microstep: 5.62 | step_microstep: 62.71
+[2024-06-18 23:33:52,187] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3864.33 | bwd: 3647.15 | bwd_inner: 3636.62 | bwd_allreduce: 10.34 | step: 62.79
+ 97%|█████████▋| 677/700 [1:27:01<02:53,  7.56s/it]                                                   {'loss': 0.247, 'learning_rate': 2.82843364410218e-07, 'epoch': 6.77}
+ 97%|█████████▋| 677/700 [1:27:01<02:53,  7.56s/it][2024-06-18 23:33:55,630] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1726.83 | bwd_microstep: 1694.39 | bwd_inner_microstep: 1689.33 | bwd_allreduce_microstep: 4.92 | step_microstep: 0.14
+[2024-06-18 23:33:59,680] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:33:59,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2000.54 | bwd_microstep: 1968.79 | bwd_inner_microstep: 1963.49 | bwd_allreduce_microstep: 5.23 | step_microstep: 62.38
+[2024-06-18 23:33:59,681] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3727.34 | bwd: 3663.21 | bwd_inner: 3652.89 | bwd_allreduce: 10.13 | step: 62.53
+ 97%|█████████▋| 678/700 [1:27:09<02:45,  7.54s/it]                                                   {'loss': 0.2779, 'learning_rate': 2.5880374371639594e-07, 'epoch': 6.78}
+ 97%|█████████▋| 678/700 [1:27:09<02:45,  7.54s/it][2024-06-18 23:34:03,587] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.15 | bwd_microstep: 1916.07 | bwd_inner_microstep: 1911.19 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 23:34:06,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:34:06,674] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1391.08 | bwd_microstep: 1617.34 | bwd_inner_microstep: 1611.97 | bwd_allreduce_microstep: 5.30 | step_microstep: 61.65
+[2024-06-18 23:34:06,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3359.20 | bwd: 3533.43 | bwd_inner: 3523.23 | bwd_allreduce: 10.03 | step: 61.73
+ 97%|█████████▋| 679/700 [1:27:16<02:34,  7.38s/it]                                                   {'loss': 0.3975, 'learning_rate': 2.3582894166930268e-07, 'epoch': 6.79}
+ 97%|█████████▋| 679/700 [1:27:16<02:34,  7.38s/it][2024-06-18 23:34:10,579] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1968.05 | bwd_microstep: 1914.48 | bwd_inner_microstep: 1909.59 | bwd_allreduce_microstep: 4.75 | step_microstep: 0.09
+[2024-06-18 23:34:13,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:34:13,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1496.80 | bwd_microstep: 1809.28 | bwd_inner_microstep: 1803.88 | bwd_allreduce_microstep: 5.32 | step_microstep: 63.11
+[2024-06-18 23:34:13,965] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3464.82 | bwd: 3723.77 | bwd_inner: 3713.53 | bwd_allreduce: 10.09 | step: 63.20
+ 97%|█████████▋| 680/700 [1:27:23<02:26,  7.35s/it]                                                   {'loss': 0.5484, 'learning_rate': 2.1391945009461844e-07, 'epoch': 6.8}
+ 97%|█████████▋| 680/700 [1:27:23<02:26,  7.35s/it][2024-06-18 23:34:17,702] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1909.00 | bwd_microstep: 1806.15 | bwd_inner_microstep: 1801.19 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.07
+[2024-06-18 23:34:21,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:34:21,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1686.62 | bwd_microstep: 1913.03 | bwd_inner_microstep: 1907.54 | bwd_allreduce_microstep: 5.40 | step_microstep: 61.59
+[2024-06-18 23:34:21,381] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3595.59 | bwd: 3719.20 | bwd_inner: 3708.80 | bwd_allreduce: 10.21 | step: 61.67
+ 97%|█████████▋| 681/700 [1:27:30<02:20,  7.37s/it]                                                   {'loss': 0.3157, 'learning_rate': 1.9307573801273236e-07, 'epoch': 6.81}
+ 97%|█████████▋| 681/700 [1:27:30<02:20,  7.37s/it][2024-06-18 23:34:25,306] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1975.65 | bwd_microstep: 1927.23 | bwd_inner_microstep: 1922.46 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 23:34:29,246] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:34:29,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.55 | bwd_microstep: 1896.06 | bwd_inner_microstep: 1890.60 | bwd_allreduce_microstep: 5.31 | step_microstep: 61.49
+[2024-06-18 23:34:29,247] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3940.17 | bwd: 3823.31 | bwd_inner: 3813.12 | bwd_allreduce: 10.00 | step: 61.56
+ 97%|█████████▋| 682/700 [1:27:38<02:15,  7.52s/it]                                                   {'loss': 0.2321, 'learning_rate': 1.7329825162870073e-07, 'epoch': 6.82}
+ 97%|█████████▋| 682/700 [1:27:38<02:15,  7.52s/it][2024-06-18 23:34:32,988] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1911.56 | bwd_microstep: 1807.56 | bwd_inner_microstep: 1802.62 | bwd_allreduce_microstep: 4.79 | step_microstep: 0.07
+[2024-06-18 23:34:37,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:34:37,041] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1999.05 | bwd_microstep: 1972.89 | bwd_inner_microstep: 1967.25 | bwd_allreduce_microstep: 5.49 | step_microstep: 62.53
+[2024-06-18 23:34:37,042] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3910.58 | bwd: 3780.47 | bwd_inner: 3769.98 | bwd_allreduce: 10.27 | step: 62.61
+ 98%|█████████▊| 683/700 [1:27:46<02:09,  7.60s/it]                                                   {'loss': 0.3094, 'learning_rate': 1.545874143226933e-07, 'epoch': 6.83}
+ 98%|█████████▊| 683/700 [1:27:46<02:09,  7.60s/it][2024-06-18 23:34:40,675] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1884.03 | bwd_microstep: 1727.73 | bwd_inner_microstep: 1722.82 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.08
+[2024-06-18 23:34:44,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:34:44,478] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1914.35 | bwd_microstep: 1808.10 | bwd_inner_microstep: 1802.72 | bwd_allreduce_microstep: 5.26 | step_microstep: 62.19
+[2024-06-18 23:34:44,479] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3798.35 | bwd: 3535.86 | bwd_inner: 3525.64 | bwd_allreduce: 10.02 | step: 62.27
+ 98%|█████████▊| 684/700 [1:27:53<02:00,  7.55s/it]                                                   {'loss': 0.0238, 'learning_rate': 1.3694362664094518e-07, 'epoch': 6.84}
+ 98%|█████████▊| 684/700 [1:27:53<02:00,  7.55s/it][2024-06-18 23:34:48,360] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1964.99 | bwd_microstep: 1894.90 | bwd_inner_microstep: 1890.00 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.08
+[2024-06-18 23:34:51,614] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:34:51,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1447.23 | bwd_microstep: 1728.16 | bwd_inner_microstep: 1722.84 | bwd_allreduce_microstep: 5.24 | step_microstep: 62.07
+[2024-06-18 23:34:51,615] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3412.19 | bwd: 3623.08 | bwd_inner: 3612.91 | bwd_allreduce: 10.01 | step: 62.16
+ 98%|█████████▊| 685/700 [1:28:01<01:51,  7.43s/it]                                                   {'loss': 0.302, 'learning_rate': 1.2036726628715245e-07, 'epoch': 6.85}
+ 98%|█████████▊| 685/700 [1:28:01<01:51,  7.43s/it][2024-06-18 23:34:55,488] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.91 | bwd_microstep: 1888.69 | bwd_inner_microstep: 1883.73 | bwd_allreduce_microstep: 4.82 | step_microstep: 0.07
+[2024-06-18 23:34:59,463] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:34:59,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1972.06 | bwd_microstep: 1923.29 | bwd_inner_microstep: 1917.94 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.87
+[2024-06-18 23:34:59,464] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3933.94 | bwd: 3812.01 | bwd_inner: 3801.74 | bwd_allreduce: 10.07 | step: 61.95
+ 98%|█████████▊| 686/700 [1:28:08<01:45,  7.55s/it]                                                   {'loss': 0.3072, 'learning_rate': 1.0485868811441757e-07, 'epoch': 6.86}
+ 98%|█████████▊| 686/700 [1:28:08<01:45,  7.55s/it][2024-06-18 23:35:03,444] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1992.90 | bwd_microstep: 1965.61 | bwd_inner_microstep: 1960.55 | bwd_allreduce_microstep: 4.99 | step_microstep: 0.14
+[2024-06-18 23:35:07,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:35:07,434] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.97 | bwd_microstep: 1928.45 | bwd_inner_microstep: 1923.09 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.91
+[2024-06-18 23:35:07,435] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3972.84 | bwd: 3894.08 | bwd_inner: 3883.65 | bwd_allreduce: 10.31 | step: 62.06
+ 98%|█████████▊| 687/700 [1:28:16<01:39,  7.68s/it]                                                   {'loss': 0.336, 'learning_rate': 9.041822411763324e-08, 'epoch': 6.87}
+ 98%|█████████▊| 687/700 [1:28:16<01:39,  7.68s/it][2024-06-18 23:35:11,318] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.72 | bwd_microstep: 1898.00 | bwd_inner_microstep: 1893.14 | bwd_allreduce_microstep: 4.74 | step_microstep: 0.07
+[2024-06-18 23:35:15,261] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:35:15,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.34 | bwd_microstep: 1895.33 | bwd_inner_microstep: 1889.86 | bwd_allreduce_microstep: 5.39 | step_microstep: 62.59
+[2024-06-18 23:35:15,262] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3931.03 | bwd: 3793.35 | bwd_inner: 3783.07 | bwd_allreduce: 10.14 | step: 62.67
+ 98%|█████████▊| 688/700 [1:28:24<01:32,  7.72s/it]                                                   {'loss': 0.2623, 'learning_rate': 7.704618342638802e-08, 'epoch': 6.88}
+ 98%|█████████▊| 688/700 [1:28:24<01:32,  7.72s/it][2024-06-18 23:35:19,137] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.01 | bwd_microstep: 1893.17 | bwd_inner_microstep: 1888.45 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 23:35:22,694] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.85
+[2024-06-18 23:35:22,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1650.58 | bwd_microstep: 1827.28 | bwd_inner_microstep: 1821.92 | bwd_allreduce_microstep: 5.28 | step_microstep: 61.89
+[2024-06-18 23:35:22,695] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3611.57 | bwd: 3720.47 | bwd_inner: 3710.39 | bwd_allreduce: 9.98 | step: 61.97
+ 98%|█████████▊| 689/700 [1:28:32<01:23,  7.64s/it]                                                   {'loss': 0.201, 'learning_rate': 6.474285229833843e-08, 'epoch': 6.89}
+ 98%|█████████▊| 689/700 [1:28:32<01:23,  7.64s/it][2024-06-18 23:35:26,599] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1965.78 | bwd_microstep: 1916.21 | bwd_inner_microstep: 1911.19 | bwd_allreduce_microstep: 4.94 | step_microstep: 0.07
+[2024-06-18 23:35:30,153] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:35:30,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1646.84 | bwd_microstep: 1829.37 | bwd_inner_microstep: 1823.96 | bwd_allreduce_microstep: 5.27 | step_microstep: 61.44
+[2024-06-18 23:35:30,154] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3612.59 | bwd: 3745.61 | bwd_inner: 3735.22 | bwd_allreduce: 10.19 | step: 61.52
+ 99%|█████████▊| 690/700 [1:28:39<01:15,  7.58s/it]                                                   {'loss': 0.2409, 'learning_rate': 5.350849411307479e-08, 'epoch': 6.9}
+ 99%|█████████▊| 690/700 [1:28:39<01:15,  7.58s/it][2024-06-18 23:35:34,056] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1967.39 | bwd_microstep: 1912.82 | bwd_inner_microstep: 1908.00 | bwd_allreduce_microstep: 4.70 | step_microstep: 0.07
+[2024-06-18 23:35:38,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.86
+[2024-06-18 23:35:38,038] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1976.60 | bwd_microstep: 1925.22 | bwd_inner_microstep: 1919.89 | bwd_allreduce_microstep: 5.26 | step_microstep: 62.03
+[2024-06-18 23:35:38,039] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3943.96 | bwd: 3838.06 | bwd_inner: 3827.95 | bwd_allreduce: 9.97 | step: 62.11
+ 99%|█████████▊| 691/700 [1:28:47<01:09,  7.67s/it]                                                   {'loss': 0.3016, 'learning_rate': 4.334334936652029e-08, 'epoch': 6.91}
+ 99%|█████████▊| 691/700 [1:28:47<01:09,  7.67s/it][2024-06-18 23:35:41,697] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1894.22 | bwd_microstep: 1742.50 | bwd_inner_microstep: 1737.46 | bwd_allreduce_microstep: 4.95 | step_microstep: 0.09
+[2024-06-18 23:35:45,690] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.87
+[2024-06-18 23:35:45,691] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1978.12 | bwd_microstep: 1934.62 | bwd_inner_microstep: 1929.22 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.92
+[2024-06-18 23:35:45,691] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3872.31 | bwd: 3677.14 | bwd_inner: 3666.72 | bwd_allreduce: 10.27 | step: 62.02
+ 99%|█████████▉| 692/700 [1:28:55<01:01,  7.67s/it]                                                   {'loss': 0.2335, 'learning_rate': 3.424763566572398e-08, 'epoch': 6.92}
+ 99%|█████████▉| 692/700 [1:28:55<01:01,  7.67s/it][2024-06-18 23:35:49,347] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1895.29 | bwd_microstep: 1739.12 | bwd_inner_microstep: 1734.39 | bwd_allreduce_microstep: 4.68 | step_microstep: 0.07
+[2024-06-18 23:35:53,413] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.90
+[2024-06-18 23:35:53,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2007.57 | bwd_microstep: 1979.05 | bwd_inner_microstep: 1973.60 | bwd_allreduce_microstep: 5.37 | step_microstep: 62.12
+[2024-06-18 23:35:53,414] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3902.83 | bwd: 3718.20 | bwd_inner: 3708.02 | bwd_allreduce: 10.07 | step: 62.20
+ 99%|█████████▉| 693/700 [1:29:02<00:53,  7.68s/it]                                                   {'loss': 0.2046, 'learning_rate': 2.6221547724253337e-08, 'epoch': 6.93}
+ 99%|█████████▉| 693/700 [1:29:02<00:53,  7.68s/it][2024-06-18 23:35:57,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 2004.46 | bwd_microstep: 1970.44 | bwd_inner_microstep: 1965.61 | bwd_allreduce_microstep: 4.76 | step_microstep: 0.07
+[2024-06-18 23:36:01,379] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:36:01,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1979.31 | bwd_microstep: 1909.12 | bwd_inner_microstep: 1903.71 | bwd_allreduce_microstep: 5.33 | step_microstep: 61.63
+[2024-06-18 23:36:01,380] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3983.74 | bwd: 3879.58 | bwd_inner: 3869.36 | bwd_allreduce: 10.09 | step: 61.71
+ 99%|█████████▉| 694/700 [1:29:10<00:46,  7.77s/it]                                                   {'loss': 0.3476, 'learning_rate': 1.9265257358008772e-08, 'epoch': 6.94}
+ 99%|█████████▉| 694/700 [1:29:10<00:46,  7.77s/it][2024-06-18 23:36:04,700] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1494.31 | bwd_microstep: 1804.66 | bwd_inner_microstep: 1799.88 | bwd_allreduce_microstep: 4.71 | step_microstep: 0.07
+[2024-06-18 23:36:08,411] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 23:36:08,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1891.53 | bwd_microstep: 1739.86 | bwd_inner_microstep: 1734.37 | bwd_allreduce_microstep: 5.44 | step_microstep: 62.57
+[2024-06-18 23:36:08,412] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3385.81 | bwd: 3544.54 | bwd_inner: 3534.26 | bwd_allreduce: 10.17 | step: 62.65
+ 99%|█████████▉| 695/700 [1:29:17<00:37,  7.55s/it]                                                   {'loss': 0.0757, 'learning_rate': 1.3378913481526533e-08, 'epoch': 6.95}
+ 99%|█████████▉| 695/700 [1:29:17<00:37,  7.55s/it][2024-06-18 23:36:12,286] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1963.08 | bwd_microstep: 1889.41 | bwd_inner_microstep: 1884.54 | bwd_allreduce_microstep: 4.73 | step_microstep: 0.07
+[2024-06-18 23:36:16,216] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.83
+[2024-06-18 23:36:16,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1961.84 | bwd_microstep: 1889.05 | bwd_inner_microstep: 1883.81 | bwd_allreduce_microstep: 5.19 | step_microstep: 61.75
+[2024-06-18 23:36:16,217] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3924.88 | bwd: 3778.49 | bwd_inner: 3768.41 | bwd_allreduce: 9.94 | step: 61.83
+ 99%|█████████▉| 696/700 [1:29:25<00:30,  7.62s/it]                                                   {'loss': 0.5525, 'learning_rate': 8.562642104831265e-09, 'epoch': 6.96}
+ 99%|█████████▉| 696/700 [1:29:25<00:30,  7.62s/it][2024-06-18 23:36:19,911] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1807.79 | bwd_microstep: 1865.38 | bwd_inner_microstep: 1860.36 | bwd_allreduce_microstep: 4.88 | step_microstep: 0.08
+[2024-06-18 23:36:23,898] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.84
+[2024-06-18 23:36:23,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1980.87 | bwd_microstep: 1926.14 | bwd_inner_microstep: 1920.67 | bwd_allreduce_microstep: 5.32 | step_microstep: 61.67
+[2024-06-18 23:36:23,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3788.63 | bwd: 3791.54 | bwd_inner: 3781.13 | bwd_allreduce: 10.18 | step: 61.76
+100%|█████████▉| 697/700 [1:29:33<00:22,  7.64s/it]                                                   {'loss': 0.3606, 'learning_rate': 4.816546330688176e-09, 'epoch': 6.97}
+100%|█████████▉| 697/700 [1:29:33<00:22,  7.64s/it][2024-06-18 23:36:27,883] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1997.71 | bwd_microstep: 1964.86 | bwd_inner_microstep: 1959.85 | bwd_allreduce_microstep: 4.86 | step_microstep: 0.07
+[2024-06-18 23:36:31,601] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.82
+[2024-06-18 23:36:31,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1895.97 | bwd_microstep: 1742.97 | bwd_inner_microstep: 1737.51 | bwd_allreduce_microstep: 5.33 | step_microstep: 61.70
+[2024-06-18 23:36:31,602] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3893.65 | bwd: 3707.86 | bwd_inner: 3697.46 | bwd_allreduce: 10.15 | step: 61.79
+100%|█████████▉| 698/700 [1:29:41<00:15,  7.66s/it]                                                   {'loss': 0.3668, 'learning_rate': 2.140706352443678e-09, 'epoch': 6.98}
+100%|█████████▉| 698/700 [1:29:41<00:15,  7.66s/it][2024-06-18 23:36:35,176] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1859.01 | bwd_microstep: 1693.98 | bwd_inner_microstep: 1689.11 | bwd_allreduce_microstep: 4.78 | step_microstep: 0.08
+[2024-06-18 23:36:39,173] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.89
+[2024-06-18 23:36:39,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1982.53 | bwd_microstep: 1933.96 | bwd_inner_microstep: 1928.14 | bwd_allreduce_microstep: 5.68 | step_microstep: 63.15
+[2024-06-18 23:36:39,174] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3841.50 | bwd: 3627.97 | bwd_inner: 3617.34 | bwd_allreduce: 10.44 | step: 63.24
+100%|█████████▉| 699/700 [1:29:48<00:07,  7.63s/it]                                                   {'loss': 0.1318, 'learning_rate': 5.351794522823195e-10, 'epoch': 6.99}
+100%|█████████▉| 699/700 [1:29:48<00:07,  7.63s/it][2024-06-18 23:36:43,151] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1995.68 | bwd_microstep: 1958.77 | bwd_inner_microstep: 1954.00 | bwd_allreduce_microstep: 4.72 | step_microstep: 0.07
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+please install petrel_client
+Replace train sampler!!
+petrel_client is not installed. Using PIL to load images.
+[2024-06-18 23:36:47,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | optimizer_step: 1.88
+[2024-06-18 23:36:47,899] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd_microstep: 1971.76 | bwd_microstep: 1934.27 | bwd_inner_microstep: 1928.62 | bwd_allreduce_microstep: 5.56 | step_microstep: 63.92
+[2024-06-18 23:36:47,900] [INFO] [logging.py:96:log_dist] [Rank 0] time (ms) | fwd: 3967.37 | bwd: 3893.06 | bwd_inner: 3882.64 | bwd_allreduce: 10.29 | step: 64.00
+100%|██████████| 700/700 [1:29:57<00:00,  7.96s/it]                                                   {'loss': 0.3211, 'learning_rate': 0.0, 'epoch': 7.0}
+100%|██████████| 700/700 [1:29:57<00:00,  7.96s/it][INFO|trainer.py:1962] 2024-06-18 23:36:47,912 >> 
+
+Training completed. Do not forget to share your model on huggingface.co/models =)
+
+
+                                                   {'train_runtime': 5397.3976, 'train_samples_per_second': 0.519, 'train_steps_per_second': 0.13, 'train_loss': 0.5665054323007852, 'epoch': 7.0}
+100%|██████████| 700/700 [1:29:57<00:00,  7.96s/it]100%|██████████| 700/700 [1:29:57<00:00,  7.71s/it]
+[INFO|trainer.py:2936] 2024-06-18 23:37:16,312 >> Saving model checkpoint to ckpts/baseline3_7_epochs/
+[INFO|configuration_utils.py:473] 2024-06-18 23:37:16,317 >> Configuration saved in ckpts/baseline3_7_epochs/config.json
+[INFO|configuration_utils.py:594] 2024-06-18 23:37:16,318 >> Configuration saved in ckpts/baseline3_7_epochs/generation_config.json
+[INFO|modeling_utils.py:2501] 2024-06-18 23:37:56,091 >> The model is bigger than the maximum size per checkpoint (5GB) and is going to be split in 11 checkpoint shards. You can find where each parameters has been saved in the index located at ckpts/baseline3_7_epochs/model.safetensors.index.json.
+[INFO|tokenization_utils_base.py:2433] 2024-06-18 23:37:56,094 >> tokenizer config file saved in ckpts/baseline3_7_epochs/tokenizer_config.json
+[INFO|tokenization_utils_base.py:2442] 2024-06-18 23:37:56,094 >> Special tokens file saved in ckpts/baseline3_7_epochs/special_tokens_map.json
+[INFO|tokenization_utils_base.py:2493] 2024-06-18 23:37:56,095 >> added tokens file saved in ckpts/baseline3_7_epochs/added_tokens.json
+***** train metrics *****
+  epoch                    =        7.0
+  train_loss               =     0.5665
+  train_runtime            = 1:29:57.39
+  train_samples            =        400
+  train_samples_per_second =      0.519
+  train_steps_per_second   =       0.13