[2022-12-19 11:14:18,662] [WARNING] [runner.py:179:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. [2022-12-19 11:14:18,671] [INFO] [runner.py:508:main] cmd = /usr/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=29500 run_speech_recognition_seq2seq_streaming.py --deepspeed=ds_config.json --model_name_or_path=openai/whisper-medium --dataset_name=mozilla-foundation/common_voice_11_0 --dataset_config_name=sk --language=slovak --train_split_name=train+validation --eval_split_name=test --model_index_name=Whisper Medium Slovak CV11 --max_steps=5000 --output_dir=./ --per_device_train_batch_size=64 --per_device_eval_batch_size=32 --logging_steps=25 --learning_rate=1e-5 --warmup_steps=500 --evaluation_strategy=steps --eval_steps=1000 --save_strategy=steps --save_steps=1000 --generation_max_length=225 --length_column_name=input_length --max_duration_in_seconds=30 --text_column_name=sentence --freeze_feature_encoder=False --report_to=tensorboard --metric_for_best_model=wer --greater_is_better=False --load_best_model_at_end --gradient_checkpointing --fp16 --overwrite_output_dir --do_train --do_eval --predict_with_generate --do_normalize_eval --streaming=False --use_auth_token --push_to_hub [2022-12-19 11:14:21,921] [INFO] [launch.py:135:main] 0 NV_LIBNCCL_DEV_PACKAGE=libnccl-dev=2.13.4-1+cuda11.7 [2022-12-19 11:14:21,922] [INFO] [launch.py:135:main] 0 NV_LIBNCCL_DEV_PACKAGE_VERSION=2.13.4-1 [2022-12-19 11:14:21,922] [INFO] [launch.py:135:main] 0 NCCL_VERSION=2.13.4-1 [2022-12-19 11:14:21,922] [INFO] [launch.py:135:main] 0 NV_LIBNCCL_DEV_PACKAGE_NAME=libnccl-dev [2022-12-19 11:14:21,922] [INFO] [launch.py:135:main] 0 NV_LIBNCCL_PACKAGE=libnccl2=2.13.4-1+cuda11.7 [2022-12-19 11:14:21,922] [INFO] [launch.py:135:main] 0 NV_LIBNCCL_PACKAGE_NAME=libnccl2 [2022-12-19 11:14:21,922] [INFO] [launch.py:135:main] 0 NV_LIBNCCL_PACKAGE_VERSION=2.13.4-1 [2022-12-19 11:14:21,922] [INFO] [launch.py:142:main] WORLD INFO DICT: {'localhost': [0]} [2022-12-19 11:14:21,922] [INFO] [launch.py:148:main] nnodes=1, num_local_procs=1, node_rank=0 [2022-12-19 11:14:21,922] [INFO] [launch.py:161:main] global_rank_mapping=defaultdict(, {'localhost': [0]}) [2022-12-19 11:14:21,922] [INFO] [launch.py:162:main] dist_world_size=1 [2022-12-19 11:14:21,922] [INFO] [launch.py:164:main] Setting CUDA_VISIBLE_DEVICES=0 [2022-12-19 11:14:29,454] [INFO] [comm.py:654:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl 12/19/2022 11:14:29 - WARNING - __main__ - Process rank: 0, device: cuda:0, n_gpu: 1distributed training: True, 16-bits training: True 12/19/2022 11:14:29 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments( _n_gpu=1, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, bf16=False, bf16_full_eval=False, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=0, dataloader_pin_memory=True, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=ds_config.json, disable_tqdm=False, do_eval=True, do_predict=False, do_train=True, eval_accumulation_steps=None, eval_delay=0, eval_steps=1000, evaluation_strategy=steps, fp16=True, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, fsdp=[], fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, generation_max_length=225, generation_num_beams=None, gradient_accumulation_steps=1, gradient_checkpointing=True, greater_is_better=False, group_by_length=False, half_precision_backend=auto, hub_model_id=None, hub_private_repo=False, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_inputs_for_metrics=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=1e-05, length_column_name=input_length, load_best_model_at_end=True, local_rank=0, log_level=passive, log_level_replica=passive, log_on_each_node=True, logging_dir=./runs/Dec19_11-14-29_fe2747a042f0, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=25, logging_strategy=steps, lr_scheduler_type=linear, max_grad_norm=1.0, max_steps=5000, metric_for_best_model=wer, mp_parameters=, no_cuda=False, num_train_epochs=3.0, optim=adamw_hf, optim_args=None, output_dir=./, overwrite_output_dir=True, past_index=-1, per_device_eval_batch_size=32, per_device_train_batch_size=64, predict_with_generate=True, prediction_loss_only=False, push_to_hub=True, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, ray_scope=last, remove_unused_columns=True, report_to=['tensorboard'], resume_from_checkpoint=None, run_name=./, save_on_each_node=False, save_steps=1000, save_strategy=steps, save_total_limit=None, seed=42, sharded_ddp=[], skip_memory_metrics=True, sortish_sampler=False, tf32=None, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_ipex=False, use_legacy_prediction_loop=False, use_mps_device=False, warmup_ratio=0.0, warmup_steps=500, weight_decay=0.0, xpu_backend=None, ) 12/19/2022 11:14:29 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments( _n_gpu=1, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, auto_find_batch_size=False, bf16=False, bf16_full_eval=False, data_seed=None, dataloader_drop_last=False, dataloader_num_workers=0, dataloader_pin_memory=True, ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[], deepspeed=ds_config.json, disable_tqdm=False, do_eval=True, do_predict=False, do_train=True, eval_accumulation_steps=None, eval_delay=0, eval_steps=1000, evaluation_strategy=steps, fp16=True, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, fsdp=[], fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False, generation_max_length=225, generation_num_beams=None, gradient_accumulation_steps=1, gradient_checkpointing=True, greater_is_better=False, group_by_length=False, half_precision_backend=auto, hub_model_id=None, hub_private_repo=False, hub_strategy=every_save, hub_token=, ignore_data_skip=False, include_inputs_for_metrics=False, jit_mode_eval=False, label_names=None, label_smoothing_factor=0.0, learning_rate=1e-05, length_column_name=input_length, load_best_model_at_end=True, local_rank=0, log_level=passive, log_level_replica=passive, log_on_each_node=True, logging_dir=./runs/Dec19_11-14-29_fe2747a042f0, logging_first_step=False, logging_nan_inf_filter=True, logging_steps=25, logging_strategy=steps, lr_scheduler_type=linear, max_grad_norm=1.0, max_steps=5000, metric_for_best_model=wer, mp_parameters=, no_cuda=False, num_train_epochs=3.0, optim=adamw_hf, optim_args=None, output_dir=./, overwrite_output_dir=True, past_index=-1, per_device_eval_batch_size=32, per_device_train_batch_size=64, predict_with_generate=True, prediction_loss_only=False, push_to_hub=True, push_to_hub_model_id=None, push_to_hub_organization=None, push_to_hub_token=, ray_scope=last, remove_unused_columns=True, report_to=['tensorboard'], resume_from_checkpoint=None, run_name=./, save_on_each_node=False, save_steps=1000, save_strategy=steps, save_total_limit=None, seed=42, sharded_ddp=[], skip_memory_metrics=True, sortish_sampler=False, tf32=None, torch_compile=False, torch_compile_backend=None, torch_compile_mode=None, torchdynamo=None, tpu_metrics_debug=False, tpu_num_cores=None, use_ipex=False, use_legacy_prediction_loop=False, use_mps_device=False, warmup_ratio=0.0, warmup_steps=500, weight_decay=0.0, xpu_backend=None, ) 12/19/2022 11:14:33 - INFO - datasets.info - Loading Dataset Infos from /root/.cache/huggingface/modules/datasets_modules/datasets/mozilla-foundation--common_voice_11_0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f 12/19/2022 11:14:33 - INFO - datasets.builder - Generating dataset common_voice_11_0 (/root/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/sk/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f) Downloading and preparing dataset common_voice_11_0/sk to /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/sk/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f... 12/19/2022 11:14:33 - INFO - datasets.builder - Dataset not on Hf google storage. Downloading and preparing it from source 12/19/2022 11:14:33 - INFO - datasets.download.download_manager - Downloading took 0.0 min 12/19/2022 11:14:33 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min 12/19/2022 11:14:34 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/sk/train/sk_train_0.tar not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/tmpfcxe8xjf 12/19/2022 11:14:37 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/sk/train/sk_train_0.tar in cache at /root/.cache/huggingface/datasets/downloads/dc60bece3babd1ec34c0d46bc16ae66faf1b9cbcbfb15aec62d81e4be6fbf6bc 12/19/2022 11:14:37 - INFO - datasets.utils.file_utils - creating metadata file for /root/.cache/huggingface/datasets/downloads/dc60bece3babd1ec34c0d46bc16ae66faf1b9cbcbfb15aec62d81e4be6fbf6bc 12/19/2022 11:14:38 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/sk/dev/sk_dev_0.tar not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/tmpzhl6hdfs 12/19/2022 11:14:41 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/sk/dev/sk_dev_0.tar in cache at /root/.cache/huggingface/datasets/downloads/5f1420f0120f31ad1f506ec49fb795f4cd827b99fb4eafebac44ff0f63caaed2 12/19/2022 11:14:41 - INFO - datasets.utils.file_utils - creating metadata file for /root/.cache/huggingface/datasets/downloads/5f1420f0120f31ad1f506ec49fb795f4cd827b99fb4eafebac44ff0f63caaed2 12/19/2022 11:14:42 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/sk/test/sk_test_0.tar not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/tmpniv8gs8j 12/19/2022 11:14:45 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/sk/test/sk_test_0.tar in cache at /root/.cache/huggingface/datasets/downloads/0a53aff953dd59d0697ffaa461246f461d99f9fd7c9ce4ff851f6a78c6893565 12/19/2022 11:14:45 - INFO - datasets.utils.file_utils - creating metadata file for /root/.cache/huggingface/datasets/downloads/0a53aff953dd59d0697ffaa461246f461d99f9fd7c9ce4ff851f6a78c6893565 12/19/2022 11:14:45 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/sk/other/sk_other_0.tar not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/tmpxrkfsvrp 12/19/2022 11:14:47 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/sk/other/sk_other_0.tar in cache at /root/.cache/huggingface/datasets/downloads/00bc49d96ccd766e8321fb91ac512d72c63efe96cc1b1f11ff32365bc96044a3 12/19/2022 11:14:47 - INFO - datasets.utils.file_utils - creating metadata file for /root/.cache/huggingface/datasets/downloads/00bc49d96ccd766e8321fb91ac512d72c63efe96cc1b1f11ff32365bc96044a3 12/19/2022 11:14:48 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/sk/invalidated/sk_invalidated_0.tar not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/tmp6pi3yjxe 12/19/2022 11:14:50 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/audio/sk/invalidated/sk_invalidated_0.tar in cache at /root/.cache/huggingface/datasets/downloads/5b62d26cd68c7d9f66ad5c933b865db14e3db1aaaaf89e4968618835b5fed105 12/19/2022 11:14:50 - INFO - datasets.utils.file_utils - creating metadata file for /root/.cache/huggingface/datasets/downloads/5b62d26cd68c7d9f66ad5c933b865db14e3db1aaaaf89e4968618835b5fed105 12/19/2022 11:14:50 - INFO - datasets.download.download_manager - Downloading took 0.0 min 12/19/2022 11:14:51 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min 12/19/2022 11:14:51 - INFO - datasets.utils.py_utils - Spawning 5 processes for 5 objects in slices of [1, 1, 1, 1, 1] 12/19/2022 11:14:52 - INFO - datasets.utils.py_utils - Finished 5 processes 12/19/2022 11:14:52 - INFO - datasets.utils.py_utils - Unpacked 5 objects 12/19/2022 11:14:53 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/sk/train.tsv not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/tmpbsn_3w9z 12/19/2022 11:14:54 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/sk/train.tsv in cache at /root/.cache/huggingface/datasets/downloads/84e9d6c3445faa56fe7f4baf001b75e1e9f81ffc486f049ffabe6202138831e2 12/19/2022 11:14:54 - INFO - datasets.utils.file_utils - creating metadata file for /root/.cache/huggingface/datasets/downloads/84e9d6c3445faa56fe7f4baf001b75e1e9f81ffc486f049ffabe6202138831e2 12/19/2022 11:14:55 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/sk/dev.tsv not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/tmpfyyx15dx 12/19/2022 11:14:56 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/sk/dev.tsv in cache at /root/.cache/huggingface/datasets/downloads/269bfb57a2f9de09c6113bcc84a9b36d8df7c58611682c76c31d5314c7f8f689 12/19/2022 11:14:56 - INFO - datasets.utils.file_utils - creating metadata file for /root/.cache/huggingface/datasets/downloads/269bfb57a2f9de09c6113bcc84a9b36d8df7c58611682c76c31d5314c7f8f689 12/19/2022 11:14:57 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/sk/test.tsv not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/tmp727ax739 12/19/2022 11:14:58 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/sk/test.tsv in cache at /root/.cache/huggingface/datasets/downloads/35cee125ac2f288e3813dfc999980ba2cfeffe6ac77770e4a2e9c95c72a15392 12/19/2022 11:14:58 - INFO - datasets.utils.file_utils - creating metadata file for /root/.cache/huggingface/datasets/downloads/35cee125ac2f288e3813dfc999980ba2cfeffe6ac77770e4a2e9c95c72a15392 12/19/2022 11:14:59 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/sk/other.tsv not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/tmpf04skevi 12/19/2022 11:15:00 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/sk/other.tsv in cache at /root/.cache/huggingface/datasets/downloads/bda4e520dc25d592f67db8bc3080e6a86eaf9fafccfead8c9e1e43c244420aee 12/19/2022 11:15:00 - INFO - datasets.utils.file_utils - creating metadata file for /root/.cache/huggingface/datasets/downloads/bda4e520dc25d592f67db8bc3080e6a86eaf9fafccfead8c9e1e43c244420aee 12/19/2022 11:15:01 - INFO - datasets.utils.file_utils - https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/sk/invalidated.tsv not found in cache or force_download set to True, downloading to /root/.cache/huggingface/datasets/downloads/tmpyvkivypo 12/19/2022 11:15:02 - INFO - datasets.utils.file_utils - storing https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/resolve/streaming/transcript/sk/invalidated.tsv in cache at /root/.cache/huggingface/datasets/downloads/cdb07f76580d10a78d2bc3374ef15f6c6dc93470f67c36007db980722d6606a8 12/19/2022 11:15:02 - INFO - datasets.utils.file_utils - creating metadata file for /root/.cache/huggingface/datasets/downloads/cdb07f76580d10a78d2bc3374ef15f6c6dc93470f67c36007db980722d6606a8 12/19/2022 11:15:02 - INFO - datasets.download.download_manager - Downloading took 0.0 min 12/19/2022 11:15:02 - INFO - datasets.download.download_manager - Checksum Computation took 0.0 min 12/19/2022 11:15:02 - INFO - datasets.utils.py_utils - Spawning 5 processes for 5 objects in slices of [1, 1, 1, 1, 1] 12/19/2022 11:15:03 - INFO - datasets.utils.py_utils - Finished 5 processes 12/19/2022 11:15:03 - INFO - datasets.utils.py_utils - Unpacked 5 objects 12/19/2022 11:15:03 - INFO - datasets.utils.info_utils - Unable to verify checksums. 12/19/2022 11:15:03 - INFO - datasets.builder - Generating train split 12/19/2022 11:15:04 - INFO - datasets.builder - Generating validation split 12/19/2022 11:15:05 - INFO - datasets.builder - Generating test split 12/19/2022 11:15:06 - INFO - datasets.builder - Generating other split 12/19/2022 11:15:06 - INFO - datasets.builder - Generating invalidated split 12/19/2022 11:15:07 - INFO - datasets.utils.info_utils - Unable to verify splits sizes. Dataset common_voice_11_0 downloaded and prepared to /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/sk/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f. Subsequent calls will reuse this data. 12/19/2022 11:15:10 - INFO - datasets.info - Loading Dataset Infos from /root/.cache/huggingface/modules/datasets_modules/datasets/mozilla-foundation--common_voice_11_0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f 12/19/2022 11:15:10 - INFO - datasets.builder - Overwrite dataset info from restored data version. 12/19/2022 11:15:10 - INFO - datasets.info - Loading Dataset info from /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/sk/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f 12/19/2022 11:15:10 - WARNING - datasets.builder - Found cached dataset common_voice_11_0 (/root/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/sk/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f) 12/19/2022 11:15:10 - INFO - datasets.info - Loading Dataset info from /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/sk/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f 12/19/2022 11:15:13 - INFO - datasets.info - Loading Dataset Infos from /root/.cache/huggingface/modules/datasets_modules/datasets/mozilla-foundation--common_voice_11_0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f 12/19/2022 11:15:13 - INFO - datasets.builder - Overwrite dataset info from restored data version. 12/19/2022 11:15:13 - INFO - datasets.info - Loading Dataset info from /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/sk/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f 12/19/2022 11:15:13 - WARNING - datasets.builder - Found cached dataset common_voice_11_0 (/root/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/sk/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f) 12/19/2022 11:15:13 - INFO - datasets.info - Loading Dataset info from /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/sk/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f 12/19/2022 11:15:29 - INFO - datasets.arrow_dataset - Caching processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/sk/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f/cache-1198445f23fbce84.arrow 12/19/2022 11:32:01 - INFO - datasets.arrow_dataset - Caching processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/sk/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f/cache-0b88e3fb4a4fb48f.arrow 12/19/2022 11:43:14 - INFO - datasets.arrow_dataset - Caching processed dataset at /root/.cache/huggingface/datasets/mozilla-foundation___common_voice_11_0/sk/11.0.0/f8e47235d9b4e68fa24ed71d63266a02018ccf7194b2a8c9c598a5f3ab304d9f/cache-843405339f5029e5.arrow 12/19/2022 11:53:27 - WARNING - huggingface_hub.repository - /usr/src/app/models/whisper-medium-sk-cv11/./ is already a clone of https://huggingface.co/mikr/whisper-medium-sk-cv11. Make sure you pull the latest changes with `repo.git_pull()`. [2022-12-19 11:53:32,051] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.8.0+a25c31b6, git-hash=a25c31b6, git-branch=master [2022-12-19 11:53:33,693] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False Adam Optimizer #0 is created with AVX2 arithmetic capability. Config: alpha=0.000010, betas=(0.900000, 0.999000), weight_decay=0.000000, adam_w=1 [2022-12-19 11:53:36,545] [INFO] [logging.py:68:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer [2022-12-19 11:53:36,707] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam [2022-12-19 11:53:36,707] [INFO] [utils.py:52:is_zero_supported_optimizer] Checking ZeRO support for optimizer=DeepSpeedCPUAdam type= [2022-12-19 11:53:36,708] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer [2022-12-19 11:53:36,708] [INFO] [stage_1_and_2.py:141:__init__] Reduce bucket size 200000000 [2022-12-19 11:53:36,708] [INFO] [stage_1_and_2.py:142:__init__] Allgather bucket size 200000000 [2022-12-19 11:53:36,708] [INFO] [stage_1_and_2.py:143:__init__] CPU Offload: True [2022-12-19 11:53:36,708] [INFO] [stage_1_and_2.py:144:__init__] Round robin gradient partitioning: False Rank: 0 partition count [1] and sizes[(763857920, False)] [2022-12-19 11:53:40,277] [INFO] [utils.py:831:see_memory_usage] Before initializing optimizer states [2022-12-19 11:53:40,278] [INFO] [utils.py:832:see_memory_usage] MA 1.52 GB Max_MA 1.52 GB CA 1.53 GB Max_CA 2 GB [2022-12-19 11:53:40,278] [INFO] [utils.py:840:see_memory_usage] CPU Virtual Memory: used = 191.59 GB, percent = 38.0% [2022-12-19 11:53:43,514] [INFO] [utils.py:831:see_memory_usage] After initializing optimizer states [2022-12-19 11:53:43,515] [INFO] [utils.py:832:see_memory_usage] MA 1.52 GB Max_MA 1.52 GB CA 1.53 GB Max_CA 2 GB [2022-12-19 11:53:43,515] [INFO] [utils.py:840:see_memory_usage] CPU Virtual Memory: used = 200.63 GB, percent = 39.8% [2022-12-19 11:53:43,515] [INFO] [stage_1_and_2.py:527:__init__] optimizer state initialized [2022-12-19 11:53:43,579] [INFO] [utils.py:831:see_memory_usage] After initializing ZeRO optimizer [2022-12-19 11:53:43,580] [INFO] [utils.py:832:see_memory_usage] MA 1.52 GB Max_MA 1.52 GB CA 1.53 GB Max_CA 2 GB [2022-12-19 11:53:43,580] [INFO] [utils.py:840:see_memory_usage] CPU Virtual Memory: used = 200.59 GB, percent = 39.8% [2022-12-19 11:53:43,615] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = adamw [2022-12-19 11:53:43,616] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using configured LR scheduler = WarmupDecayLR [2022-12-19 11:53:43,616] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = [2022-12-19 11:53:43,616] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[1e-05], mom=[[0.9, 0.999]] [2022-12-19 11:53:43,618] [INFO] [config.py:1008:print] DeepSpeedEngine configuration: [2022-12-19 11:53:43,618] [INFO] [config.py:1012:print] activation_checkpointing_config { "partition_activations": false, "contiguous_memory_optimization": false, "cpu_checkpointing": false, "number_checkpoints": null, "synchronize_checkpoint_boundary": false, "profile": false } [2022-12-19 11:53:43,618] [INFO] [config.py:1012:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} [2022-12-19 11:53:43,618] [INFO] [config.py:1012:print] amp_enabled .................. False [2022-12-19 11:53:43,618] [INFO] [config.py:1012:print] amp_params ................... False [2022-12-19 11:53:43,619] [INFO] [config.py:1012:print] autotuning_config ............ { "enabled": false, "start_step": null, "end_step": null, "metric_path": null, "arg_mappings": null, "metric": "throughput", "model_info": null, "results_dir": "autotuning_results", "exps_dir": "autotuning_exps", "overwrite": true, "fast": true, "start_profile_step": 3, "end_profile_step": 5, "tuner_type": "gridsearch", "tuner_early_stopping": 5, "tuner_num_trials": 50, "model_info_path": null, "mp_size": 1, "max_train_batch_size": null, "min_train_batch_size": 1, "max_train_micro_batch_size_per_gpu": 1.024000e+03, "min_train_micro_batch_size_per_gpu": 1, "num_tuning_micro_batch_sizes": 3 } [2022-12-19 11:53:43,619] [INFO] [config.py:1012:print] bfloat16_enabled ............. False [2022-12-19 11:53:43,619] [INFO] [config.py:1012:print] checkpoint_parallel_write_pipeline False [2022-12-19 11:53:43,619] [INFO] [config.py:1012:print] checkpoint_tag_validation_enabled True [2022-12-19 11:53:43,619] [INFO] [config.py:1012:print] checkpoint_tag_validation_fail False [2022-12-19 11:53:43,619] [INFO] [config.py:1012:print] comms_config ................. [2022-12-19 11:53:43,619] [INFO] [config.py:1012:print] communication_data_type ...... None [2022-12-19 11:53:43,619] [INFO] [config.py:1012:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} [2022-12-19 11:53:43,619] [INFO] [config.py:1012:print] curriculum_enabled_legacy .... False [2022-12-19 11:53:43,619] [INFO] [config.py:1012:print] curriculum_params_legacy ..... False [2022-12-19 11:53:43,619] [INFO] [config.py:1012:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} [2022-12-19 11:53:43,619] [INFO] [config.py:1012:print] data_efficiency_enabled ...... False [2022-12-19 11:53:43,619] [INFO] [config.py:1012:print] dataloader_drop_last ......... False [2022-12-19 11:53:43,619] [INFO] [config.py:1012:print] disable_allgather ............ False [2022-12-19 11:53:43,620] [INFO] [config.py:1012:print] dump_state ................... False [2022-12-19 11:53:43,620] [INFO] [config.py:1012:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 1000, 'delayed_shift': 2, 'min_scale': 1} [2022-12-19 11:53:43,620] [INFO] [config.py:1012:print] eigenvalue_enabled ........... False [2022-12-19 11:53:43,620] [INFO] [config.py:1012:print] eigenvalue_gas_boundary_resolution 1 [2022-12-19 11:53:43,620] [INFO] [config.py:1012:print] eigenvalue_layer_name ........ bert.encoder.layer [2022-12-19 11:53:43,620] [INFO] [config.py:1012:print] eigenvalue_layer_num ......... 0 [2022-12-19 11:53:43,620] [INFO] [config.py:1012:print] eigenvalue_max_iter .......... 100 [2022-12-19 11:53:43,620] [INFO] [config.py:1012:print] eigenvalue_stability ......... 1e-06 [2022-12-19 11:53:43,620] [INFO] [config.py:1012:print] eigenvalue_tol ............... 0.01 [2022-12-19 11:53:43,620] [INFO] [config.py:1012:print] eigenvalue_verbose ........... False [2022-12-19 11:53:43,620] [INFO] [config.py:1012:print] elasticity_enabled ........... False [2022-12-19 11:53:43,620] [INFO] [config.py:1012:print] flops_profiler_config ........ { "enabled": false, "profile_step": 1, "module_depth": -1, "top_modules": 1, "detailed": true, "output_file": null } [2022-12-19 11:53:43,620] [INFO] [config.py:1012:print] fp16_auto_cast ............... False [2022-12-19 11:53:43,620] [INFO] [config.py:1012:print] fp16_enabled ................. True [2022-12-19 11:53:43,620] [INFO] [config.py:1012:print] fp16_master_weights_and_gradients False [2022-12-19 11:53:43,620] [INFO] [config.py:1012:print] global_rank .................. 0 [2022-12-19 11:53:43,620] [INFO] [config.py:1012:print] grad_accum_dtype ............. None [2022-12-19 11:53:43,621] [INFO] [config.py:1012:print] gradient_accumulation_steps .. 1 [2022-12-19 11:53:43,621] [INFO] [config.py:1012:print] gradient_clipping ............ 1.0 [2022-12-19 11:53:43,621] [INFO] [config.py:1012:print] gradient_predivide_factor .... 1.0 [2022-12-19 11:53:43,621] [INFO] [config.py:1012:print] initial_dynamic_scale ........ 65536 [2022-12-19 11:53:43,621] [INFO] [config.py:1012:print] load_universal_checkpoint .... False [2022-12-19 11:53:43,621] [INFO] [config.py:1012:print] loss_scale ................... 0 [2022-12-19 11:53:43,621] [INFO] [config.py:1012:print] memory_breakdown ............. False [2022-12-19 11:53:43,621] [INFO] [config.py:1012:print] monitor_config ............... [2022-12-19 11:53:43,621] [INFO] [config.py:1012:print] nebula_config ................ { "enabled": false, "persistent_storage_path": null, "persistent_time_interval": 100, "num_of_version_in_retention": 2, "enable_nebula_load": true, "load_path": null } [2022-12-19 11:53:43,621] [INFO] [config.py:1012:print] optimizer_legacy_fusion ...... False [2022-12-19 11:53:43,621] [INFO] [config.py:1012:print] optimizer_name ............... adamw [2022-12-19 11:53:43,621] [INFO] [config.py:1012:print] optimizer_params ............. {'lr': 1e-05, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.0} [2022-12-19 11:53:43,621] [INFO] [config.py:1012:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} [2022-12-19 11:53:43,621] [INFO] [config.py:1012:print] pld_enabled .................. False [2022-12-19 11:53:43,621] [INFO] [config.py:1012:print] pld_params ................... False [2022-12-19 11:53:43,621] [INFO] [config.py:1012:print] prescale_gradients ........... False [2022-12-19 11:53:43,622] [INFO] [config.py:1012:print] scheduler_name ............... WarmupDecayLR [2022-12-19 11:53:43,622] [INFO] [config.py:1012:print] scheduler_params ............. {'last_batch_iteration': -1, 'total_num_steps': 5000, 'warmup_min_lr': 0, 'warmup_max_lr': 1e-05, 'warmup_num_steps': 500} [2022-12-19 11:53:43,622] [INFO] [config.py:1012:print] sparse_attention ............. None [2022-12-19 11:53:43,622] [INFO] [config.py:1012:print] sparse_gradients_enabled ..... False [2022-12-19 11:53:43,622] [INFO] [config.py:1012:print] steps_per_print .............. 10 [2022-12-19 11:53:43,622] [INFO] [config.py:1012:print] train_batch_size ............. 64 [2022-12-19 11:53:43,622] [INFO] [config.py:1012:print] train_micro_batch_size_per_gpu 64 [2022-12-19 11:53:43,622] [INFO] [config.py:1012:print] use_node_local_storage ....... False [2022-12-19 11:53:43,622] [INFO] [config.py:1012:print] wall_clock_breakdown ......... False [2022-12-19 11:53:43,622] [INFO] [config.py:1012:print] world_size ................... 1 [2022-12-19 11:53:43,622] [INFO] [config.py:1012:print] zero_allow_untested_optimizer False [2022-12-19 11:53:43,622] [INFO] [config.py:1012:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=200000000 allgather_partitions=True allgather_bucket_size=200000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='cpu', nvme_path=None, buffer_count=4, pin_memory=True, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False [2022-12-19 11:53:43,622] [INFO] [config.py:1012:print] zero_enabled ................. True [2022-12-19 11:53:43,622] [INFO] [config.py:1012:print] zero_optimization_stage ...... 2 [2022-12-19 11:53:43,623] [INFO] [config.py:997:print_user_config] json = { "fp16": { "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 16, "hysteresis": 2, "min_loss_scale": 1 }, "optimizer": { "type": "AdamW", "params": { "lr": 1e-05, "betas": [0.9, 0.999], "eps": 1e-08, "weight_decay": 0.0 } }, "scheduler": { "type": "WarmupDecayLR", "params": { "last_batch_iteration": -1, "total_num_steps": 5.000000e+03, "warmup_min_lr": 0, "warmup_max_lr": 1e-05, "warmup_num_steps": 500 } }, "zero_optimization": { "stage": 2, "offload_optimizer": { "device": "cpu", "pin_memory": true }, "allgather_partitions": true, "allgather_bucket_size": 2.000000e+08, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 2.000000e+08, "contiguous_gradients": true }, "gradient_accumulation_steps": 1, "gradient_clipping": 1.0, "train_batch_size": 64, "train_micro_batch_size_per_gpu": 64 } [2022-12-19 11:53:57,008] [INFO] [stage_1_and_2.py:1767:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 65536 [2022-12-19 11:54:08,740] [INFO] [stage_1_and_2.py:1767:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768.0 [2022-12-19 11:54:20,566] [INFO] [stage_1_and_2.py:1767:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768.0, reducing to 16384.0 [2022-12-19 11:54:32,523] [INFO] [stage_1_and_2.py:1767:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384.0, reducing to 8192.0 [2022-12-19 11:55:49,396] [INFO] [logging.py:68:log_dist] [Rank 0] step=10, skipped=4, lr=[2.883141528559073e-06], mom=[[0.9, 0.999]] [2022-12-19 11:55:49,397] [INFO] [timer.py:196:stop] epoch=0/micro_step=10/global_step=10, RunningAvgSamplesPerSec=6.284724477029608, CurrSamplesPerSec=6.148180060714191, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 11:58:08,169] [INFO] [logging.py:68:log_dist] [Rank 0] step=20, skipped=4, lr=[4.461405575910259e-06], mom=[[0.9, 0.999]] [2022-12-19 11:58:08,171] [INFO] [timer.py:196:stop] epoch=0/micro_step=20/global_step=20, RunningAvgSamplesPerSec=5.884120194534651, CurrSamplesPerSec=5.054178494750393, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.8611, 'learning_rate': 4.898977360288234e-06, 'epoch': 0.36} [2022-12-19 12:00:49,857] [INFO] [logging.py:68:log_dist] [Rank 0] step=30, skipped=4, lr=[5.242641991936178e-06], mom=[[0.9, 0.999]] [2022-12-19 12:00:49,859] [INFO] [timer.py:196:stop] epoch=0/micro_step=30/global_step=30, RunningAvgSamplesPerSec=5.357395051686146, CurrSamplesPerSec=4.271363536074205, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 12:03:36,350] [INFO] [logging.py:68:log_dist] [Rank 0] step=40, skipped=4, lr=[5.766283057118146e-06], mom=[[0.9, 0.999]] [2022-12-19 12:03:36,352] [INFO] [timer.py:196:stop] epoch=0/micro_step=40/global_step=40, RunningAvgSamplesPerSec=5.089969895421541, CurrSamplesPerSec=4.327689824299493, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 12:06:23,404] [INFO] [logging.py:68:log_dist] [Rank 0] step=50, skipped=4, lr=[6.160712527409633e-06], mom=[[0.9, 0.999]] [2022-12-19 12:06:23,405] [INFO] [timer.py:196:stop] epoch=0/micro_step=50/global_step=50, RunningAvgSamplesPerSec=4.94469846197508, CurrSamplesPerSec=4.742895867520358, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.3224, 'learning_rate': 6.160712527409633e-06, 'epoch': 0.71} [2022-12-19 12:08:49,940] [INFO] [logging.py:68:log_dist] [Rank 0] step=60, skipped=4, lr=[6.4772414076394205e-06], mom=[[0.9, 0.999]] [2022-12-19 12:08:49,942] [INFO] [timer.py:196:stop] epoch=0/micro_step=60/global_step=60, RunningAvgSamplesPerSec=4.9854128285166475, CurrSamplesPerSec=5.299728280676971, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 12:11:25,236] [INFO] [logging.py:68:log_dist] [Rank 0] step=70, skipped=4, lr=[6.741623406776245e-06], mom=[[0.9, 0.999]] [2022-12-19 12:11:25,238] [INFO] [timer.py:196:stop] epoch=0/micro_step=70/global_step=70, RunningAvgSamplesPerSec=4.962771426482941, CurrSamplesPerSec=4.826170613844314, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.2375, 'learning_rate': 6.85912902234906e-06, 'epoch': 1.07} [2022-12-19 12:14:02,605] [INFO] [logging.py:68:log_dist] [Rank 0] step=80, skipped=4, lr=[6.968634661590082e-06], mom=[[0.9, 0.999]] [2022-12-19 12:14:02,606] [INFO] [timer.py:196:stop] epoch=0/micro_step=80/global_step=80, RunningAvgSamplesPerSec=4.937271773600281, CurrSamplesPerSec=4.78181846456958, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 12:16:33,552] [INFO] [logging.py:68:log_dist] [Rank 0] step=90, skipped=4, lr=[7.1675433522258775e-06], mom=[[0.9, 0.999]] [2022-12-19 12:16:33,554] [INFO] [timer.py:196:stop] epoch=0/micro_step=90/global_step=90, RunningAvgSamplesPerSec=4.945269383765002, CurrSamplesPerSec=5.042845013770653, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 12:19:09,093] [INFO] [logging.py:68:log_dist] [Rank 0] step=100, skipped=4, lr=[7.344547104469332e-06], mom=[[0.9, 0.999]] [2022-12-19 12:19:09,095] [INFO] [timer.py:196:stop] epoch=0/micro_step=100/global_step=100, RunningAvgSamplesPerSec=4.933974241037233, CurrSamplesPerSec=4.701477254589951, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.1304, 'learning_rate': 7.344547104469332e-06, 'epoch': 1.43} [2022-12-19 12:21:47,390] [INFO] [logging.py:68:log_dist] [Rank 0] step=110, skipped=4, lr=[7.503995457567235e-06], mom=[[0.9, 0.999]] [2022-12-19 12:21:47,392] [INFO] [timer.py:196:stop] epoch=0/micro_step=110/global_step=110, RunningAvgSamplesPerSec=4.91591709126971, CurrSamplesPerSec=4.688850220105803, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 12:24:23,449] [INFO] [logging.py:68:log_dist] [Rank 0] step=120, skipped=4, lr=[7.649058662787184e-06], mom=[[0.9, 0.999]] [2022-12-19 12:24:23,450] [INFO] [timer.py:196:stop] epoch=0/micro_step=120/global_step=120, RunningAvgSamplesPerSec=4.9070530462655775, CurrSamplesPerSec=5.1081928187039445, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.1404, 'learning_rate': 7.716963756434345e-06, 'epoch': 1.79} [2022-12-19 12:27:01,797] [INFO] [logging.py:68:log_dist] [Rank 0] step=130, skipped=4, lr=[7.782118888847307e-06], mom=[[0.9, 0.999]] [2022-12-19 12:27:01,799] [INFO] [timer.py:196:stop] epoch=0/micro_step=130/global_step=130, RunningAvgSamplesPerSec=4.8920194677244355, CurrSamplesPerSec=4.416138360200384, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 12:29:44,263] [INFO] [logging.py:68:log_dist] [Rank 0] step=140, skipped=4, lr=[7.905011559752758e-06], mom=[[0.9, 0.999]] [2022-12-19 12:29:44,264] [INFO] [timer.py:196:stop] epoch=0/micro_step=140/global_step=140, RunningAvgSamplesPerSec=4.868397273066603, CurrSamplesPerSec=4.920148346706677, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 12:32:21,595] [INFO] [logging.py:68:log_dist] [Rank 0] step=150, skipped=4, lr=[8.019180844200955e-06], mom=[[0.9, 0.999]] [2022-12-19 12:32:21,597] [INFO] [timer.py:196:stop] epoch=0/micro_step=150/global_step=150, RunningAvgSamplesPerSec=4.861667948366876, CurrSamplesPerSec=4.915494797679944, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.1005, 'learning_rate': 8.019180844200955e-06, 'epoch': 2.14} [2022-12-19 12:35:01,285] [INFO] [logging.py:68:log_dist] [Rank 0] step=160, skipped=4, lr=[8.125783520495252e-06], mom=[[0.9, 0.999]] [2022-12-19 12:35:01,287] [INFO] [timer.py:196:stop] epoch=0/micro_step=160/global_step=160, RunningAvgSamplesPerSec=4.850342817067083, CurrSamplesPerSec=4.7749939204247775, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 12:37:40,486] [INFO] [logging.py:68:log_dist] [Rank 0] step=170, skipped=4, lr=[8.225760510392298e-06], mom=[[0.9, 0.999]] [2022-12-19 12:37:40,488] [INFO] [timer.py:196:stop] epoch=0/micro_step=170/global_step=170, RunningAvgSamplesPerSec=4.84217822772632, CurrSamplesPerSec=4.639845609652397, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0496, 'learning_rate': 8.27351214279797e-06, 'epoch': 2.5} [2022-12-19 12:40:13,823] [INFO] [logging.py:68:log_dist] [Rank 0] step=180, skipped=4, lr=[8.31988745412743e-06], mom=[[0.9, 0.999]] [2022-12-19 12:40:13,825] [INFO] [timer.py:196:stop] epoch=0/micro_step=180/global_step=180, RunningAvgSamplesPerSec=4.846620755543373, CurrSamplesPerSec=4.94334514558199, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 12:42:51,582] [INFO] [logging.py:68:log_dist] [Rank 0] step=190, skipped=4, lr=[8.408811289387583e-06], mom=[[0.9, 0.999]] [2022-12-19 12:42:51,583] [INFO] [timer.py:196:stop] epoch=0/micro_step=190/global_step=190, RunningAvgSamplesPerSec=4.842148589841127, CurrSamplesPerSec=4.636333563213749, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 12:45:28,742] [INFO] [logging.py:68:log_dist] [Rank 0] step=200, skipped=4, lr=[8.49307723936858e-06], mom=[[0.9, 0.999]] [2022-12-19 12:45:28,744] [INFO] [timer.py:196:stop] epoch=0/micro_step=200/global_step=200, RunningAvgSamplesPerSec=4.838832903723075, CurrSamplesPerSec=4.873535141481062, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0468, 'learning_rate': 8.49307723936858e-06, 'epoch': 2.86} [2022-12-19 12:48:11,320] [INFO] [logging.py:68:log_dist] [Rank 0] step=210, skipped=4, lr=[8.573149077803088e-06], mom=[[0.9, 0.999]] [2022-12-19 12:48:11,322] [INFO] [timer.py:196:stop] epoch=0/micro_step=210/global_step=210, RunningAvgSamplesPerSec=4.825149349910002, CurrSamplesPerSec=4.864172355960737, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 12:50:45,911] [INFO] [logging.py:68:log_dist] [Rank 0] step=220, skipped=4, lr=[8.64942458567722e-06], mom=[[0.9, 0.999]] [2022-12-19 12:50:45,912] [INFO] [timer.py:196:stop] epoch=0/micro_step=220/global_step=220, RunningAvgSamplesPerSec=4.82800487670514, CurrSamplesPerSec=5.104461169974322, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0305, 'learning_rate': 8.686247975778677e-06, 'epoch': 3.21} [2022-12-19 12:53:15,841] [INFO] [logging.py:68:log_dist] [Rank 0] step=230, skipped=4, lr=[8.722247506883805e-06], mom=[[0.9, 0.999]] [2022-12-19 12:53:15,842] [INFO] [timer.py:196:stop] epoch=0/micro_step=230/global_step=230, RunningAvgSamplesPerSec=4.8362180032202176, CurrSamplesPerSec=4.783041219701146, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 12:55:55,413] [INFO] [logging.py:68:log_dist] [Rank 0] step=240, skipped=4, lr=[8.79191691333329e-06], mom=[[0.9, 0.999]] [2022-12-19 12:55:55,415] [INFO] [timer.py:196:stop] epoch=0/micro_step=240/global_step=240, RunningAvgSamplesPerSec=4.830415934243562, CurrSamplesPerSec=4.67587043487864, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 12:58:29,579] [INFO] [logging.py:68:log_dist] [Rank 0] step=250, skipped=4, lr=[8.858694625217149e-06], mom=[[0.9, 0.999]] [2022-12-19 12:58:29,581] [INFO] [timer.py:196:stop] epoch=0/micro_step=250/global_step=250, RunningAvgSamplesPerSec=4.833466213517386, CurrSamplesPerSec=5.486877476242189, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0245, 'learning_rate': 8.858694625217149e-06, 'epoch': 3.57} [2022-12-19 13:00:57,038] [INFO] [logging.py:68:log_dist] [Rank 0] step=260, skipped=4, lr=[8.922811151820517e-06], mom=[[0.9, 0.999]] [2022-12-19 13:00:57,039] [INFO] [timer.py:196:stop] epoch=0/micro_step=260/global_step=260, RunningAvgSamplesPerSec=4.845110965147443, CurrSamplesPerSec=4.9921115157357105, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 13:03:33,670] [INFO] [logging.py:68:log_dist] [Rank 0] step=270, skipped=4, lr=[8.984470493319244e-06], mom=[[0.9, 0.999]] [2022-12-19 13:03:33,672] [INFO] [timer.py:196:stop] epoch=0/micro_step=270/global_step=270, RunningAvgSamplesPerSec=4.8429444681448635, CurrSamplesPerSec=4.802661310533987, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0204, 'learning_rate': 9.014436199608479e-06, 'epoch': 3.93} [2022-12-19 13:06:10,407] [INFO] [logging.py:68:log_dist] [Rank 0] step=280, skipped=4, lr=[9.043854055968706e-06], mom=[[0.9, 0.999]] [2022-12-19 13:06:10,409] [INFO] [timer.py:196:stop] epoch=0/micro_step=280/global_step=280, RunningAvgSamplesPerSec=4.841414423642298, CurrSamplesPerSec=4.812496898464995, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 13:08:35,681] [INFO] [logging.py:68:log_dist] [Rank 0] step=290, skipped=4, lr=[9.10112387015335e-06], mom=[[0.9, 0.999]] [2022-12-19 13:08:35,682] [INFO] [timer.py:196:stop] epoch=0/micro_step=290/global_step=290, RunningAvgSamplesPerSec=4.855070183701984, CurrSamplesPerSec=5.301100474873604, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 13:11:04,702] [INFO] [logging.py:68:log_dist] [Rank 0] step=300, skipped=4, lr=[9.156425255148058e-06], mom=[[0.9, 0.999]] [2022-12-19 13:11:04,704] [INFO] [timer.py:196:stop] epoch=0/micro_step=300/global_step=300, RunningAvgSamplesPerSec=4.862483705117125, CurrSamplesPerSec=4.96419714698448, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0161, 'learning_rate': 9.156425255148058e-06, 'epoch': 4.29} [2022-12-19 13:13:41,785] [INFO] [logging.py:68:log_dist] [Rank 0] step=310, skipped=4, lr=[9.209889040960644e-06], mom=[[0.9, 0.999]] [2022-12-19 13:13:41,786] [INFO] [timer.py:196:stop] epoch=0/micro_step=310/global_step=310, RunningAvgSamplesPerSec=4.859764335974417, CurrSamplesPerSec=4.878675089060409, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 13:16:17,620] [INFO] [logging.py:68:log_dist] [Rank 0] step=320, skipped=4, lr=[9.261633432763397e-06], mom=[[0.9, 0.999]] [2022-12-19 13:16:17,621] [INFO] [timer.py:196:stop] epoch=0/micro_step=320/global_step=320, RunningAvgSamplesPerSec=4.858997002509444, CurrSamplesPerSec=4.80213043314379, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0129, 'learning_rate': 9.28689473531776e-06, 'epoch': 4.64} [2022-12-19 13:18:54,230] [INFO] [logging.py:68:log_dist] [Rank 0] step=330, skipped=4, lr=[9.311765584761373e-06], mom=[[0.9, 0.999]] [2022-12-19 13:18:54,231] [INFO] [timer.py:196:stop] epoch=0/micro_step=330/global_step=330, RunningAvgSamplesPerSec=4.857019464893833, CurrSamplesPerSec=4.742613643782026, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 13:21:32,500] [INFO] [logging.py:68:log_dist] [Rank 0] step=340, skipped=4, lr=[9.360382936198493e-06], mom=[[0.9, 0.999]] [2022-12-19 13:21:32,502] [INFO] [timer.py:196:stop] epoch=0/micro_step=340/global_step=340, RunningAvgSamplesPerSec=4.853928759897602, CurrSamplesPerSec=4.8359388617015355, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 13:24:08,047] [INFO] [logging.py:68:log_dist] [Rank 0] step=350, skipped=4, lr=[9.407574351377137e-06], mom=[[0.9, 0.999]] [2022-12-19 13:24:08,049] [INFO] [timer.py:196:stop] epoch=0/micro_step=350/global_step=350, RunningAvgSamplesPerSec=4.853820410388651, CurrSamplesPerSec=4.898408063422721, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.013, 'learning_rate': 9.407574351377137e-06, 'epoch': 5.0} [2022-12-19 13:26:45,328] [INFO] [logging.py:68:log_dist] [Rank 0] step=360, skipped=4, lr=[9.45342109721062e-06], mom=[[0.9, 0.999]] [2022-12-19 13:26:45,329] [INFO] [timer.py:196:stop] epoch=0/micro_step=360/global_step=360, RunningAvgSamplesPerSec=4.85178775805814, CurrSamplesPerSec=4.948161599842712, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 13:29:20,026] [INFO] [logging.py:68:log_dist] [Rank 0] step=370, skipped=4, lr=[9.497997685324628e-06], mom=[[0.9, 0.999]] [2022-12-19 13:29:20,027] [INFO] [timer.py:196:stop] epoch=0/micro_step=370/global_step=370, RunningAvgSamplesPerSec=4.852030422658968, CurrSamplesPerSec=4.587538166471939, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0084, 'learning_rate': 9.519831289296397e-06, 'epoch': 5.36} [2022-12-19 13:31:55,844] [INFO] [logging.py:68:log_dist] [Rank 0] step=380, skipped=4, lr=[9.541372600623587e-06], mom=[[0.9, 0.999]] [2022-12-19 13:31:55,845] [INFO] [timer.py:196:stop] epoch=0/micro_step=380/global_step=380, RunningAvgSamplesPerSec=4.851833732350051, CurrSamplesPerSec=4.7891279033152125, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 13:34:33,105] [INFO] [logging.py:68:log_dist] [Rank 0] step=390, skipped=4, lr=[9.583608934209288e-06], mom=[[0.9, 0.999]] [2022-12-19 13:34:33,106] [INFO] [timer.py:196:stop] epoch=0/micro_step=390/global_step=390, RunningAvgSamplesPerSec=4.850083370123769, CurrSamplesPerSec=4.644631745875583, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 13:37:12,002] [INFO] [logging.py:68:log_dist] [Rank 0] step=400, skipped=4, lr=[9.624764935335318e-06], mom=[[0.9, 0.999]] [2022-12-19 13:37:12,004] [INFO] [timer.py:196:stop] epoch=0/micro_step=400/global_step=400, RunningAvgSamplesPerSec=4.846477731334026, CurrSamplesPerSec=4.710384798486559, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0063, 'learning_rate': 9.624764935335318e-06, 'epoch': 5.71} [2022-12-19 13:39:47,883] [INFO] [logging.py:68:log_dist] [Rank 0] step=410, skipped=4, lr=[9.664894494516345e-06], mom=[[0.9, 0.999]] [2022-12-19 13:39:47,885] [INFO] [timer.py:196:stop] epoch=0/micro_step=410/global_step=410, RunningAvgSamplesPerSec=4.846520980004455, CurrSamplesPerSec=4.8562435318187935, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 13:42:18,602] [INFO] [logging.py:68:log_dist] [Rank 0] step=420, skipped=4, lr=[9.704047567846437e-06], mom=[[0.9, 0.999]] [2022-12-19 13:42:18,603] [INFO] [timer.py:196:stop] epoch=0/micro_step=420/global_step=420, RunningAvgSamplesPerSec=4.850877865185188, CurrSamplesPerSec=5.319254205108675, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0091, 'learning_rate': 9.723272550712454e-06, 'epoch': 6.07} [2022-12-19 13:44:50,834] [INFO] [logging.py:68:log_dist] [Rank 0] step=430, skipped=4, lr=[9.742270550908135e-06], mom=[[0.9, 0.999]] [2022-12-19 13:44:50,835] [INFO] [timer.py:196:stop] epoch=0/micro_step=430/global_step=430, RunningAvgSamplesPerSec=4.8535268333430155, CurrSamplesPerSec=4.883467831694119, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 13:47:26,590] [INFO] [logging.py:68:log_dist] [Rank 0] step=440, skipped=4, lr=[9.779606609292176e-06], mom=[[0.9, 0.999]] [2022-12-19 13:47:26,592] [INFO] [timer.py:196:stop] epoch=0/micro_step=440/global_step=440, RunningAvgSamplesPerSec=4.853048414412156, CurrSamplesPerSec=4.76805433357737, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 13:50:03,822] [INFO] [logging.py:68:log_dist] [Rank 0] step=450, skipped=4, lr=[9.816095971633122e-06], mom=[[0.9, 0.999]] [2022-12-19 13:50:03,823] [INFO] [timer.py:196:stop] epoch=0/micro_step=450/global_step=450, RunningAvgSamplesPerSec=4.851727248421968, CurrSamplesPerSec=4.862512962573305, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.008, 'learning_rate': 9.816095971633122e-06, 'epoch': 6.43} [2022-12-19 13:52:37,209] [INFO] [logging.py:68:log_dist] [Rank 0] step=460, skipped=4, lr=[9.851776190149156e-06], mom=[[0.9, 0.999]] [2022-12-19 13:52:37,211] [INFO] [timer.py:196:stop] epoch=0/micro_step=460/global_step=460, RunningAvgSamplesPerSec=4.854686255524522, CurrSamplesPerSec=5.006905800888788, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 13:54:58,883] [INFO] [logging.py:68:log_dist] [Rank 0] step=470, skipped=4, lr=[9.886682372916766e-06], mom=[[0.9, 0.999]] [2022-12-19 13:54:58,884] [INFO] [timer.py:196:stop] epoch=0/micro_step=470/global_step=470, RunningAvgSamplesPerSec=4.866808478943239, CurrSamplesPerSec=5.55124573746386, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0082, 'learning_rate': 9.90385555539545e-06, 'epoch': 6.79} [2022-12-19 13:57:24,670] [INFO] [logging.py:68:log_dist] [Rank 0] step=480, skipped=4, lr=[9.92084739148192e-06], mom=[[0.9, 0.999]] [2022-12-19 13:57:24,671] [INFO] [timer.py:196:stop] epoch=0/micro_step=480/global_step=480, RunningAvgSamplesPerSec=4.8735694202012425, CurrSamplesPerSec=5.105844323967845, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 13:59:51,505] [INFO] [logging.py:68:log_dist] [Rank 0] step=490, skipped=4, lr=[9.954302066885107e-06], mom=[[0.9, 0.999]] [2022-12-19 13:59:51,507] [INFO] [timer.py:196:stop] epoch=0/micro_step=490/global_step=490, RunningAvgSamplesPerSec=4.879328693961105, CurrSamplesPerSec=5.361670640483412, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 14:02:17,782] [INFO] [logging.py:68:log_dist] [Rank 0] step=500, skipped=4, lr=[9.987075336738768e-06], mom=[[0.9, 0.999]] [2022-12-19 14:02:17,783] [INFO] [timer.py:196:stop] epoch=0/micro_step=500/global_step=500, RunningAvgSamplesPerSec=4.887102593599529, CurrSamplesPerSec=5.400279695125675, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0085, 'learning_rate': 9.987075336738768e-06, 'epoch': 7.14} [2022-12-19 14:04:38,368] [INFO] [logging.py:68:log_dist] [Rank 0] step=510, skipped=4, lr=[9.98888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 14:04:38,369] [INFO] [timer.py:196:stop] epoch=0/micro_step=510/global_step=510, RunningAvgSamplesPerSec=4.898997366676477, CurrSamplesPerSec=5.396514227837796, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 14:07:03,835] [INFO] [logging.py:68:log_dist] [Rank 0] step=520, skipped=4, lr=[9.966666666666667e-06], mom=[[0.9, 0.999]] [2022-12-19 14:07:03,837] [INFO] [timer.py:196:stop] epoch=0/micro_step=520/global_step=520, RunningAvgSamplesPerSec=4.905130946159965, CurrSamplesPerSec=4.988483885181161, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0063, 'learning_rate': 9.955555555555556e-06, 'epoch': 7.5} [2022-12-19 14:09:35,891] [INFO] [logging.py:68:log_dist] [Rank 0] step=530, skipped=4, lr=[9.944444444444445e-06], mom=[[0.9, 0.999]] [2022-12-19 14:09:35,893] [INFO] [timer.py:196:stop] epoch=0/micro_step=530/global_step=530, RunningAvgSamplesPerSec=4.906098705272462, CurrSamplesPerSec=4.758214160450357, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 14:12:11,852] [INFO] [logging.py:68:log_dist] [Rank 0] step=540, skipped=4, lr=[9.922222222222222e-06], mom=[[0.9, 0.999]] [2022-12-19 14:12:11,853] [INFO] [timer.py:196:stop] epoch=0/micro_step=540/global_step=540, RunningAvgSamplesPerSec=4.904466433323427, CurrSamplesPerSec=4.969838803352754, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 14:14:47,792] [INFO] [logging.py:68:log_dist] [Rank 0] step=550, skipped=4, lr=[9.9e-06], mom=[[0.9, 0.999]] [2022-12-19 14:14:47,794] [INFO] [timer.py:196:stop] epoch=0/micro_step=550/global_step=550, RunningAvgSamplesPerSec=4.902692656494618, CurrSamplesPerSec=4.781843337754648, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0084, 'learning_rate': 9.9e-06, 'epoch': 7.86} [2022-12-19 14:17:23,420] [INFO] [logging.py:68:log_dist] [Rank 0] step=560, skipped=4, lr=[9.877777777777778e-06], mom=[[0.9, 0.999]] [2022-12-19 14:17:23,421] [INFO] [timer.py:196:stop] epoch=0/micro_step=560/global_step=560, RunningAvgSamplesPerSec=4.901205410548811, CurrSamplesPerSec=5.010666437943654, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 14:20:01,753] [INFO] [logging.py:68:log_dist] [Rank 0] step=570, skipped=4, lr=[9.855555555555555e-06], mom=[[0.9, 0.999]] [2022-12-19 14:20:01,754] [INFO] [timer.py:196:stop] epoch=0/micro_step=570/global_step=570, RunningAvgSamplesPerSec=4.898100972927672, CurrSamplesPerSec=4.759615844767654, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0049, 'learning_rate': 9.844444444444446e-06, 'epoch': 8.21} [2022-12-19 14:22:35,621] [INFO] [logging.py:68:log_dist] [Rank 0] step=580, skipped=4, lr=[9.833333333333333e-06], mom=[[0.9, 0.999]] [2022-12-19 14:22:35,622] [INFO] [timer.py:196:stop] epoch=0/micro_step=580/global_step=580, RunningAvgSamplesPerSec=4.898424414949789, CurrSamplesPerSec=5.1699776603263325, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 14:25:02,778] [INFO] [logging.py:68:log_dist] [Rank 0] step=590, skipped=4, lr=[9.811111111111112e-06], mom=[[0.9, 0.999]] [2022-12-19 14:25:02,779] [INFO] [timer.py:196:stop] epoch=0/micro_step=590/global_step=590, RunningAvgSamplesPerSec=4.902626294312846, CurrSamplesPerSec=5.092728847325434, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 14:27:29,246] [INFO] [logging.py:68:log_dist] [Rank 0] step=600, skipped=4, lr=[9.78888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 14:27:29,248] [INFO] [timer.py:196:stop] epoch=0/micro_step=600/global_step=600, RunningAvgSamplesPerSec=4.907046849888979, CurrSamplesPerSec=5.223062659736944, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0059, 'learning_rate': 9.78888888888889e-06, 'epoch': 8.57} [2022-12-19 14:29:57,726] [INFO] [logging.py:68:log_dist] [Rank 0] step=610, skipped=4, lr=[9.766666666666667e-06], mom=[[0.9, 0.999]] [2022-12-19 14:29:57,727] [INFO] [timer.py:196:stop] epoch=0/micro_step=610/global_step=610, RunningAvgSamplesPerSec=4.910471417559111, CurrSamplesPerSec=5.122817956349585, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 14:32:21,053] [INFO] [logging.py:68:log_dist] [Rank 0] step=620, skipped=4, lr=[9.744444444444445e-06], mom=[[0.9, 0.999]] [2022-12-19 14:32:21,054] [INFO] [timer.py:196:stop] epoch=0/micro_step=620/global_step=620, RunningAvgSamplesPerSec=4.917031539120992, CurrSamplesPerSec=5.2714502306701085, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0066, 'learning_rate': 9.733333333333334e-06, 'epoch': 8.93} [2022-12-19 14:34:53,075] [INFO] [logging.py:68:log_dist] [Rank 0] step=630, skipped=4, lr=[9.722222222222223e-06], mom=[[0.9, 0.999]] [2022-12-19 14:34:53,077] [INFO] [timer.py:196:stop] epoch=0/micro_step=630/global_step=630, RunningAvgSamplesPerSec=4.917835817961153, CurrSamplesPerSec=4.917546549724145, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 14:37:28,079] [INFO] [logging.py:68:log_dist] [Rank 0] step=640, skipped=4, lr=[9.7e-06], mom=[[0.9, 0.999]] [2022-12-19 14:37:28,080] [INFO] [timer.py:196:stop] epoch=0/micro_step=640/global_step=640, RunningAvgSamplesPerSec=4.917070836153263, CurrSamplesPerSec=4.88024030445332, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 14:39:52,985] [INFO] [logging.py:68:log_dist] [Rank 0] step=650, skipped=4, lr=[9.677777777777778e-06], mom=[[0.9, 0.999]] [2022-12-19 14:39:52,987] [INFO] [timer.py:196:stop] epoch=0/micro_step=650/global_step=650, RunningAvgSamplesPerSec=4.922261688223727, CurrSamplesPerSec=5.31764146920894, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0053, 'learning_rate': 9.677777777777778e-06, 'epoch': 9.29} [2022-12-19 14:42:20,988] [INFO] [logging.py:68:log_dist] [Rank 0] step=660, skipped=4, lr=[9.655555555555556e-06], mom=[[0.9, 0.999]] [2022-12-19 14:42:20,990] [INFO] [timer.py:196:stop] epoch=0/micro_step=660/global_step=660, RunningAvgSamplesPerSec=4.925255429831317, CurrSamplesPerSec=5.157295982637809, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 14:44:48,284] [INFO] [logging.py:68:log_dist] [Rank 0] step=670, skipped=4, lr=[9.633333333333335e-06], mom=[[0.9, 0.999]] [2022-12-19 14:44:48,286] [INFO] [timer.py:196:stop] epoch=0/micro_step=670/global_step=670, RunningAvgSamplesPerSec=4.928434904008725, CurrSamplesPerSec=5.061157079205904, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.004, 'learning_rate': 9.622222222222222e-06, 'epoch': 9.64} [2022-12-19 14:47:15,119] [INFO] [logging.py:68:log_dist] [Rank 0] step=680, skipped=4, lr=[9.611111111111112e-06], mom=[[0.9, 0.999]] [2022-12-19 14:47:15,121] [INFO] [timer.py:196:stop] epoch=0/micro_step=680/global_step=680, RunningAvgSamplesPerSec=4.932332719017831, CurrSamplesPerSec=5.07334611342772, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 14:49:42,987] [INFO] [logging.py:68:log_dist] [Rank 0] step=690, skipped=4, lr=[9.58888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 14:49:42,989] [INFO] [timer.py:196:stop] epoch=0/micro_step=690/global_step=690, RunningAvgSamplesPerSec=4.934952106280914, CurrSamplesPerSec=5.23162515983437, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 14:52:09,781] [INFO] [logging.py:68:log_dist] [Rank 0] step=700, skipped=4, lr=[9.566666666666668e-06], mom=[[0.9, 0.999]] [2022-12-19 14:52:09,782] [INFO] [timer.py:196:stop] epoch=0/micro_step=700/global_step=700, RunningAvgSamplesPerSec=4.9380357034046325, CurrSamplesPerSec=5.24687746476847, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0054, 'learning_rate': 9.566666666666668e-06, 'epoch': 10.0} [2022-12-19 14:54:39,366] [INFO] [logging.py:68:log_dist] [Rank 0] step=710, skipped=4, lr=[9.544444444444445e-06], mom=[[0.9, 0.999]] [2022-12-19 14:54:39,367] [INFO] [timer.py:196:stop] epoch=0/micro_step=710/global_step=710, RunningAvgSamplesPerSec=4.939692191956853, CurrSamplesPerSec=5.010938063895246, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 14:57:11,603] [INFO] [logging.py:68:log_dist] [Rank 0] step=720, skipped=4, lr=[9.522222222222223e-06], mom=[[0.9, 0.999]] [2022-12-19 14:57:11,604] [INFO] [timer.py:196:stop] epoch=0/micro_step=720/global_step=720, RunningAvgSamplesPerSec=4.9399408655745685, CurrSamplesPerSec=5.087045225116837, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0041, 'learning_rate': 9.511111111111112e-06, 'epoch': 10.36} [2022-12-19 14:59:38,451] [INFO] [logging.py:68:log_dist] [Rank 0] step=730, skipped=4, lr=[9.5e-06], mom=[[0.9, 0.999]] [2022-12-19 14:59:38,452] [INFO] [timer.py:196:stop] epoch=0/micro_step=730/global_step=730, RunningAvgSamplesPerSec=4.943429470946694, CurrSamplesPerSec=5.086961162941001, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 15:02:14,198] [INFO] [logging.py:68:log_dist] [Rank 0] step=740, skipped=4, lr=[9.47777777777778e-06], mom=[[0.9, 0.999]] [2022-12-19 15:02:14,200] [INFO] [timer.py:196:stop] epoch=0/micro_step=740/global_step=740, RunningAvgSamplesPerSec=4.941945164153034, CurrSamplesPerSec=4.616645396900982, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 15:04:50,347] [INFO] [logging.py:68:log_dist] [Rank 0] step=750, skipped=4, lr=[9.455555555555557e-06], mom=[[0.9, 0.999]] [2022-12-19 15:04:50,349] [INFO] [timer.py:196:stop] epoch=0/micro_step=750/global_step=750, RunningAvgSamplesPerSec=4.940280554893238, CurrSamplesPerSec=4.9899512607127114, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0042, 'learning_rate': 9.455555555555557e-06, 'epoch': 10.71} [2022-12-19 15:07:23,856] [INFO] [logging.py:68:log_dist] [Rank 0] step=760, skipped=4, lr=[9.433333333333335e-06], mom=[[0.9, 0.999]] [2022-12-19 15:07:23,857] [INFO] [timer.py:196:stop] epoch=0/micro_step=760/global_step=760, RunningAvgSamplesPerSec=4.939938762139419, CurrSamplesPerSec=5.072232465913488, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 15:09:55,140] [INFO] [logging.py:68:log_dist] [Rank 0] step=770, skipped=4, lr=[9.411111111111113e-06], mom=[[0.9, 0.999]] [2022-12-19 15:09:55,141] [INFO] [timer.py:196:stop] epoch=0/micro_step=770/global_step=770, RunningAvgSamplesPerSec=4.940492222311285, CurrSamplesPerSec=4.816172566434398, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0031, 'learning_rate': 9.4e-06, 'epoch': 11.07} [2022-12-19 15:12:30,313] [INFO] [logging.py:68:log_dist] [Rank 0] step=780, skipped=4, lr=[9.38888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 15:12:30,314] [INFO] [timer.py:196:stop] epoch=0/micro_step=780/global_step=780, RunningAvgSamplesPerSec=4.939305584796219, CurrSamplesPerSec=4.9126494042424165, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 15:15:04,911] [INFO] [logging.py:68:log_dist] [Rank 0] step=790, skipped=4, lr=[9.366666666666668e-06], mom=[[0.9, 0.999]] [2022-12-19 15:15:04,913] [INFO] [timer.py:196:stop] epoch=0/micro_step=790/global_step=790, RunningAvgSamplesPerSec=4.938492549839854, CurrSamplesPerSec=5.011928756665134, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 15:17:34,843] [INFO] [logging.py:68:log_dist] [Rank 0] step=800, skipped=4, lr=[9.344444444444446e-06], mom=[[0.9, 0.999]] [2022-12-19 15:17:34,844] [INFO] [timer.py:196:stop] epoch=0/micro_step=800/global_step=800, RunningAvgSamplesPerSec=4.939797377547636, CurrSamplesPerSec=4.823039931741798, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0042, 'learning_rate': 9.344444444444446e-06, 'epoch': 11.43} [2022-12-19 15:20:10,508] [INFO] [logging.py:68:log_dist] [Rank 0] step=810, skipped=4, lr=[9.322222222222223e-06], mom=[[0.9, 0.999]] [2022-12-19 15:20:10,509] [INFO] [timer.py:196:stop] epoch=0/micro_step=810/global_step=810, RunningAvgSamplesPerSec=4.938616706325122, CurrSamplesPerSec=4.615424885903679, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 15:22:46,952] [INFO] [logging.py:68:log_dist] [Rank 0] step=820, skipped=4, lr=[9.3e-06], mom=[[0.9, 0.999]] [2022-12-19 15:22:46,954] [INFO] [timer.py:196:stop] epoch=0/micro_step=820/global_step=820, RunningAvgSamplesPerSec=4.936606889293279, CurrSamplesPerSec=4.723105056028439, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0029, 'learning_rate': 9.28888888888889e-06, 'epoch': 11.79} [2022-12-19 15:25:15,818] [INFO] [logging.py:68:log_dist] [Rank 0] step=830, skipped=4, lr=[9.277777777777778e-06], mom=[[0.9, 0.999]] [2022-12-19 15:25:15,819] [INFO] [timer.py:196:stop] epoch=0/micro_step=830/global_step=830, RunningAvgSamplesPerSec=4.938464256304168, CurrSamplesPerSec=5.032372515852035, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 15:27:47,540] [INFO] [logging.py:68:log_dist] [Rank 0] step=840, skipped=4, lr=[9.255555555555556e-06], mom=[[0.9, 0.999]] [2022-12-19 15:27:47,542] [INFO] [timer.py:196:stop] epoch=0/micro_step=840/global_step=840, RunningAvgSamplesPerSec=4.938820182366459, CurrSamplesPerSec=5.013181412913762, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 15:30:22,235] [INFO] [logging.py:68:log_dist] [Rank 0] step=850, skipped=4, lr=[9.233333333333334e-06], mom=[[0.9, 0.999]] [2022-12-19 15:30:22,237] [INFO] [timer.py:196:stop] epoch=0/micro_step=850/global_step=850, RunningAvgSamplesPerSec=4.937928612579426, CurrSamplesPerSec=4.998566565455339, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0029, 'learning_rate': 9.233333333333334e-06, 'epoch': 12.14} [2022-12-19 15:32:58,690] [INFO] [logging.py:68:log_dist] [Rank 0] step=860, skipped=4, lr=[9.211111111111111e-06], mom=[[0.9, 0.999]] [2022-12-19 15:32:58,691] [INFO] [timer.py:196:stop] epoch=0/micro_step=860/global_step=860, RunningAvgSamplesPerSec=4.936165433304045, CurrSamplesPerSec=4.865863664726239, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 15:35:35,572] [INFO] [logging.py:68:log_dist] [Rank 0] step=870, skipped=4, lr=[9.188888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 15:35:35,573] [INFO] [timer.py:196:stop] epoch=0/micro_step=870/global_step=870, RunningAvgSamplesPerSec=4.93439287989596, CurrSamplesPerSec=4.796413956971808, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0021, 'learning_rate': 9.17777777777778e-06, 'epoch': 12.5} [2022-12-19 15:38:05,552] [INFO] [logging.py:68:log_dist] [Rank 0] step=880, skipped=4, lr=[9.166666666666666e-06], mom=[[0.9, 0.999]] [2022-12-19 15:38:05,553] [INFO] [timer.py:196:stop] epoch=0/micro_step=880/global_step=880, RunningAvgSamplesPerSec=4.935435621213895, CurrSamplesPerSec=5.141677427678901, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 15:40:40,472] [INFO] [logging.py:68:log_dist] [Rank 0] step=890, skipped=4, lr=[9.144444444444444e-06], mom=[[0.9, 0.999]] [2022-12-19 15:40:40,474] [INFO] [timer.py:196:stop] epoch=0/micro_step=890/global_step=890, RunningAvgSamplesPerSec=4.9344182185583465, CurrSamplesPerSec=4.7700867918514005, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 15:43:14,392] [INFO] [logging.py:68:log_dist] [Rank 0] step=900, skipped=4, lr=[9.122222222222223e-06], mom=[[0.9, 0.999]] [2022-12-19 15:43:14,394] [INFO] [timer.py:196:stop] epoch=0/micro_step=900/global_step=900, RunningAvgSamplesPerSec=4.93406040285683, CurrSamplesPerSec=4.997473128027491, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0018, 'learning_rate': 9.122222222222223e-06, 'epoch': 12.86} [2022-12-19 15:45:46,440] [INFO] [logging.py:68:log_dist] [Rank 0] step=910, skipped=4, lr=[9.100000000000001e-06], mom=[[0.9, 0.999]] [2022-12-19 15:45:46,441] [INFO] [timer.py:196:stop] epoch=0/micro_step=910/global_step=910, RunningAvgSamplesPerSec=4.934397846811847, CurrSamplesPerSec=5.335555684355236, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 15:48:19,718] [INFO] [logging.py:68:log_dist] [Rank 0] step=920, skipped=4, lr=[9.077777777777779e-06], mom=[[0.9, 0.999]] [2022-12-19 15:48:19,719] [INFO] [timer.py:196:stop] epoch=0/micro_step=920/global_step=920, RunningAvgSamplesPerSec=4.934105943596077, CurrSamplesPerSec=4.728669681717305, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0015, 'learning_rate': 9.066666666666667e-06, 'epoch': 13.21} [2022-12-19 15:50:55,618] [INFO] [logging.py:68:log_dist] [Rank 0] step=930, skipped=4, lr=[9.055555555555556e-06], mom=[[0.9, 0.999]] [2022-12-19 15:50:55,620] [INFO] [timer.py:196:stop] epoch=0/micro_step=930/global_step=930, RunningAvgSamplesPerSec=4.9329043169365985, CurrSamplesPerSec=4.770316514010341, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 15:53:30,553] [INFO] [logging.py:68:log_dist] [Rank 0] step=940, skipped=4, lr=[9.033333333333334e-06], mom=[[0.9, 0.999]] [2022-12-19 15:53:30,554] [INFO] [timer.py:196:stop] epoch=0/micro_step=940/global_step=940, RunningAvgSamplesPerSec=4.932100753197186, CurrSamplesPerSec=4.881037799953705, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 15:56:06,793] [INFO] [logging.py:68:log_dist] [Rank 0] step=950, skipped=4, lr=[9.011111111111111e-06], mom=[[0.9, 0.999]] [2022-12-19 15:56:06,794] [INFO] [timer.py:196:stop] epoch=0/micro_step=950/global_step=950, RunningAvgSamplesPerSec=4.930800433891351, CurrSamplesPerSec=4.800189378580432, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0018, 'learning_rate': 9.011111111111111e-06, 'epoch': 13.57} [2022-12-19 15:58:42,760] [INFO] [logging.py:68:log_dist] [Rank 0] step=960, skipped=4, lr=[8.988888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 15:58:42,761] [INFO] [timer.py:196:stop] epoch=0/micro_step=960/global_step=960, RunningAvgSamplesPerSec=4.929535632898922, CurrSamplesPerSec=4.854309583752725, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 16:01:17,719] [INFO] [logging.py:68:log_dist] [Rank 0] step=970, skipped=4, lr=[8.966666666666667e-06], mom=[[0.9, 0.999]] [2022-12-19 16:01:17,720] [INFO] [timer.py:196:stop] epoch=0/micro_step=970/global_step=970, RunningAvgSamplesPerSec=4.929074176976312, CurrSamplesPerSec=5.365831210088781, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0012, 'learning_rate': 8.955555555555555e-06, 'epoch': 13.93} [2022-12-19 16:03:41,427] [INFO] [logging.py:68:log_dist] [Rank 0] step=980, skipped=4, lr=[8.944444444444446e-06], mom=[[0.9, 0.999]] [2022-12-19 16:03:41,428] [INFO] [timer.py:196:stop] epoch=0/micro_step=980/global_step=980, RunningAvgSamplesPerSec=4.932864222713419, CurrSamplesPerSec=5.449948670863249, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 16:06:06,943] [INFO] [logging.py:68:log_dist] [Rank 0] step=990, skipped=4, lr=[8.922222222222224e-06], mom=[[0.9, 0.999]] [2022-12-19 16:06:06,945] [INFO] [timer.py:196:stop] epoch=0/micro_step=990/global_step=990, RunningAvgSamplesPerSec=4.935715217442322, CurrSamplesPerSec=5.154145743675715, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 16:08:33,907] [INFO] [logging.py:68:log_dist] [Rank 0] step=1000, skipped=4, lr=[8.900000000000001e-06], mom=[[0.9, 0.999]] [2022-12-19 16:08:33,909] [INFO] [timer.py:196:stop] epoch=0/micro_step=1000/global_step=1000, RunningAvgSamplesPerSec=4.9379997074583395, CurrSamplesPerSec=5.124673891667898, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.001, 'learning_rate': 8.900000000000001e-06, 'epoch': 14.29} {'eval_loss': 0.398193359375, 'eval_wer': 23.14374107567825, 'eval_runtime': 830.3901, 'eval_samples_per_second': 2.73, 'eval_steps_per_second': 0.086, 'epoch': 14.29} [2022-12-19 16:22:27,447] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step1000 is begin to save! [2022-12-19 16:22:27,482] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: ./checkpoint-1000/global_step1000/mp_rank_00_model_states.pt [2022-12-19 16:22:27,482] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving ./checkpoint-1000/global_step1000/mp_rank_00_model_states.pt... [2022-12-19 16:22:30,465] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved ./checkpoint-1000/global_step1000/mp_rank_00_model_states.pt. [2022-12-19 16:22:30,466] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving ./checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_optim_states.pt... [2022-12-19 16:22:42,888] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved ./checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_optim_states.pt. [2022-12-19 16:22:42,888] [INFO] [engine.py:3394:_save_zero_checkpoint] zero checkpoint saved ./checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2022-12-19 16:22:42,888] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step1000 is ready now! [2022-12-19 16:25:54,150] [INFO] [logging.py:68:log_dist] [Rank 0] step=1010, skipped=4, lr=[8.877777777777779e-06], mom=[[0.9, 0.999]] [2022-12-19 16:25:54,151] [INFO] [timer.py:196:stop] epoch=0/micro_step=1010/global_step=1010, RunningAvgSamplesPerSec=4.946155612200781, CurrSamplesPerSec=5.427266503393029, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 16:28:23,813] [INFO] [logging.py:68:log_dist] [Rank 0] step=1020, skipped=4, lr=[8.855555555555556e-06], mom=[[0.9, 0.999]] [2022-12-19 16:28:23,815] [INFO] [timer.py:196:stop] epoch=0/micro_step=1020/global_step=1020, RunningAvgSamplesPerSec=4.947909070147826, CurrSamplesPerSec=4.948194983344168, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0008, 'learning_rate': 8.844444444444445e-06, 'epoch': 14.64} [2022-12-19 16:30:48,792] [INFO] [logging.py:68:log_dist] [Rank 0] step=1030, skipped=4, lr=[8.833333333333334e-06], mom=[[0.9, 0.999]] [2022-12-19 16:30:48,794] [INFO] [timer.py:196:stop] epoch=0/micro_step=1030/global_step=1030, RunningAvgSamplesPerSec=4.95104548255284, CurrSamplesPerSec=5.5161473542347155, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 16:33:08,775] [INFO] [logging.py:68:log_dist] [Rank 0] step=1040, skipped=4, lr=[8.811111111111112e-06], mom=[[0.9, 0.999]] [2022-12-19 16:33:08,776] [INFO] [timer.py:196:stop] epoch=0/micro_step=1040/global_step=1040, RunningAvgSamplesPerSec=4.955651586571475, CurrSamplesPerSec=5.480776866515935, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 16:35:29,232] [INFO] [logging.py:68:log_dist] [Rank 0] step=1050, skipped=4, lr=[8.788888888888891e-06], mom=[[0.9, 0.999]] [2022-12-19 16:35:29,233] [INFO] [timer.py:196:stop] epoch=0/micro_step=1050/global_step=1050, RunningAvgSamplesPerSec=4.960084763552949, CurrSamplesPerSec=5.492401854443362, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0007, 'learning_rate': 8.788888888888891e-06, 'epoch': 15.0} [2022-12-19 16:37:52,329] [INFO] [logging.py:68:log_dist] [Rank 0] step=1060, skipped=4, lr=[8.766666666666669e-06], mom=[[0.9, 0.999]] [2022-12-19 16:37:52,330] [INFO] [timer.py:196:stop] epoch=0/micro_step=1060/global_step=1060, RunningAvgSamplesPerSec=4.963590044894763, CurrSamplesPerSec=5.31988618374188, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 16:40:18,486] [INFO] [logging.py:68:log_dist] [Rank 0] step=1070, skipped=4, lr=[8.744444444444446e-06], mom=[[0.9, 0.999]] [2022-12-19 16:40:18,488] [INFO] [timer.py:196:stop] epoch=0/micro_step=1070/global_step=1070, RunningAvgSamplesPerSec=4.9662693022023605, CurrSamplesPerSec=5.34212095662918, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0006, 'learning_rate': 8.733333333333333e-06, 'epoch': 15.36} [2022-12-19 16:42:44,512] [INFO] [logging.py:68:log_dist] [Rank 0] step=1080, skipped=4, lr=[8.722222222222224e-06], mom=[[0.9, 0.999]] [2022-12-19 16:42:44,513] [INFO] [timer.py:196:stop] epoch=0/micro_step=1080/global_step=1080, RunningAvgSamplesPerSec=4.968768029840848, CurrSamplesPerSec=5.341763767861054, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 16:45:11,694] [INFO] [logging.py:68:log_dist] [Rank 0] step=1090, skipped=4, lr=[8.700000000000001e-06], mom=[[0.9, 0.999]] [2022-12-19 16:45:11,696] [INFO] [timer.py:196:stop] epoch=0/micro_step=1090/global_step=1090, RunningAvgSamplesPerSec=4.970716936745166, CurrSamplesPerSec=5.171019593071395, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 16:47:38,523] [INFO] [logging.py:68:log_dist] [Rank 0] step=1100, skipped=4, lr=[8.677777777777779e-06], mom=[[0.9, 0.999]] [2022-12-19 16:47:38,524] [INFO] [timer.py:196:stop] epoch=0/micro_step=1100/global_step=1100, RunningAvgSamplesPerSec=4.972778133592894, CurrSamplesPerSec=5.330383449780738, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0005, 'learning_rate': 8.677777777777779e-06, 'epoch': 15.71} [2022-12-19 16:50:04,965] [INFO] [logging.py:68:log_dist] [Rank 0] step=1110, skipped=4, lr=[8.655555555555557e-06], mom=[[0.9, 0.999]] [2022-12-19 16:50:04,966] [INFO] [timer.py:196:stop] epoch=0/micro_step=1110/global_step=1110, RunningAvgSamplesPerSec=4.974780379467304, CurrSamplesPerSec=5.171142517435853, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 16:52:31,835] [INFO] [logging.py:68:log_dist] [Rank 0] step=1120, skipped=4, lr=[8.633333333333334e-06], mom=[[0.9, 0.999]] [2022-12-19 16:52:31,837] [INFO] [timer.py:196:stop] epoch=0/micro_step=1120/global_step=1120, RunningAvgSamplesPerSec=4.976596873108193, CurrSamplesPerSec=5.29747262822537, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0008, 'learning_rate': 8.622222222222223e-06, 'epoch': 16.07} [2022-12-19 16:54:58,352] [INFO] [logging.py:68:log_dist] [Rank 0] step=1130, skipped=4, lr=[8.611111111111112e-06], mom=[[0.9, 0.999]] [2022-12-19 16:54:58,353] [INFO] [timer.py:196:stop] epoch=0/micro_step=1130/global_step=1130, RunningAvgSamplesPerSec=4.978445938121315, CurrSamplesPerSec=5.210862000919856, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 16:57:25,480] [INFO] [logging.py:68:log_dist] [Rank 0] step=1140, skipped=4, lr=[8.58888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 16:57:25,482] [INFO] [timer.py:196:stop] epoch=0/micro_step=1140/global_step=1140, RunningAvgSamplesPerSec=4.980338331622698, CurrSamplesPerSec=5.196840258344095, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 16:59:51,037] [INFO] [logging.py:68:log_dist] [Rank 0] step=1150, skipped=4, lr=[8.566666666666667e-06], mom=[[0.9, 0.999]] [2022-12-19 16:59:51,039] [INFO] [timer.py:196:stop] epoch=0/micro_step=1150/global_step=1150, RunningAvgSamplesPerSec=4.982735231478478, CurrSamplesPerSec=5.537131609186084, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0007, 'learning_rate': 8.566666666666667e-06, 'epoch': 16.43} [2022-12-19 17:02:09,443] [INFO] [logging.py:68:log_dist] [Rank 0] step=1160, skipped=4, lr=[8.544444444444445e-06], mom=[[0.9, 0.999]] [2022-12-19 17:02:09,444] [INFO] [timer.py:196:stop] epoch=0/micro_step=1160/global_step=1160, RunningAvgSamplesPerSec=4.987734487375767, CurrSamplesPerSec=5.400914122804589, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 17:04:31,640] [INFO] [logging.py:68:log_dist] [Rank 0] step=1170, skipped=4, lr=[8.522222222222222e-06], mom=[[0.9, 0.999]] [2022-12-19 17:04:31,641] [INFO] [timer.py:196:stop] epoch=0/micro_step=1170/global_step=1170, RunningAvgSamplesPerSec=4.991026786908645, CurrSamplesPerSec=5.1132182720378365, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0006, 'learning_rate': 8.511111111111113e-06, 'epoch': 16.79} [2022-12-19 17:06:59,036] [INFO] [logging.py:68:log_dist] [Rank 0] step=1180, skipped=4, lr=[8.5e-06], mom=[[0.9, 0.999]] [2022-12-19 17:06:59,037] [INFO] [timer.py:196:stop] epoch=0/micro_step=1180/global_step=1180, RunningAvgSamplesPerSec=4.99253365711222, CurrSamplesPerSec=5.182763986787628, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 17:09:25,982] [INFO] [logging.py:68:log_dist] [Rank 0] step=1190, skipped=4, lr=[8.477777777777778e-06], mom=[[0.9, 0.999]] [2022-12-19 17:09:25,984] [INFO] [timer.py:196:stop] epoch=0/micro_step=1190/global_step=1190, RunningAvgSamplesPerSec=4.9941447292969245, CurrSamplesPerSec=5.223813895502338, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 17:11:52,580] [INFO] [logging.py:68:log_dist] [Rank 0] step=1200, skipped=4, lr=[8.455555555555555e-06], mom=[[0.9, 0.999]] [2022-12-19 17:11:52,582] [INFO] [timer.py:196:stop] epoch=0/micro_step=1200/global_step=1200, RunningAvgSamplesPerSec=4.995659807165458, CurrSamplesPerSec=5.315487693162457, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0005, 'learning_rate': 8.455555555555555e-06, 'epoch': 17.14} [2022-12-19 17:14:19,243] [INFO] [logging.py:68:log_dist] [Rank 0] step=1210, skipped=4, lr=[8.433333333333334e-06], mom=[[0.9, 0.999]] [2022-12-19 17:14:19,245] [INFO] [timer.py:196:stop] epoch=0/micro_step=1210/global_step=1210, RunningAvgSamplesPerSec=4.99743772487544, CurrSamplesPerSec=5.233664661395427, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 17:16:40,713] [INFO] [logging.py:68:log_dist] [Rank 0] step=1220, skipped=4, lr=[8.411111111111112e-06], mom=[[0.9, 0.999]] [2022-12-19 17:16:40,715] [INFO] [timer.py:196:stop] epoch=0/micro_step=1220/global_step=1220, RunningAvgSamplesPerSec=5.000515493283669, CurrSamplesPerSec=5.390047022550994, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0005, 'learning_rate': 8.400000000000001e-06, 'epoch': 17.5} [2022-12-19 17:19:06,213] [INFO] [logging.py:68:log_dist] [Rank 0] step=1230, skipped=4, lr=[8.38888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 17:19:06,214] [INFO] [timer.py:196:stop] epoch=0/micro_step=1230/global_step=1230, RunningAvgSamplesPerSec=5.002513756654844, CurrSamplesPerSec=5.168636673137713, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 17:21:34,803] [INFO] [logging.py:68:log_dist] [Rank 0] step=1240, skipped=4, lr=[8.366666666666667e-06], mom=[[0.9, 0.999]] [2022-12-19 17:21:34,804] [INFO] [timer.py:196:stop] epoch=0/micro_step=1240/global_step=1240, RunningAvgSamplesPerSec=5.003637462526274, CurrSamplesPerSec=5.108512064344439, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 17:24:02,874] [INFO] [logging.py:68:log_dist] [Rank 0] step=1250, skipped=4, lr=[8.344444444444445e-06], mom=[[0.9, 0.999]] [2022-12-19 17:24:02,876] [INFO] [timer.py:196:stop] epoch=0/micro_step=1250/global_step=1250, RunningAvgSamplesPerSec=5.004850058775802, CurrSamplesPerSec=5.165307771079629, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0007, 'learning_rate': 8.344444444444445e-06, 'epoch': 17.86} [2022-12-19 17:26:27,879] [INFO] [logging.py:68:log_dist] [Rank 0] step=1260, skipped=4, lr=[8.322222222222223e-06], mom=[[0.9, 0.999]] [2022-12-19 17:26:27,881] [INFO] [timer.py:196:stop] epoch=0/micro_step=1260/global_step=1260, RunningAvgSamplesPerSec=5.006734729727626, CurrSamplesPerSec=5.409646293749169, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 17:28:52,506] [INFO] [logging.py:68:log_dist] [Rank 0] step=1270, skipped=4, lr=[8.3e-06], mom=[[0.9, 0.999]] [2022-12-19 17:28:52,507] [INFO] [timer.py:196:stop] epoch=0/micro_step=1270/global_step=1270, RunningAvgSamplesPerSec=5.008667810078256, CurrSamplesPerSec=5.263254861392433, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0009, 'learning_rate': 8.288888888888889e-06, 'epoch': 18.21} [2022-12-19 17:31:18,748] [INFO] [logging.py:68:log_dist] [Rank 0] step=1280, skipped=4, lr=[8.277777777777778e-06], mom=[[0.9, 0.999]] [2022-12-19 17:31:18,749] [INFO] [timer.py:196:stop] epoch=0/micro_step=1280/global_step=1280, RunningAvgSamplesPerSec=5.010395634381795, CurrSamplesPerSec=5.133399428913989, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 17:33:41,627] [INFO] [logging.py:68:log_dist] [Rank 0] step=1290, skipped=4, lr=[8.255555555555557e-06], mom=[[0.9, 0.999]] [2022-12-19 17:33:41,629] [INFO] [timer.py:196:stop] epoch=0/micro_step=1290/global_step=1290, RunningAvgSamplesPerSec=5.013034147201725, CurrSamplesPerSec=5.563105384457626, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 17:36:01,543] [INFO] [logging.py:68:log_dist] [Rank 0] step=1300, skipped=4, lr=[8.233333333333335e-06], mom=[[0.9, 0.999]] [2022-12-19 17:36:01,545] [INFO] [timer.py:196:stop] epoch=0/micro_step=1300/global_step=1300, RunningAvgSamplesPerSec=5.0163781171227635, CurrSamplesPerSec=5.499212470927062, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0004, 'learning_rate': 8.233333333333335e-06, 'epoch': 18.57} [2022-12-19 17:38:22,777] [INFO] [logging.py:68:log_dist] [Rank 0] step=1310, skipped=4, lr=[8.211111111111112e-06], mom=[[0.9, 0.999]] [2022-12-19 17:38:22,778] [INFO] [timer.py:196:stop] epoch=0/micro_step=1310/global_step=1310, RunningAvgSamplesPerSec=5.019608777760168, CurrSamplesPerSec=5.498456753866667, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 17:40:46,820] [INFO] [logging.py:68:log_dist] [Rank 0] step=1320, skipped=4, lr=[8.18888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 17:40:46,822] [INFO] [timer.py:196:stop] epoch=0/micro_step=1320/global_step=1320, RunningAvgSamplesPerSec=5.0214578371810275, CurrSamplesPerSec=5.212379233532717, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0005, 'learning_rate': 8.177777777777779e-06, 'epoch': 18.93} [2022-12-19 17:43:13,639] [INFO] [logging.py:68:log_dist] [Rank 0] step=1330, skipped=4, lr=[8.166666666666668e-06], mom=[[0.9, 0.999]] [2022-12-19 17:43:13,640] [INFO] [timer.py:196:stop] epoch=0/micro_step=1330/global_step=1330, RunningAvgSamplesPerSec=5.022633838147506, CurrSamplesPerSec=5.228448053717973, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 17:45:39,210] [INFO] [logging.py:68:log_dist] [Rank 0] step=1340, skipped=4, lr=[8.144444444444445e-06], mom=[[0.9, 0.999]] [2022-12-19 17:45:39,212] [INFO] [timer.py:196:stop] epoch=0/micro_step=1340/global_step=1340, RunningAvgSamplesPerSec=5.024334879899157, CurrSamplesPerSec=5.214608571866765, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 17:48:06,499] [INFO] [logging.py:68:log_dist] [Rank 0] step=1350, skipped=4, lr=[8.122222222222223e-06], mom=[[0.9, 0.999]] [2022-12-19 17:48:06,500] [INFO] [timer.py:196:stop] epoch=0/micro_step=1350/global_step=1350, RunningAvgSamplesPerSec=5.025300127711808, CurrSamplesPerSec=5.239038924938197, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0004, 'learning_rate': 8.122222222222223e-06, 'epoch': 19.29} [2022-12-19 17:50:33,042] [INFO] [logging.py:68:log_dist] [Rank 0] step=1360, skipped=4, lr=[8.1e-06], mom=[[0.9, 0.999]] [2022-12-19 17:50:33,044] [INFO] [timer.py:196:stop] epoch=0/micro_step=1360/global_step=1360, RunningAvgSamplesPerSec=5.026458786514721, CurrSamplesPerSec=5.222402876720334, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 17:53:01,272] [INFO] [logging.py:68:log_dist] [Rank 0] step=1370, skipped=4, lr=[8.077777777777778e-06], mom=[[0.9, 0.999]] [2022-12-19 17:53:01,273] [INFO] [timer.py:196:stop] epoch=0/micro_step=1370/global_step=1370, RunningAvgSamplesPerSec=5.027270630914416, CurrSamplesPerSec=5.045503737060974, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0004, 'learning_rate': 8.066666666666667e-06, 'epoch': 19.64} [2022-12-19 17:55:26,068] [INFO] [logging.py:68:log_dist] [Rank 0] step=1380, skipped=4, lr=[8.055555555555557e-06], mom=[[0.9, 0.999]] [2022-12-19 17:55:26,069] [INFO] [timer.py:196:stop] epoch=0/micro_step=1380/global_step=1380, RunningAvgSamplesPerSec=5.028856888820663, CurrSamplesPerSec=5.307613814297282, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 17:57:51,008] [INFO] [logging.py:68:log_dist] [Rank 0] step=1390, skipped=4, lr=[8.033333333333335e-06], mom=[[0.9, 0.999]] [2022-12-19 17:57:51,010] [INFO] [timer.py:196:stop] epoch=0/micro_step=1390/global_step=1390, RunningAvgSamplesPerSec=5.030442363983549, CurrSamplesPerSec=5.348233514952674, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 18:00:15,725] [INFO] [logging.py:68:log_dist] [Rank 0] step=1400, skipped=4, lr=[8.011111111111113e-06], mom=[[0.9, 0.999]] [2022-12-19 18:00:15,726] [INFO] [timer.py:196:stop] epoch=0/micro_step=1400/global_step=1400, RunningAvgSamplesPerSec=5.03213433821175, CurrSamplesPerSec=5.516520083070614, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0009, 'learning_rate': 8.011111111111113e-06, 'epoch': 20.0} [2022-12-19 18:02:41,881] [INFO] [logging.py:68:log_dist] [Rank 0] step=1410, skipped=4, lr=[7.98888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 18:02:41,883] [INFO] [timer.py:196:stop] epoch=0/micro_step=1410/global_step=1410, RunningAvgSamplesPerSec=5.03357037336381, CurrSamplesPerSec=5.256077476723685, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 18:05:06,604] [INFO] [logging.py:68:log_dist] [Rank 0] step=1420, skipped=4, lr=[7.966666666666668e-06], mom=[[0.9, 0.999]] [2022-12-19 18:05:06,605] [INFO] [timer.py:196:stop] epoch=0/micro_step=1420/global_step=1420, RunningAvgSamplesPerSec=5.035063806960154, CurrSamplesPerSec=5.375410536808028, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0014, 'learning_rate': 7.955555555555557e-06, 'epoch': 20.36} [2022-12-19 18:07:30,437] [INFO] [logging.py:68:log_dist] [Rank 0] step=1430, skipped=4, lr=[7.944444444444445e-06], mom=[[0.9, 0.999]] [2022-12-19 18:07:30,439] [INFO] [timer.py:196:stop] epoch=0/micro_step=1430/global_step=1430, RunningAvgSamplesPerSec=5.036971491138291, CurrSamplesPerSec=5.285265662831161, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 18:09:55,168] [INFO] [logging.py:68:log_dist] [Rank 0] step=1440, skipped=4, lr=[7.922222222222223e-06], mom=[[0.9, 0.999]] [2022-12-19 18:09:55,169] [INFO] [timer.py:196:stop] epoch=0/micro_step=1440/global_step=1440, RunningAvgSamplesPerSec=5.038604269410047, CurrSamplesPerSec=5.185768884195439, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 18:12:22,238] [INFO] [logging.py:68:log_dist] [Rank 0] step=1450, skipped=4, lr=[7.9e-06], mom=[[0.9, 0.999]] [2022-12-19 18:12:22,239] [INFO] [timer.py:196:stop] epoch=0/micro_step=1450/global_step=1450, RunningAvgSamplesPerSec=5.039437304325378, CurrSamplesPerSec=5.102659999488661, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0015, 'learning_rate': 7.9e-06, 'epoch': 20.71} [2022-12-19 18:14:49,744] [INFO] [logging.py:68:log_dist] [Rank 0] step=1460, skipped=4, lr=[7.877777777777778e-06], mom=[[0.9, 0.999]] [2022-12-19 18:14:49,746] [INFO] [timer.py:196:stop] epoch=0/micro_step=1460/global_step=1460, RunningAvgSamplesPerSec=5.040409150933666, CurrSamplesPerSec=5.11486394639355, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 18:17:13,902] [INFO] [logging.py:68:log_dist] [Rank 0] step=1470, skipped=4, lr=[7.855555555555556e-06], mom=[[0.9, 0.999]] [2022-12-19 18:17:13,903] [INFO] [timer.py:196:stop] epoch=0/micro_step=1470/global_step=1470, RunningAvgSamplesPerSec=5.041978266342931, CurrSamplesPerSec=5.47038542334029, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0018, 'learning_rate': 7.844444444444446e-06, 'epoch': 21.07} [2022-12-19 18:19:42,088] [INFO] [logging.py:68:log_dist] [Rank 0] step=1480, skipped=4, lr=[7.833333333333333e-06], mom=[[0.9, 0.999]] [2022-12-19 18:19:42,089] [INFO] [timer.py:196:stop] epoch=0/micro_step=1480/global_step=1480, RunningAvgSamplesPerSec=5.0424094545990386, CurrSamplesPerSec=4.956443088106126, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 18:22:14,789] [INFO] [logging.py:68:log_dist] [Rank 0] step=1490, skipped=4, lr=[7.811111111111111e-06], mom=[[0.9, 0.999]] [2022-12-19 18:22:14,790] [INFO] [timer.py:196:stop] epoch=0/micro_step=1490/global_step=1490, RunningAvgSamplesPerSec=5.041676545490984, CurrSamplesPerSec=4.679113370051968, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 18:24:48,008] [INFO] [logging.py:68:log_dist] [Rank 0] step=1500, skipped=4, lr=[7.788888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 18:24:48,009] [INFO] [timer.py:196:stop] epoch=0/micro_step=1500/global_step=1500, RunningAvgSamplesPerSec=5.04080329822501, CurrSamplesPerSec=4.8980657385413355, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0016, 'learning_rate': 7.788888888888889e-06, 'epoch': 21.43} [2022-12-19 18:27:24,828] [INFO] [logging.py:68:log_dist] [Rank 0] step=1510, skipped=4, lr=[7.766666666666666e-06], mom=[[0.9, 0.999]] [2022-12-19 18:27:24,829] [INFO] [timer.py:196:stop] epoch=0/micro_step=1510/global_step=1510, RunningAvgSamplesPerSec=5.039061336985348, CurrSamplesPerSec=4.840822813869123, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 18:30:01,945] [INFO] [logging.py:68:log_dist] [Rank 0] step=1520, skipped=4, lr=[7.744444444444446e-06], mom=[[0.9, 0.999]] [2022-12-19 18:30:01,946] [INFO] [timer.py:196:stop] epoch=0/micro_step=1520/global_step=1520, RunningAvgSamplesPerSec=5.037398556281625, CurrSamplesPerSec=4.619514582056782, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0039, 'learning_rate': 7.733333333333334e-06, 'epoch': 21.79} [2022-12-19 18:32:35,704] [INFO] [logging.py:68:log_dist] [Rank 0] step=1530, skipped=4, lr=[7.722222222222223e-06], mom=[[0.9, 0.999]] [2022-12-19 18:32:35,705] [INFO] [timer.py:196:stop] epoch=0/micro_step=1530/global_step=1530, RunningAvgSamplesPerSec=5.036572728664591, CurrSamplesPerSec=5.12912492502464, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 18:35:02,667] [INFO] [logging.py:68:log_dist] [Rank 0] step=1540, skipped=4, lr=[7.7e-06], mom=[[0.9, 0.999]] [2022-12-19 18:35:02,668] [INFO] [timer.py:196:stop] epoch=0/micro_step=1540/global_step=1540, RunningAvgSamplesPerSec=5.037306691282614, CurrSamplesPerSec=5.3328548436865395, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 18:37:30,298] [INFO] [logging.py:68:log_dist] [Rank 0] step=1550, skipped=4, lr=[7.677777777777778e-06], mom=[[0.9, 0.999]] [2022-12-19 18:37:30,300] [INFO] [timer.py:196:stop] epoch=0/micro_step=1550/global_step=1550, RunningAvgSamplesPerSec=5.038053458453126, CurrSamplesPerSec=5.160838294745109, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0059, 'learning_rate': 7.677777777777778e-06, 'epoch': 22.14} [2022-12-19 18:39:55,872] [INFO] [logging.py:68:log_dist] [Rank 0] step=1560, skipped=4, lr=[7.655555555555556e-06], mom=[[0.9, 0.999]] [2022-12-19 18:39:55,873] [INFO] [timer.py:196:stop] epoch=0/micro_step=1560/global_step=1560, RunningAvgSamplesPerSec=5.0392575103306765, CurrSamplesPerSec=5.181938180384615, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 18:42:22,825] [INFO] [logging.py:68:log_dist] [Rank 0] step=1570, skipped=4, lr=[7.633333333333334e-06], mom=[[0.9, 0.999]] [2022-12-19 18:42:22,826] [INFO] [timer.py:196:stop] epoch=0/micro_step=1570/global_step=1570, RunningAvgSamplesPerSec=5.039970012337326, CurrSamplesPerSec=4.951781314685514, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0044, 'learning_rate': 7.622222222222223e-06, 'epoch': 22.5} [2022-12-19 18:44:56,005] [INFO] [logging.py:68:log_dist] [Rank 0] step=1580, skipped=4, lr=[7.611111111111111e-06], mom=[[0.9, 0.999]] [2022-12-19 18:44:56,006] [INFO] [timer.py:196:stop] epoch=0/micro_step=1580/global_step=1580, RunningAvgSamplesPerSec=5.039153924492581, CurrSamplesPerSec=4.878267253173562, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 18:47:28,518] [INFO] [logging.py:68:log_dist] [Rank 0] step=1590, skipped=4, lr=[7.588888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 18:47:28,520] [INFO] [timer.py:196:stop] epoch=0/micro_step=1590/global_step=1590, RunningAvgSamplesPerSec=5.038529747410898, CurrSamplesPerSec=5.032858236022698, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 18:50:01,481] [INFO] [logging.py:68:log_dist] [Rank 0] step=1600, skipped=4, lr=[7.566666666666667e-06], mom=[[0.9, 0.999]] [2022-12-19 18:50:01,482] [INFO] [timer.py:196:stop] epoch=0/micro_step=1600/global_step=1600, RunningAvgSamplesPerSec=5.038026605408144, CurrSamplesPerSec=5.201494391150163, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0036, 'learning_rate': 7.566666666666667e-06, 'epoch': 22.86} [2022-12-19 18:52:29,195] [INFO] [logging.py:68:log_dist] [Rank 0] step=1610, skipped=4, lr=[7.544444444444445e-06], mom=[[0.9, 0.999]] [2022-12-19 18:52:29,196] [INFO] [timer.py:196:stop] epoch=0/micro_step=1610/global_step=1610, RunningAvgSamplesPerSec=5.038593388036559, CurrSamplesPerSec=5.242503604624011, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 18:54:57,983] [INFO] [logging.py:68:log_dist] [Rank 0] step=1620, skipped=4, lr=[7.5222222222222226e-06], mom=[[0.9, 0.999]] [2022-12-19 18:54:57,985] [INFO] [timer.py:196:stop] epoch=0/micro_step=1620/global_step=1620, RunningAvgSamplesPerSec=5.039092588050812, CurrSamplesPerSec=4.972800410931735, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0044, 'learning_rate': 7.511111111111111e-06, 'epoch': 23.21} [2022-12-19 18:57:32,417] [INFO] [logging.py:68:log_dist] [Rank 0] step=1630, skipped=4, lr=[7.500000000000001e-06], mom=[[0.9, 0.999]] [2022-12-19 18:57:32,419] [INFO] [timer.py:196:stop] epoch=0/micro_step=1630/global_step=1630, RunningAvgSamplesPerSec=5.03811736054203, CurrSamplesPerSec=5.009671008492361, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 18:59:58,487] [INFO] [logging.py:68:log_dist] [Rank 0] step=1640, skipped=4, lr=[7.477777777777779e-06], mom=[[0.9, 0.999]] [2022-12-19 18:59:58,488] [INFO] [timer.py:196:stop] epoch=0/micro_step=1640/global_step=1640, RunningAvgSamplesPerSec=5.038963626882841, CurrSamplesPerSec=5.27202534181593, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 19:02:25,603] [INFO] [logging.py:68:log_dist] [Rank 0] step=1650, skipped=4, lr=[7.455555555555556e-06], mom=[[0.9, 0.999]] [2022-12-19 19:02:25,605] [INFO] [timer.py:196:stop] epoch=0/micro_step=1650/global_step=1650, RunningAvgSamplesPerSec=5.039844547481239, CurrSamplesPerSec=5.089497264282554, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0052, 'learning_rate': 7.455555555555556e-06, 'epoch': 23.57} [2022-12-19 19:04:58,391] [INFO] [logging.py:68:log_dist] [Rank 0] step=1660, skipped=4, lr=[7.433333333333334e-06], mom=[[0.9, 0.999]] [2022-12-19 19:04:58,392] [INFO] [timer.py:196:stop] epoch=0/micro_step=1660/global_step=1660, RunningAvgSamplesPerSec=5.039225618417483, CurrSamplesPerSec=4.851855245184876, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 19:07:23,951] [INFO] [logging.py:68:log_dist] [Rank 0] step=1670, skipped=4, lr=[7.411111111111112e-06], mom=[[0.9, 0.999]] [2022-12-19 19:07:23,953] [INFO] [timer.py:196:stop] epoch=0/micro_step=1670/global_step=1670, RunningAvgSamplesPerSec=5.040339711057163, CurrSamplesPerSec=5.354476065900665, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0042, 'learning_rate': 7.4e-06, 'epoch': 23.93} [2022-12-19 19:09:54,646] [INFO] [logging.py:68:log_dist] [Rank 0] step=1680, skipped=4, lr=[7.38888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 19:09:54,648] [INFO] [timer.py:196:stop] epoch=0/micro_step=1680/global_step=1680, RunningAvgSamplesPerSec=5.040064982587634, CurrSamplesPerSec=5.0858070350307845, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 19:12:31,022] [INFO] [logging.py:68:log_dist] [Rank 0] step=1690, skipped=4, lr=[7.3666666666666676e-06], mom=[[0.9, 0.999]] [2022-12-19 19:12:31,024] [INFO] [timer.py:196:stop] epoch=0/micro_step=1690/global_step=1690, RunningAvgSamplesPerSec=5.038594415852604, CurrSamplesPerSec=4.842886167985758, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 19:15:08,420] [INFO] [logging.py:68:log_dist] [Rank 0] step=1700, skipped=4, lr=[7.344444444444445e-06], mom=[[0.9, 0.999]] [2022-12-19 19:15:08,422] [INFO] [timer.py:196:stop] epoch=0/micro_step=1700/global_step=1700, RunningAvgSamplesPerSec=5.03725834072836, CurrSamplesPerSec=5.027833379755982, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0025, 'learning_rate': 7.344444444444445e-06, 'epoch': 24.29} [2022-12-19 19:17:38,058] [INFO] [logging.py:68:log_dist] [Rank 0] step=1710, skipped=4, lr=[7.322222222222223e-06], mom=[[0.9, 0.999]] [2022-12-19 19:17:38,059] [INFO] [timer.py:196:stop] epoch=0/micro_step=1710/global_step=1710, RunningAvgSamplesPerSec=5.03742583567553, CurrSamplesPerSec=5.0947292535635995, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 19:20:13,847] [INFO] [logging.py:68:log_dist] [Rank 0] step=1720, skipped=4, lr=[7.3e-06], mom=[[0.9, 0.999]] [2022-12-19 19:20:13,848] [INFO] [timer.py:196:stop] epoch=0/micro_step=1720/global_step=1720, RunningAvgSamplesPerSec=5.0360691242583115, CurrSamplesPerSec=4.718946022335281, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0025, 'learning_rate': 7.28888888888889e-06, 'epoch': 24.64} [2022-12-19 19:22:49,792] [INFO] [logging.py:68:log_dist] [Rank 0] step=1730, skipped=4, lr=[7.277777777777778e-06], mom=[[0.9, 0.999]] [2022-12-19 19:22:49,794] [INFO] [timer.py:196:stop] epoch=0/micro_step=1730/global_step=1730, RunningAvgSamplesPerSec=5.034636285427808, CurrSamplesPerSec=4.839062764178785, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 19:25:22,009] [INFO] [logging.py:68:log_dist] [Rank 0] step=1740, skipped=4, lr=[7.255555555555556e-06], mom=[[0.9, 0.999]] [2022-12-19 19:25:22,010] [INFO] [timer.py:196:stop] epoch=0/micro_step=1740/global_step=1740, RunningAvgSamplesPerSec=5.034238050165783, CurrSamplesPerSec=5.2460325376781105, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 19:27:48,354] [INFO] [logging.py:68:log_dist] [Rank 0] step=1750, skipped=4, lr=[7.233333333333334e-06], mom=[[0.9, 0.999]] [2022-12-19 19:27:48,356] [INFO] [timer.py:196:stop] epoch=0/micro_step=1750/global_step=1750, RunningAvgSamplesPerSec=5.035059037529779, CurrSamplesPerSec=5.356405136018258, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0028, 'learning_rate': 7.233333333333334e-06, 'epoch': 25.0} [2022-12-19 19:30:14,518] [INFO] [logging.py:68:log_dist] [Rank 0] step=1760, skipped=4, lr=[7.211111111111112e-06], mom=[[0.9, 0.999]] [2022-12-19 19:30:14,520] [INFO] [timer.py:196:stop] epoch=0/micro_step=1760/global_step=1760, RunningAvgSamplesPerSec=5.035832039181515, CurrSamplesPerSec=5.240123919979665, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 19:32:41,229] [INFO] [logging.py:68:log_dist] [Rank 0] step=1770, skipped=4, lr=[7.188888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 19:32:41,231] [INFO] [timer.py:196:stop] epoch=0/micro_step=1770/global_step=1770, RunningAvgSamplesPerSec=5.036626792336216, CurrSamplesPerSec=5.2070103094303315, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0028, 'learning_rate': 7.177777777777778e-06, 'epoch': 25.36} [2022-12-19 19:35:07,466] [INFO] [logging.py:68:log_dist] [Rank 0] step=1780, skipped=4, lr=[7.166666666666667e-06], mom=[[0.9, 0.999]] [2022-12-19 19:35:07,468] [INFO] [timer.py:196:stop] epoch=0/micro_step=1780/global_step=1780, RunningAvgSamplesPerSec=5.037555229482233, CurrSamplesPerSec=5.3578219187047535, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 19:37:36,866] [INFO] [logging.py:68:log_dist] [Rank 0] step=1790, skipped=4, lr=[7.1444444444444446e-06], mom=[[0.9, 0.999]] [2022-12-19 19:37:36,867] [INFO] [timer.py:196:stop] epoch=0/micro_step=1790/global_step=1790, RunningAvgSamplesPerSec=5.037630697649268, CurrSamplesPerSec=4.929081755136833, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 19:40:11,678] [INFO] [logging.py:68:log_dist] [Rank 0] step=1800, skipped=4, lr=[7.122222222222222e-06], mom=[[0.9, 0.999]] [2022-12-19 19:40:11,680] [INFO] [timer.py:196:stop] epoch=0/micro_step=1800/global_step=1800, RunningAvgSamplesPerSec=5.0365151439189315, CurrSamplesPerSec=4.729848901816531, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0018, 'learning_rate': 7.122222222222222e-06, 'epoch': 25.71} [2022-12-19 19:42:45,372] [INFO] [logging.py:68:log_dist] [Rank 0] step=1810, skipped=4, lr=[7.100000000000001e-06], mom=[[0.9, 0.999]] [2022-12-19 19:42:45,373] [INFO] [timer.py:196:stop] epoch=0/micro_step=1810/global_step=1810, RunningAvgSamplesPerSec=5.035565687377094, CurrSamplesPerSec=5.146164373810718, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 19:45:18,964] [INFO] [logging.py:68:log_dist] [Rank 0] step=1820, skipped=4, lr=[7.077777777777778e-06], mom=[[0.9, 0.999]] [2022-12-19 19:45:18,965] [INFO] [timer.py:196:stop] epoch=0/micro_step=1820/global_step=1820, RunningAvgSamplesPerSec=5.0347239572198905, CurrSamplesPerSec=4.924589090991946, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0018, 'learning_rate': 7.066666666666667e-06, 'epoch': 26.07} [2022-12-19 19:47:54,057] [INFO] [logging.py:68:log_dist] [Rank 0] step=1830, skipped=4, lr=[7.055555555555557e-06], mom=[[0.9, 0.999]] [2022-12-19 19:47:54,058] [INFO] [timer.py:196:stop] epoch=0/micro_step=1830/global_step=1830, RunningAvgSamplesPerSec=5.033575677283027, CurrSamplesPerSec=4.910401868428197, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 19:50:28,025] [INFO] [logging.py:68:log_dist] [Rank 0] step=1840, skipped=4, lr=[7.033333333333334e-06], mom=[[0.9, 0.999]] [2022-12-19 19:50:28,027] [INFO] [timer.py:196:stop] epoch=0/micro_step=1840/global_step=1840, RunningAvgSamplesPerSec=5.032681136359626, CurrSamplesPerSec=4.832177040254063, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 19:53:02,025] [INFO] [logging.py:68:log_dist] [Rank 0] step=1850, skipped=4, lr=[7.011111111111112e-06], mom=[[0.9, 0.999]] [2022-12-19 19:53:02,027] [INFO] [timer.py:196:stop] epoch=0/micro_step=1850/global_step=1850, RunningAvgSamplesPerSec=5.0318871588207035, CurrSamplesPerSec=4.90743649178092, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.001, 'learning_rate': 7.011111111111112e-06, 'epoch': 26.43} [2022-12-19 19:55:36,290] [INFO] [logging.py:68:log_dist] [Rank 0] step=1860, skipped=4, lr=[6.9888888888888895e-06], mom=[[0.9, 0.999]] [2022-12-19 19:55:36,291] [INFO] [timer.py:196:stop] epoch=0/micro_step=1860/global_step=1860, RunningAvgSamplesPerSec=5.030966724572737, CurrSamplesPerSec=4.952871752953277, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 19:58:06,650] [INFO] [logging.py:68:log_dist] [Rank 0] step=1870, skipped=4, lr=[6.966666666666667e-06], mom=[[0.9, 0.999]] [2022-12-19 19:58:06,652] [INFO] [timer.py:196:stop] epoch=0/micro_step=1870/global_step=1870, RunningAvgSamplesPerSec=5.030922085361869, CurrSamplesPerSec=5.192533921299166, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.001, 'learning_rate': 6.955555555555557e-06, 'epoch': 26.79} [2022-12-19 20:00:32,959] [INFO] [logging.py:68:log_dist] [Rank 0] step=1880, skipped=4, lr=[6.944444444444445e-06], mom=[[0.9, 0.999]] [2022-12-19 20:00:32,960] [INFO] [timer.py:196:stop] epoch=0/micro_step=1880/global_step=1880, RunningAvgSamplesPerSec=5.031688584153497, CurrSamplesPerSec=5.111605095734249, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 20:02:58,759] [INFO] [logging.py:68:log_dist] [Rank 0] step=1890, skipped=4, lr=[6.922222222222222e-06], mom=[[0.9, 0.999]] [2022-12-19 20:02:58,761] [INFO] [timer.py:196:stop] epoch=0/micro_step=1890/global_step=1890, RunningAvgSamplesPerSec=5.032571034718714, CurrSamplesPerSec=5.410265041165288, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 20:05:24,222] [INFO] [logging.py:68:log_dist] [Rank 0] step=1900, skipped=4, lr=[6.9e-06], mom=[[0.9, 0.999]] [2022-12-19 20:05:24,223] [INFO] [timer.py:196:stop] epoch=0/micro_step=1900/global_step=1900, RunningAvgSamplesPerSec=5.033515074802237, CurrSamplesPerSec=5.348844580995101, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0008, 'learning_rate': 6.9e-06, 'epoch': 27.14} [2022-12-19 20:07:54,523] [INFO] [logging.py:68:log_dist] [Rank 0] step=1910, skipped=4, lr=[6.8777777777777785e-06], mom=[[0.9, 0.999]] [2022-12-19 20:07:54,525] [INFO] [timer.py:196:stop] epoch=0/micro_step=1910/global_step=1910, RunningAvgSamplesPerSec=5.033415110951943, CurrSamplesPerSec=4.834957819484288, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 20:10:29,744] [INFO] [logging.py:68:log_dist] [Rank 0] step=1920, skipped=4, lr=[6.855555555555556e-06], mom=[[0.9, 0.999]] [2022-12-19 20:10:29,745] [INFO] [timer.py:196:stop] epoch=0/micro_step=1920/global_step=1920, RunningAvgSamplesPerSec=5.032444281134745, CurrSamplesPerSec=4.842774510078423, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0009, 'learning_rate': 6.844444444444445e-06, 'epoch': 27.5} [2022-12-19 20:13:03,727] [INFO] [logging.py:68:log_dist] [Rank 0] step=1930, skipped=4, lr=[6.833333333333334e-06], mom=[[0.9, 0.999]] [2022-12-19 20:13:03,729] [INFO] [timer.py:196:stop] epoch=0/micro_step=1930/global_step=1930, RunningAvgSamplesPerSec=5.031589035431254, CurrSamplesPerSec=4.987772578098499, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 20:15:37,851] [INFO] [logging.py:68:log_dist] [Rank 0] step=1940, skipped=4, lr=[6.811111111111111e-06], mom=[[0.9, 0.999]] [2022-12-19 20:15:37,853] [INFO] [timer.py:196:stop] epoch=0/micro_step=1940/global_step=1940, RunningAvgSamplesPerSec=5.030830729777499, CurrSamplesPerSec=4.767701193993327, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 20:18:13,963] [INFO] [logging.py:68:log_dist] [Rank 0] step=1950, skipped=4, lr=[6.788888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 20:18:13,964] [INFO] [timer.py:196:stop] epoch=0/micro_step=1950/global_step=1950, RunningAvgSamplesPerSec=5.029536899455794, CurrSamplesPerSec=4.887162085044578, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0007, 'learning_rate': 6.788888888888889e-06, 'epoch': 27.86} [2022-12-19 20:20:40,486] [INFO] [logging.py:68:log_dist] [Rank 0] step=1960, skipped=4, lr=[6.7666666666666665e-06], mom=[[0.9, 0.999]] [2022-12-19 20:20:40,487] [INFO] [timer.py:196:stop] epoch=0/micro_step=1960/global_step=1960, RunningAvgSamplesPerSec=5.0303633009507145, CurrSamplesPerSec=5.424337984250442, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 20:23:05,252] [INFO] [logging.py:68:log_dist] [Rank 0] step=1970, skipped=4, lr=[6.744444444444444e-06], mom=[[0.9, 0.999]] [2022-12-19 20:23:05,253] [INFO] [timer.py:196:stop] epoch=0/micro_step=1970/global_step=1970, RunningAvgSamplesPerSec=5.031453930598145, CurrSamplesPerSec=5.358133129666003, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0008, 'learning_rate': 6.733333333333334e-06, 'epoch': 28.21} [2022-12-19 20:25:30,107] [INFO] [logging.py:68:log_dist] [Rank 0] step=1980, skipped=4, lr=[6.7222222222222235e-06], mom=[[0.9, 0.999]] [2022-12-19 20:25:30,108] [INFO] [timer.py:196:stop] epoch=0/micro_step=1980/global_step=1980, RunningAvgSamplesPerSec=5.032468391176715, CurrSamplesPerSec=5.292328019248439, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 20:27:54,491] [INFO] [logging.py:68:log_dist] [Rank 0] step=1990, skipped=4, lr=[6.700000000000001e-06], mom=[[0.9, 0.999]] [2022-12-19 20:27:54,492] [INFO] [timer.py:196:stop] epoch=0/micro_step=1990/global_step=1990, RunningAvgSamplesPerSec=5.033507364974611, CurrSamplesPerSec=5.41738267814836, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 20:30:20,024] [INFO] [logging.py:68:log_dist] [Rank 0] step=2000, skipped=4, lr=[6.677777777777779e-06], mom=[[0.9, 0.999]] [2022-12-19 20:30:20,026] [INFO] [timer.py:196:stop] epoch=0/micro_step=2000/global_step=2000, RunningAvgSamplesPerSec=5.034536971603142, CurrSamplesPerSec=5.407613219940482, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0013, 'learning_rate': 6.677777777777779e-06, 'epoch': 28.57} {'eval_loss': 0.434326171875, 'eval_wer': 24.036173250832938, 'eval_runtime': 808.1708, 'eval_samples_per_second': 2.805, 'eval_steps_per_second': 0.088, 'epoch': 28.57} [2022-12-19 20:43:51,070] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step2000 is begin to save! [2022-12-19 20:43:51,082] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: ./checkpoint-2000/global_step2000/mp_rank_00_model_states.pt [2022-12-19 20:43:51,082] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving ./checkpoint-2000/global_step2000/mp_rank_00_model_states.pt... [2022-12-19 20:43:53,897] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved ./checkpoint-2000/global_step2000/mp_rank_00_model_states.pt. [2022-12-19 20:43:53,899] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving ./checkpoint-2000/global_step2000/zero_pp_rank_0_mp_rank_00_optim_states.pt... [2022-12-19 20:44:10,315] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved ./checkpoint-2000/global_step2000/zero_pp_rank_0_mp_rank_00_optim_states.pt. [2022-12-19 20:44:10,316] [INFO] [engine.py:3394:_save_zero_checkpoint] zero checkpoint saved ./checkpoint-2000/global_step2000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2022-12-19 20:44:10,316] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step2000 is ready now! [2022-12-19 20:48:49,363] [INFO] [logging.py:68:log_dist] [Rank 0] step=2010, skipped=4, lr=[6.655555555555556e-06], mom=[[0.9, 0.999]] [2022-12-19 20:48:49,364] [INFO] [timer.py:196:stop] epoch=0/micro_step=2010/global_step=2010, RunningAvgSamplesPerSec=5.038788625814539, CurrSamplesPerSec=5.741111115008353, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 20:51:10,071] [INFO] [logging.py:68:log_dist] [Rank 0] step=2020, skipped=4, lr=[6.633333333333334e-06], mom=[[0.9, 0.999]] [2022-12-19 20:51:10,073] [INFO] [timer.py:196:stop] epoch=0/micro_step=2020/global_step=2020, RunningAvgSamplesPerSec=5.040715890336116, CurrSamplesPerSec=5.254622747659462, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0009, 'learning_rate': 6.6222222222222236e-06, 'epoch': 28.93} [2022-12-19 20:53:42,301] [INFO] [logging.py:68:log_dist] [Rank 0] step=2030, skipped=4, lr=[6.6111111111111115e-06], mom=[[0.9, 0.999]] [2022-12-19 20:53:42,302] [INFO] [timer.py:196:stop] epoch=0/micro_step=2030/global_step=2030, RunningAvgSamplesPerSec=5.040270454083024, CurrSamplesPerSec=5.019867297026858, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 20:56:17,392] [INFO] [logging.py:68:log_dist] [Rank 0] step=2040, skipped=4, lr=[6.588888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 20:56:17,394] [INFO] [timer.py:196:stop] epoch=0/micro_step=2040/global_step=2040, RunningAvgSamplesPerSec=5.039258280926622, CurrSamplesPerSec=4.872748852305179, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 20:58:53,712] [INFO] [logging.py:68:log_dist] [Rank 0] step=2050, skipped=4, lr=[6.566666666666667e-06], mom=[[0.9, 0.999]] [2022-12-19 20:58:53,713] [INFO] [timer.py:196:stop] epoch=0/micro_step=2050/global_step=2050, RunningAvgSamplesPerSec=5.038154269213655, CurrSamplesPerSec=4.681309883510423, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0004, 'learning_rate': 6.566666666666667e-06, 'epoch': 29.29} [2022-12-19 21:01:27,234] [INFO] [logging.py:68:log_dist] [Rank 0] step=2060, skipped=4, lr=[6.544444444444445e-06], mom=[[0.9, 0.999]] [2022-12-19 21:01:27,235] [INFO] [timer.py:196:stop] epoch=0/micro_step=2060/global_step=2060, RunningAvgSamplesPerSec=5.037395132928792, CurrSamplesPerSec=4.886965900510803, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 21:04:05,176] [INFO] [logging.py:68:log_dist] [Rank 0] step=2070, skipped=4, lr=[6.522222222222223e-06], mom=[[0.9, 0.999]] [2022-12-19 21:04:05,177] [INFO] [timer.py:196:stop] epoch=0/micro_step=2070/global_step=2070, RunningAvgSamplesPerSec=5.036089401662635, CurrSamplesPerSec=4.816204538361026, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0003, 'learning_rate': 6.511111111111112e-06, 'epoch': 29.64} [2022-12-19 21:06:40,899] [INFO] [logging.py:68:log_dist] [Rank 0] step=2080, skipped=4, lr=[6.5000000000000004e-06], mom=[[0.9, 0.999]] [2022-12-19 21:06:40,900] [INFO] [timer.py:196:stop] epoch=0/micro_step=2080/global_step=2080, RunningAvgSamplesPerSec=5.034928038568274, CurrSamplesPerSec=4.694190470229832, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 21:09:16,575] [INFO] [logging.py:68:log_dist] [Rank 0] step=2090, skipped=4, lr=[6.477777777777778e-06], mom=[[0.9, 0.999]] [2022-12-19 21:09:16,577] [INFO] [timer.py:196:stop] epoch=0/micro_step=2090/global_step=2090, RunningAvgSamplesPerSec=5.033722883648811, CurrSamplesPerSec=4.85714578324611, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 21:11:52,443] [INFO] [logging.py:68:log_dist] [Rank 0] step=2100, skipped=4, lr=[6.455555555555556e-06], mom=[[0.9, 0.999]] [2022-12-19 21:11:52,444] [INFO] [timer.py:196:stop] epoch=0/micro_step=2100/global_step=2100, RunningAvgSamplesPerSec=5.032629475613512, CurrSamplesPerSec=5.084535161650288, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0012, 'learning_rate': 6.455555555555556e-06, 'epoch': 30.0} [2022-12-19 21:14:22,776] [INFO] [logging.py:68:log_dist] [Rank 0] step=2110, skipped=4, lr=[6.433333333333333e-06], mom=[[0.9, 0.999]] [2022-12-19 21:14:22,777] [INFO] [timer.py:196:stop] epoch=0/micro_step=2110/global_step=2110, RunningAvgSamplesPerSec=5.0325833367865185, CurrSamplesPerSec=5.142876958436827, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 21:16:54,494] [INFO] [logging.py:68:log_dist] [Rank 0] step=2120, skipped=4, lr=[6.411111111111111e-06], mom=[[0.9, 0.999]] [2022-12-19 21:16:54,495] [INFO] [timer.py:196:stop] epoch=0/micro_step=2120/global_step=2120, RunningAvgSamplesPerSec=5.03229975114763, CurrSamplesPerSec=4.753657521830555, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0008, 'learning_rate': 6.4000000000000006e-06, 'epoch': 30.36} [2022-12-19 21:19:30,416] [INFO] [logging.py:68:log_dist] [Rank 0] step=2130, skipped=4, lr=[6.3888888888888885e-06], mom=[[0.9, 0.999]] [2022-12-19 21:19:30,417] [INFO] [timer.py:196:stop] epoch=0/micro_step=2130/global_step=2130, RunningAvgSamplesPerSec=5.031186441737795, CurrSamplesPerSec=4.72291542319439, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 21:21:59,392] [INFO] [logging.py:68:log_dist] [Rank 0] step=2140, skipped=4, lr=[6.366666666666668e-06], mom=[[0.9, 0.999]] [2022-12-19 21:21:59,393] [INFO] [timer.py:196:stop] epoch=0/micro_step=2140/global_step=2140, RunningAvgSamplesPerSec=5.031371713224296, CurrSamplesPerSec=5.061862173402866, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 21:24:31,836] [INFO] [logging.py:68:log_dist] [Rank 0] step=2150, skipped=4, lr=[6.3444444444444454e-06], mom=[[0.9, 0.999]] [2022-12-19 21:24:31,838] [INFO] [timer.py:196:stop] epoch=0/micro_step=2150/global_step=2150, RunningAvgSamplesPerSec=5.030856711667038, CurrSamplesPerSec=4.770597975006471, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0003, 'learning_rate': 6.3444444444444454e-06, 'epoch': 30.71} [2022-12-19 21:27:06,954] [INFO] [logging.py:68:log_dist] [Rank 0] step=2160, skipped=4, lr=[6.322222222222223e-06], mom=[[0.9, 0.999]] [2022-12-19 21:27:06,956] [INFO] [timer.py:196:stop] epoch=0/micro_step=2160/global_step=2160, RunningAvgSamplesPerSec=5.029801717943637, CurrSamplesPerSec=4.936408751126107, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 21:29:43,922] [INFO] [logging.py:68:log_dist] [Rank 0] step=2170, skipped=4, lr=[6.300000000000001e-06], mom=[[0.9, 0.999]] [2022-12-19 21:29:43,923] [INFO] [timer.py:196:stop] epoch=0/micro_step=2170/global_step=2170, RunningAvgSamplesPerSec=5.028505606586086, CurrSamplesPerSec=4.881118034247176, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0003, 'learning_rate': 6.28888888888889e-06, 'epoch': 31.07} [2022-12-19 21:32:21,743] [INFO] [logging.py:68:log_dist] [Rank 0] step=2180, skipped=4, lr=[6.277777777777778e-06], mom=[[0.9, 0.999]] [2022-12-19 21:32:21,744] [INFO] [timer.py:196:stop] epoch=0/micro_step=2180/global_step=2180, RunningAvgSamplesPerSec=5.027105235023926, CurrSamplesPerSec=4.705058872248197, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 21:34:57,949] [INFO] [logging.py:68:log_dist] [Rank 0] step=2190, skipped=4, lr=[6.255555555555556e-06], mom=[[0.9, 0.999]] [2022-12-19 21:34:57,951] [INFO] [timer.py:196:stop] epoch=0/micro_step=2190/global_step=2190, RunningAvgSamplesPerSec=5.026135178611733, CurrSamplesPerSec=4.94455846679253, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 21:37:35,723] [INFO] [logging.py:68:log_dist] [Rank 0] step=2200, skipped=4, lr=[6.2333333333333335e-06], mom=[[0.9, 0.999]] [2022-12-19 21:37:35,725] [INFO] [timer.py:196:stop] epoch=0/micro_step=2200/global_step=2200, RunningAvgSamplesPerSec=5.024799073703429, CurrSamplesPerSec=4.887763103861499, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 6.2333333333333335e-06, 'epoch': 31.43} [2022-12-19 21:40:10,718] [INFO] [logging.py:68:log_dist] [Rank 0] step=2210, skipped=4, lr=[6.211111111111111e-06], mom=[[0.9, 0.999]] [2022-12-19 21:40:10,719] [INFO] [timer.py:196:stop] epoch=0/micro_step=2210/global_step=2210, RunningAvgSamplesPerSec=5.023907776541282, CurrSamplesPerSec=4.806432721596209, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 21:42:47,338] [INFO] [logging.py:68:log_dist] [Rank 0] step=2220, skipped=4, lr=[6.18888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 21:42:47,340] [INFO] [timer.py:196:stop] epoch=0/micro_step=2220/global_step=2220, RunningAvgSamplesPerSec=5.022768396353744, CurrSamplesPerSec=4.9585817492027555, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 6.177777777777778e-06, 'epoch': 31.79} [2022-12-19 21:45:23,026] [INFO] [logging.py:68:log_dist] [Rank 0] step=2230, skipped=4, lr=[6.166666666666667e-06], mom=[[0.9, 0.999]] [2022-12-19 21:45:23,028] [INFO] [timer.py:196:stop] epoch=0/micro_step=2230/global_step=2230, RunningAvgSamplesPerSec=5.02194653297855, CurrSamplesPerSec=4.888745928753426, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 21:47:55,516] [INFO] [logging.py:68:log_dist] [Rank 0] step=2240, skipped=4, lr=[6.144444444444445e-06], mom=[[0.9, 0.999]] [2022-12-19 21:47:55,518] [INFO] [timer.py:196:stop] epoch=0/micro_step=2240/global_step=2240, RunningAvgSamplesPerSec=5.021521486015416, CurrSamplesPerSec=5.249981830890771, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 21:50:30,110] [INFO] [logging.py:68:log_dist] [Rank 0] step=2250, skipped=4, lr=[6.1222222222222224e-06], mom=[[0.9, 0.999]] [2022-12-19 21:50:30,112] [INFO] [timer.py:196:stop] epoch=0/micro_step=2250/global_step=2250, RunningAvgSamplesPerSec=5.020656272766083, CurrSamplesPerSec=4.861155926895916, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 6.1222222222222224e-06, 'epoch': 32.14} [2022-12-19 21:53:04,940] [INFO] [logging.py:68:log_dist] [Rank 0] step=2260, skipped=4, lr=[6.1e-06], mom=[[0.9, 0.999]] [2022-12-19 21:53:04,941] [INFO] [timer.py:196:stop] epoch=0/micro_step=2260/global_step=2260, RunningAvgSamplesPerSec=5.01979590939625, CurrSamplesPerSec=5.064675246824081, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 21:55:40,819] [INFO] [logging.py:68:log_dist] [Rank 0] step=2270, skipped=4, lr=[6.077777777777778e-06], mom=[[0.9, 0.999]] [2022-12-19 21:55:40,820] [INFO] [timer.py:196:stop] epoch=0/micro_step=2270/global_step=2270, RunningAvgSamplesPerSec=5.018871844157221, CurrSamplesPerSec=4.804179155390462, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 6.066666666666667e-06, 'epoch': 32.5} [2022-12-19 21:58:16,154] [INFO] [logging.py:68:log_dist] [Rank 0] step=2280, skipped=4, lr=[6.055555555555555e-06], mom=[[0.9, 0.999]] [2022-12-19 21:58:16,156] [INFO] [timer.py:196:stop] epoch=0/micro_step=2280/global_step=2280, RunningAvgSamplesPerSec=5.018020324016142, CurrSamplesPerSec=5.01544438861877, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 22:00:47,842] [INFO] [logging.py:68:log_dist] [Rank 0] step=2290, skipped=4, lr=[6.033333333333335e-06], mom=[[0.9, 0.999]] [2022-12-19 22:00:47,843] [INFO] [timer.py:196:stop] epoch=0/micro_step=2290/global_step=2290, RunningAvgSamplesPerSec=5.017767410098977, CurrSamplesPerSec=4.919736793658102, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 22:03:13,587] [INFO] [logging.py:68:log_dist] [Rank 0] step=2300, skipped=4, lr=[6.011111111111112e-06], mom=[[0.9, 0.999]] [2022-12-19 22:03:13,589] [INFO] [timer.py:196:stop] epoch=0/micro_step=2300/global_step=2300, RunningAvgSamplesPerSec=5.01856742931561, CurrSamplesPerSec=5.429814844508236, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 6.011111111111112e-06, 'epoch': 32.86} [2022-12-19 22:05:37,065] [INFO] [logging.py:68:log_dist] [Rank 0] step=2310, skipped=4, lr=[5.98888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 22:05:37,066] [INFO] [timer.py:196:stop] epoch=0/micro_step=2310/global_step=2310, RunningAvgSamplesPerSec=5.019755239622022, CurrSamplesPerSec=5.229748428555174, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 22:08:08,233] [INFO] [logging.py:68:log_dist] [Rank 0] step=2320, skipped=4, lr=[5.966666666666667e-06], mom=[[0.9, 0.999]] [2022-12-19 22:08:08,234] [INFO] [timer.py:196:stop] epoch=0/micro_step=2320/global_step=2320, RunningAvgSamplesPerSec=5.019518685208239, CurrSamplesPerSec=4.818575964276307, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 5.955555555555555e-06, 'epoch': 33.21} [2022-12-19 22:10:39,324] [INFO] [logging.py:68:log_dist] [Rank 0] step=2330, skipped=4, lr=[5.944444444444445e-06], mom=[[0.9, 0.999]] [2022-12-19 22:10:39,325] [INFO] [timer.py:196:stop] epoch=0/micro_step=2330/global_step=2330, RunningAvgSamplesPerSec=5.01941881219482, CurrSamplesPerSec=5.380053932125595, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 22:13:01,481] [INFO] [logging.py:68:log_dist] [Rank 0] step=2340, skipped=4, lr=[5.922222222222223e-06], mom=[[0.9, 0.999]] [2022-12-19 22:13:01,482] [INFO] [timer.py:196:stop] epoch=0/micro_step=2340/global_step=2340, RunningAvgSamplesPerSec=5.0207384509667285, CurrSamplesPerSec=5.388863685739212, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 22:15:33,984] [INFO] [logging.py:68:log_dist] [Rank 0] step=2350, skipped=4, lr=[5.9e-06], mom=[[0.9, 0.999]] [2022-12-19 22:15:33,986] [INFO] [timer.py:196:stop] epoch=0/micro_step=2350/global_step=2350, RunningAvgSamplesPerSec=5.020404647594999, CurrSamplesPerSec=4.906101612998677, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 5.9e-06, 'epoch': 33.57} [2022-12-19 22:18:08,131] [INFO] [logging.py:68:log_dist] [Rank 0] step=2360, skipped=4, lr=[5.877777777777778e-06], mom=[[0.9, 0.999]] [2022-12-19 22:18:08,133] [INFO] [timer.py:196:stop] epoch=0/micro_step=2360/global_step=2360, RunningAvgSamplesPerSec=5.0196505126327695, CurrSamplesPerSec=4.790363461506992, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 22:20:42,957] [INFO] [logging.py:68:log_dist] [Rank 0] step=2370, skipped=4, lr=[5.855555555555556e-06], mom=[[0.9, 0.999]] [2022-12-19 22:20:42,958] [INFO] [timer.py:196:stop] epoch=0/micro_step=2370/global_step=2370, RunningAvgSamplesPerSec=5.018941689446698, CurrSamplesPerSec=4.721525385971974, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 5.844444444444445e-06, 'epoch': 33.93} [2022-12-19 22:23:18,124] [INFO] [logging.py:68:log_dist] [Rank 0] step=2380, skipped=4, lr=[5.833333333333334e-06], mom=[[0.9, 0.999]] [2022-12-19 22:23:18,126] [INFO] [timer.py:196:stop] epoch=0/micro_step=2380/global_step=2380, RunningAvgSamplesPerSec=5.018156049458438, CurrSamplesPerSec=4.840106212955945, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 22:25:52,836] [INFO] [logging.py:68:log_dist] [Rank 0] step=2390, skipped=4, lr=[5.8111111111111116e-06], mom=[[0.9, 0.999]] [2022-12-19 22:25:52,837] [INFO] [timer.py:196:stop] epoch=0/micro_step=2390/global_step=2390, RunningAvgSamplesPerSec=5.017432632220724, CurrSamplesPerSec=4.93816130474155, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 22:28:25,540] [INFO] [logging.py:68:log_dist] [Rank 0] step=2400, skipped=4, lr=[5.788888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 22:28:25,541] [INFO] [timer.py:196:stop] epoch=0/micro_step=2400/global_step=2400, RunningAvgSamplesPerSec=5.017062338894143, CurrSamplesPerSec=4.959936911431736, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 5.788888888888889e-06, 'epoch': 34.29} [2022-12-19 22:31:00,571] [INFO] [logging.py:68:log_dist] [Rank 0] step=2410, skipped=4, lr=[5.766666666666667e-06], mom=[[0.9, 0.999]] [2022-12-19 22:31:00,572] [INFO] [timer.py:196:stop] epoch=0/micro_step=2410/global_step=2410, RunningAvgSamplesPerSec=5.016319599801056, CurrSamplesPerSec=4.862533133179454, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 22:33:36,295] [INFO] [logging.py:68:log_dist] [Rank 0] step=2420, skipped=4, lr=[5.744444444444444e-06], mom=[[0.9, 0.999]] [2022-12-19 22:33:36,297] [INFO] [timer.py:196:stop] epoch=0/micro_step=2420/global_step=2420, RunningAvgSamplesPerSec=5.015410422736218, CurrSamplesPerSec=4.755821630638045, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 5.733333333333334e-06, 'epoch': 34.64} [2022-12-19 22:36:12,370] [INFO] [logging.py:68:log_dist] [Rank 0] step=2430, skipped=4, lr=[5.722222222222222e-06], mom=[[0.9, 0.999]] [2022-12-19 22:36:12,371] [INFO] [timer.py:196:stop] epoch=0/micro_step=2430/global_step=2430, RunningAvgSamplesPerSec=5.0144801043047, CurrSamplesPerSec=4.6761908766884455, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 22:38:47,257] [INFO] [logging.py:68:log_dist] [Rank 0] step=2440, skipped=4, lr=[5.7e-06], mom=[[0.9, 0.999]] [2022-12-19 22:38:47,259] [INFO] [timer.py:196:stop] epoch=0/micro_step=2440/global_step=2440, RunningAvgSamplesPerSec=5.01376407378459, CurrSamplesPerSec=4.998351376382583, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 22:41:23,787] [INFO] [logging.py:68:log_dist] [Rank 0] step=2450, skipped=4, lr=[5.677777777777779e-06], mom=[[0.9, 0.999]] [2022-12-19 22:41:23,788] [INFO] [timer.py:196:stop] epoch=0/micro_step=2450/global_step=2450, RunningAvgSamplesPerSec=5.0127283496244175, CurrSamplesPerSec=4.866878997012885, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 5.677777777777779e-06, 'epoch': 35.0} [2022-12-19 22:43:58,811] [INFO] [logging.py:68:log_dist] [Rank 0] step=2460, skipped=4, lr=[5.6555555555555566e-06], mom=[[0.9, 0.999]] [2022-12-19 22:43:58,812] [INFO] [timer.py:196:stop] epoch=0/micro_step=2460/global_step=2460, RunningAvgSamplesPerSec=5.012109800229049, CurrSamplesPerSec=5.116301989290539, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 22:46:30,634] [INFO] [logging.py:68:log_dist] [Rank 0] step=2470, skipped=4, lr=[5.633333333333334e-06], mom=[[0.9, 0.999]] [2022-12-19 22:46:30,636] [INFO] [timer.py:196:stop] epoch=0/micro_step=2470/global_step=2470, RunningAvgSamplesPerSec=5.011957952418825, CurrSamplesPerSec=5.06643487454284, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 5.622222222222222e-06, 'epoch': 35.36} [2022-12-19 22:49:02,827] [INFO] [logging.py:68:log_dist] [Rank 0] step=2480, skipped=4, lr=[5.611111111111112e-06], mom=[[0.9, 0.999]] [2022-12-19 22:49:02,829] [INFO] [timer.py:196:stop] epoch=0/micro_step=2480/global_step=2480, RunningAvgSamplesPerSec=5.011667997060813, CurrSamplesPerSec=4.91782691319557, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 22:51:35,685] [INFO] [logging.py:68:log_dist] [Rank 0] step=2490, skipped=4, lr=[5.588888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 22:51:35,687] [INFO] [timer.py:196:stop] epoch=0/micro_step=2490/global_step=2490, RunningAvgSamplesPerSec=5.011336986573131, CurrSamplesPerSec=4.797626182170902, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 22:54:11,004] [INFO] [logging.py:68:log_dist] [Rank 0] step=2500, skipped=4, lr=[5.566666666666667e-06], mom=[[0.9, 0.999]] [2022-12-19 22:54:11,006] [INFO] [timer.py:196:stop] epoch=0/micro_step=2500/global_step=2500, RunningAvgSamplesPerSec=5.010731941763919, CurrSamplesPerSec=5.002523406164106, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 5.566666666666667e-06, 'epoch': 35.71} [2022-12-19 22:56:45,800] [INFO] [logging.py:68:log_dist] [Rank 0] step=2510, skipped=4, lr=[5.544444444444445e-06], mom=[[0.9, 0.999]] [2022-12-19 22:56:45,801] [INFO] [timer.py:196:stop] epoch=0/micro_step=2510/global_step=2510, RunningAvgSamplesPerSec=5.010213342335517, CurrSamplesPerSec=4.877912934612746, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 22:59:21,270] [INFO] [logging.py:68:log_dist] [Rank 0] step=2520, skipped=4, lr=[5.522222222222222e-06], mom=[[0.9, 0.999]] [2022-12-19 22:59:21,272] [INFO] [timer.py:196:stop] epoch=0/micro_step=2520/global_step=2520, RunningAvgSamplesPerSec=5.009610394799621, CurrSamplesPerSec=4.964960516761187, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 5.511111111111112e-06, 'epoch': 36.07} [2022-12-19 23:01:56,716] [INFO] [logging.py:68:log_dist] [Rank 0] step=2530, skipped=4, lr=[5.500000000000001e-06], mom=[[0.9, 0.999]] [2022-12-19 23:01:56,717] [INFO] [timer.py:196:stop] epoch=0/micro_step=2530/global_step=2530, RunningAvgSamplesPerSec=5.008985708063108, CurrSamplesPerSec=4.763091216905591, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 23:04:31,862] [INFO] [logging.py:68:log_dist] [Rank 0] step=2540, skipped=4, lr=[5.477777777777778e-06], mom=[[0.9, 0.999]] [2022-12-19 23:04:31,864] [INFO] [timer.py:196:stop] epoch=0/micro_step=2540/global_step=2540, RunningAvgSamplesPerSec=5.008317613981477, CurrSamplesPerSec=4.901298100941244, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 23:07:00,763] [INFO] [logging.py:68:log_dist] [Rank 0] step=2550, skipped=4, lr=[5.455555555555556e-06], mom=[[0.9, 0.999]] [2022-12-19 23:07:00,765] [INFO] [timer.py:196:stop] epoch=0/micro_step=2550/global_step=2550, RunningAvgSamplesPerSec=5.00865642010649, CurrSamplesPerSec=5.043270031682921, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 5.455555555555556e-06, 'epoch': 36.43} [2022-12-19 23:09:27,345] [INFO] [logging.py:68:log_dist] [Rank 0] step=2560, skipped=4, lr=[5.4333333333333335e-06], mom=[[0.9, 0.999]] [2022-12-19 23:09:27,347] [INFO] [timer.py:196:stop] epoch=0/micro_step=2560/global_step=2560, RunningAvgSamplesPerSec=5.009370143999925, CurrSamplesPerSec=5.301099846752491, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 23:11:54,770] [INFO] [logging.py:68:log_dist] [Rank 0] step=2570, skipped=4, lr=[5.411111111111111e-06], mom=[[0.9, 0.999]] [2022-12-19 23:11:54,771] [INFO] [timer.py:196:stop] epoch=0/micro_step=2570/global_step=2570, RunningAvgSamplesPerSec=5.009801452694927, CurrSamplesPerSec=5.219546892338568, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 5.400000000000001e-06, 'epoch': 36.79} [2022-12-19 23:14:24,769] [INFO] [logging.py:68:log_dist] [Rank 0] step=2580, skipped=4, lr=[5.388888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 23:14:24,771] [INFO] [timer.py:196:stop] epoch=0/micro_step=2580/global_step=2580, RunningAvgSamplesPerSec=5.00994327941254, CurrSamplesPerSec=4.969306846600432, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 23:16:59,574] [INFO] [logging.py:68:log_dist] [Rank 0] step=2590, skipped=4, lr=[5.366666666666666e-06], mom=[[0.9, 0.999]] [2022-12-19 23:16:59,576] [INFO] [timer.py:196:stop] epoch=0/micro_step=2590/global_step=2590, RunningAvgSamplesPerSec=5.0092961418772415, CurrSamplesPerSec=4.946033920815317, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 23:19:36,052] [INFO] [logging.py:68:log_dist] [Rank 0] step=2600, skipped=4, lr=[5.344444444444446e-06], mom=[[0.9, 0.999]] [2022-12-19 23:19:36,053] [INFO] [timer.py:196:stop] epoch=0/micro_step=2600/global_step=2600, RunningAvgSamplesPerSec=5.008433915578442, CurrSamplesPerSec=4.634065922550783, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 5.344444444444446e-06, 'epoch': 37.14} [2022-12-19 23:22:13,065] [INFO] [logging.py:68:log_dist] [Rank 0] step=2610, skipped=4, lr=[5.322222222222223e-06], mom=[[0.9, 0.999]] [2022-12-19 23:22:13,066] [INFO] [timer.py:196:stop] epoch=0/micro_step=2610/global_step=2610, RunningAvgSamplesPerSec=5.007431619790054, CurrSamplesPerSec=4.664613349416594, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 23:24:50,330] [INFO] [logging.py:68:log_dist] [Rank 0] step=2620, skipped=4, lr=[5.300000000000001e-06], mom=[[0.9, 0.999]] [2022-12-19 23:24:50,331] [INFO] [timer.py:196:stop] epoch=0/micro_step=2620/global_step=2620, RunningAvgSamplesPerSec=5.006406838649918, CurrSamplesPerSec=4.778275318510354, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 5.288888888888889e-06, 'epoch': 37.5} [2022-12-19 23:27:20,571] [INFO] [logging.py:68:log_dist] [Rank 0] step=2630, skipped=4, lr=[5.2777777777777785e-06], mom=[[0.9, 0.999]] [2022-12-19 23:27:20,572] [INFO] [timer.py:196:stop] epoch=0/micro_step=2630/global_step=2630, RunningAvgSamplesPerSec=5.006470363602303, CurrSamplesPerSec=4.97380833500541, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 23:29:47,729] [INFO] [logging.py:68:log_dist] [Rank 0] step=2640, skipped=4, lr=[5.255555555555556e-06], mom=[[0.9, 0.999]] [2022-12-19 23:29:47,730] [INFO] [timer.py:196:stop] epoch=0/micro_step=2640/global_step=2640, RunningAvgSamplesPerSec=5.0070170730427, CurrSamplesPerSec=5.153741510913132, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 23:32:14,955] [INFO] [logging.py:68:log_dist] [Rank 0] step=2650, skipped=4, lr=[5.233333333333334e-06], mom=[[0.9, 0.999]] [2022-12-19 23:32:14,956] [INFO] [timer.py:196:stop] epoch=0/micro_step=2650/global_step=2650, RunningAvgSamplesPerSec=5.007566421512358, CurrSamplesPerSec=5.0691336616303175, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 5.233333333333334e-06, 'epoch': 37.86} [2022-12-19 23:34:40,787] [INFO] [logging.py:68:log_dist] [Rank 0] step=2660, skipped=4, lr=[5.211111111111111e-06], mom=[[0.9, 0.999]] [2022-12-19 23:34:40,788] [INFO] [timer.py:196:stop] epoch=0/micro_step=2660/global_step=2660, RunningAvgSamplesPerSec=5.008339953720178, CurrSamplesPerSec=5.3756600634884935, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 23:37:07,686] [INFO] [logging.py:68:log_dist] [Rank 0] step=2670, skipped=4, lr=[5.188888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 23:37:07,687] [INFO] [timer.py:196:stop] epoch=0/micro_step=2670/global_step=2670, RunningAvgSamplesPerSec=5.008827207803368, CurrSamplesPerSec=5.242575070535937, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 5.177777777777779e-06, 'epoch': 38.21} [2022-12-19 23:39:35,780] [INFO] [logging.py:68:log_dist] [Rank 0] step=2680, skipped=4, lr=[5.1666666666666675e-06], mom=[[0.9, 0.999]] [2022-12-19 23:39:35,782] [INFO] [timer.py:196:stop] epoch=0/micro_step=2680/global_step=2680, RunningAvgSamplesPerSec=5.009264848135848, CurrSamplesPerSec=5.190199374972943, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 23:42:05,892] [INFO] [logging.py:68:log_dist] [Rank 0] step=2690, skipped=4, lr=[5.144444444444445e-06], mom=[[0.9, 0.999]] [2022-12-19 23:42:05,894] [INFO] [timer.py:196:stop] epoch=0/micro_step=2690/global_step=2690, RunningAvgSamplesPerSec=5.009361432459275, CurrSamplesPerSec=5.065279144646982, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 23:44:32,396] [INFO] [logging.py:68:log_dist] [Rank 0] step=2700, skipped=4, lr=[5.122222222222223e-06], mom=[[0.9, 0.999]] [2022-12-19 23:44:32,397] [INFO] [timer.py:196:stop] epoch=0/micro_step=2700/global_step=2700, RunningAvgSamplesPerSec=5.010017424948131, CurrSamplesPerSec=5.245173124125204, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 5.122222222222223e-06, 'epoch': 38.57} [2022-12-19 23:46:59,065] [INFO] [logging.py:68:log_dist] [Rank 0] step=2710, skipped=4, lr=[5.1e-06], mom=[[0.9, 0.999]] [2022-12-19 23:46:59,066] [INFO] [timer.py:196:stop] epoch=0/micro_step=2710/global_step=2710, RunningAvgSamplesPerSec=5.010662010364094, CurrSamplesPerSec=5.211371255469591, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 23:49:25,011] [INFO] [logging.py:68:log_dist] [Rank 0] step=2720, skipped=4, lr=[5.077777777777778e-06], mom=[[0.9, 0.999]] [2022-12-19 23:49:25,012] [INFO] [timer.py:196:stop] epoch=0/micro_step=2720/global_step=2720, RunningAvgSamplesPerSec=5.011298393732254, CurrSamplesPerSec=5.284884197670851, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0002, 'learning_rate': 5.0666666666666676e-06, 'epoch': 38.93} [2022-12-19 23:51:48,708] [INFO] [logging.py:68:log_dist] [Rank 0] step=2730, skipped=4, lr=[5.0555555555555555e-06], mom=[[0.9, 0.999]] [2022-12-19 23:51:48,709] [INFO] [timer.py:196:stop] epoch=0/micro_step=2730/global_step=2730, RunningAvgSamplesPerSec=5.012349964989924, CurrSamplesPerSec=5.519505543944101, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 23:54:11,970] [INFO] [logging.py:68:log_dist] [Rank 0] step=2740, skipped=4, lr=[5.033333333333333e-06], mom=[[0.9, 0.999]] [2022-12-19 23:54:11,972] [INFO] [timer.py:196:stop] epoch=0/micro_step=2740/global_step=2740, RunningAvgSamplesPerSec=5.01342195898538, CurrSamplesPerSec=5.08839069432763, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-19 23:56:36,141] [INFO] [logging.py:68:log_dist] [Rank 0] step=2750, skipped=4, lr=[5.011111111111111e-06], mom=[[0.9, 0.999]] [2022-12-19 23:56:36,142] [INFO] [timer.py:196:stop] epoch=0/micro_step=2750/global_step=2750, RunningAvgSamplesPerSec=5.014321986245809, CurrSamplesPerSec=5.154240947984718, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 5.011111111111111e-06, 'epoch': 39.29} [2022-12-19 23:59:02,607] [INFO] [logging.py:68:log_dist] [Rank 0] step=2760, skipped=4, lr=[4.988888888888889e-06], mom=[[0.9, 0.999]] [2022-12-19 23:59:02,609] [INFO] [timer.py:196:stop] epoch=0/micro_step=2760/global_step=2760, RunningAvgSamplesPerSec=5.014927987066132, CurrSamplesPerSec=5.203088365496668, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 00:01:26,310] [INFO] [logging.py:68:log_dist] [Rank 0] step=2770, skipped=4, lr=[4.966666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 00:01:26,312] [INFO] [timer.py:196:stop] epoch=0/micro_step=2770/global_step=2770, RunningAvgSamplesPerSec=5.0158771558393225, CurrSamplesPerSec=5.229649293711673, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.9555555555555565e-06, 'epoch': 39.64} [2022-12-20 00:03:58,155] [INFO] [logging.py:68:log_dist] [Rank 0] step=2780, skipped=4, lr=[4.944444444444445e-06], mom=[[0.9, 0.999]] [2022-12-20 00:03:58,157] [INFO] [timer.py:196:stop] epoch=0/micro_step=2780/global_step=2780, RunningAvgSamplesPerSec=5.015730765175181, CurrSamplesPerSec=4.594587869326162, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 00:06:32,692] [INFO] [logging.py:68:log_dist] [Rank 0] step=2790, skipped=4, lr=[4.922222222222223e-06], mom=[[0.9, 0.999]] [2022-12-20 00:06:32,694] [INFO] [timer.py:196:stop] epoch=0/micro_step=2790/global_step=2790, RunningAvgSamplesPerSec=5.015144665257556, CurrSamplesPerSec=4.9152991219988955, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 00:09:03,845] [INFO] [logging.py:68:log_dist] [Rank 0] step=2800, skipped=4, lr=[4.9000000000000005e-06], mom=[[0.9, 0.999]] [2022-12-20 00:09:03,847] [INFO] [timer.py:196:stop] epoch=0/micro_step=2800/global_step=2800, RunningAvgSamplesPerSec=5.015032466013009, CurrSamplesPerSec=5.193487800872225, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.9000000000000005e-06, 'epoch': 40.0} [2022-12-20 00:11:36,173] [INFO] [logging.py:68:log_dist] [Rank 0] step=2810, skipped=4, lr=[4.877777777777778e-06], mom=[[0.9, 0.999]] [2022-12-20 00:11:36,174] [INFO] [timer.py:196:stop] epoch=0/micro_step=2810/global_step=2810, RunningAvgSamplesPerSec=5.014714177296181, CurrSamplesPerSec=4.782209054554591, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 00:14:09,927] [INFO] [logging.py:68:log_dist] [Rank 0] step=2820, skipped=4, lr=[4.855555555555556e-06], mom=[[0.9, 0.999]] [2022-12-20 00:14:09,928] [INFO] [timer.py:196:stop] epoch=0/micro_step=2820/global_step=2820, RunningAvgSamplesPerSec=5.01425980612752, CurrSamplesPerSec=4.820768258564785, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.8444444444444446e-06, 'epoch': 40.36} [2022-12-20 00:16:43,478] [INFO] [logging.py:68:log_dist] [Rank 0] step=2830, skipped=4, lr=[4.833333333333333e-06], mom=[[0.9, 0.999]] [2022-12-20 00:16:43,479] [INFO] [timer.py:196:stop] epoch=0/micro_step=2830/global_step=2830, RunningAvgSamplesPerSec=5.013831511278839, CurrSamplesPerSec=4.808101763537837, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 00:19:18,404] [INFO] [logging.py:68:log_dist] [Rank 0] step=2840, skipped=4, lr=[4.811111111111111e-06], mom=[[0.9, 0.999]] [2022-12-20 00:19:18,405] [INFO] [timer.py:196:stop] epoch=0/micro_step=2840/global_step=2840, RunningAvgSamplesPerSec=5.013238963339354, CurrSamplesPerSec=4.838881587260785, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 00:21:52,163] [INFO] [logging.py:68:log_dist] [Rank 0] step=2850, skipped=4, lr=[4.7888888888888894e-06], mom=[[0.9, 0.999]] [2022-12-20 00:21:52,165] [INFO] [timer.py:196:stop] epoch=0/micro_step=2850/global_step=2850, RunningAvgSamplesPerSec=5.012756911124032, CurrSamplesPerSec=4.88607414897751, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.7888888888888894e-06, 'epoch': 40.71} [2022-12-20 00:24:24,134] [INFO] [logging.py:68:log_dist] [Rank 0] step=2860, skipped=4, lr=[4.766666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 00:24:24,136] [INFO] [timer.py:196:stop] epoch=0/micro_step=2860/global_step=2860, RunningAvgSamplesPerSec=5.0125249144255175, CurrSamplesPerSec=5.4792851489773735, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 00:26:48,484] [INFO] [logging.py:68:log_dist] [Rank 0] step=2870, skipped=4, lr=[4.744444444444445e-06], mom=[[0.9, 0.999]] [2022-12-20 00:26:48,486] [INFO] [timer.py:196:stop] epoch=0/micro_step=2870/global_step=2870, RunningAvgSamplesPerSec=5.013373417984457, CurrSamplesPerSec=5.344692104664637, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.7333333333333335e-06, 'epoch': 41.07} [2022-12-20 00:29:16,181] [INFO] [logging.py:68:log_dist] [Rank 0] step=2880, skipped=4, lr=[4.722222222222222e-06], mom=[[0.9, 0.999]] [2022-12-20 00:29:16,182] [INFO] [timer.py:196:stop] epoch=0/micro_step=2880/global_step=2880, RunningAvgSamplesPerSec=5.013730265235321, CurrSamplesPerSec=5.090376302823883, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 00:31:50,742] [INFO] [logging.py:68:log_dist] [Rank 0] step=2890, skipped=4, lr=[4.7e-06], mom=[[0.9, 0.999]] [2022-12-20 00:31:50,744] [INFO] [timer.py:196:stop] epoch=0/micro_step=2890/global_step=2890, RunningAvgSamplesPerSec=5.013187485413348, CurrSamplesPerSec=4.784031912905905, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 00:34:24,690] [INFO] [logging.py:68:log_dist] [Rank 0] step=2900, skipped=4, lr=[4.677777777777778e-06], mom=[[0.9, 0.999]] [2022-12-20 00:34:24,691] [INFO] [timer.py:196:stop] epoch=0/micro_step=2900/global_step=2900, RunningAvgSamplesPerSec=5.012677710170453, CurrSamplesPerSec=4.806129462138566, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.677777777777778e-06, 'epoch': 41.43} [2022-12-20 00:36:58,929] [INFO] [logging.py:68:log_dist] [Rank 0] step=2910, skipped=4, lr=[4.655555555555556e-06], mom=[[0.9, 0.999]] [2022-12-20 00:36:58,930] [INFO] [timer.py:196:stop] epoch=0/micro_step=2910/global_step=2910, RunningAvgSamplesPerSec=5.012175218834425, CurrSamplesPerSec=4.949866649692317, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 00:39:34,842] [INFO] [logging.py:68:log_dist] [Rank 0] step=2920, skipped=4, lr=[4.633333333333334e-06], mom=[[0.9, 0.999]] [2022-12-20 00:39:34,844] [INFO] [timer.py:196:stop] epoch=0/micro_step=2920/global_step=2920, RunningAvgSamplesPerSec=5.011376232570457, CurrSamplesPerSec=4.766758305139065, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.622222222222222e-06, 'epoch': 41.79} [2022-12-20 00:42:11,337] [INFO] [logging.py:68:log_dist] [Rank 0] step=2930, skipped=4, lr=[4.611111111111112e-06], mom=[[0.9, 0.999]] [2022-12-20 00:42:11,339] [INFO] [timer.py:196:stop] epoch=0/micro_step=2930/global_step=2930, RunningAvgSamplesPerSec=5.010588638931028, CurrSamplesPerSec=4.893571602264024, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 00:44:48,407] [INFO] [logging.py:68:log_dist] [Rank 0] step=2940, skipped=4, lr=[4.58888888888889e-06], mom=[[0.9, 0.999]] [2022-12-20 00:44:48,408] [INFO] [timer.py:196:stop] epoch=0/micro_step=2940/global_step=2940, RunningAvgSamplesPerSec=5.009699564397107, CurrSamplesPerSec=4.905965143445071, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 00:47:19,183] [INFO] [logging.py:68:log_dist] [Rank 0] step=2950, skipped=4, lr=[4.566666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 00:47:19,184] [INFO] [timer.py:196:stop] epoch=0/micro_step=2950/global_step=2950, RunningAvgSamplesPerSec=5.009685400094273, CurrSamplesPerSec=4.83992417251654, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.566666666666667e-06, 'epoch': 42.14} [2022-12-20 00:49:54,107] [INFO] [logging.py:68:log_dist] [Rank 0] step=2960, skipped=4, lr=[4.544444444444445e-06], mom=[[0.9, 0.999]] [2022-12-20 00:49:54,108] [INFO] [timer.py:196:stop] epoch=0/micro_step=2960/global_step=2960, RunningAvgSamplesPerSec=5.009172890038306, CurrSamplesPerSec=4.705702135196774, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 00:52:28,979] [INFO] [logging.py:68:log_dist] [Rank 0] step=2970, skipped=4, lr=[4.5222222222222225e-06], mom=[[0.9, 0.999]] [2022-12-20 00:52:28,981] [INFO] [timer.py:196:stop] epoch=0/micro_step=2970/global_step=2970, RunningAvgSamplesPerSec=5.008596354717981, CurrSamplesPerSec=4.7760038856306695, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.511111111111111e-06, 'epoch': 42.5} [2022-12-20 00:55:06,499] [INFO] [logging.py:68:log_dist] [Rank 0] step=2980, skipped=4, lr=[4.5e-06], mom=[[0.9, 0.999]] [2022-12-20 00:55:06,500] [INFO] [timer.py:196:stop] epoch=0/micro_step=2980/global_step=2980, RunningAvgSamplesPerSec=5.007756223244268, CurrSamplesPerSec=4.745128361129826, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 00:57:41,768] [INFO] [logging.py:68:log_dist] [Rank 0] step=2990, skipped=4, lr=[4.477777777777778e-06], mom=[[0.9, 0.999]] [2022-12-20 00:57:41,769] [INFO] [timer.py:196:stop] epoch=0/micro_step=2990/global_step=2990, RunningAvgSamplesPerSec=5.00713455911804, CurrSamplesPerSec=4.9674775162969205, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 01:00:08,179] [INFO] [logging.py:68:log_dist] [Rank 0] step=3000, skipped=4, lr=[4.455555555555555e-06], mom=[[0.9, 0.999]] [2022-12-20 01:00:08,181] [INFO] [timer.py:196:stop] epoch=0/micro_step=3000/global_step=3000, RunningAvgSamplesPerSec=5.007697570376488, CurrSamplesPerSec=5.329942529205776, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.455555555555555e-06, 'epoch': 42.86} {'eval_loss': 0.45654296875, 'eval_wer': 23.322227510709187, 'eval_runtime': 844.8019, 'eval_samples_per_second': 2.683, 'eval_steps_per_second': 0.084, 'epoch': 42.86} [2022-12-20 01:14:16,407] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step3000 is begin to save! [2022-12-20 01:14:16,420] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: ./checkpoint-3000/global_step3000/mp_rank_00_model_states.pt [2022-12-20 01:14:16,420] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving ./checkpoint-3000/global_step3000/mp_rank_00_model_states.pt... [2022-12-20 01:14:19,595] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved ./checkpoint-3000/global_step3000/mp_rank_00_model_states.pt. [2022-12-20 01:14:19,597] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving ./checkpoint-3000/global_step3000/zero_pp_rank_0_mp_rank_00_optim_states.pt... [2022-12-20 01:14:33,082] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved ./checkpoint-3000/global_step3000/zero_pp_rank_0_mp_rank_00_optim_states.pt. [2022-12-20 01:14:33,082] [INFO] [engine.py:3394:_save_zero_checkpoint] zero checkpoint saved ./checkpoint-3000/global_step3000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2022-12-20 01:14:33,082] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step3000 is ready now! [2022-12-20 01:18:29,011] [INFO] [stage_1_and_2.py:1767:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536.0, reducing to 65536.0 [2022-12-20 01:18:41,053] [INFO] [stage_1_and_2.py:1767:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536.0, reducing to 32768.0 [2022-12-20 01:19:34,609] [INFO] [logging.py:68:log_dist] [Rank 0] step=3010, skipped=6, lr=[4.437777777777778e-06], mom=[[0.9, 0.999]] [2022-12-20 01:19:34,611] [INFO] [timer.py:196:stop] epoch=0/micro_step=3010/global_step=3010, RunningAvgSamplesPerSec=5.010784671686443, CurrSamplesPerSec=5.7881583152353295, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 01:22:00,245] [INFO] [logging.py:68:log_dist] [Rank 0] step=3020, skipped=6, lr=[4.415555555555556e-06], mom=[[0.9, 0.999]] [2022-12-20 01:22:00,246] [INFO] [timer.py:196:stop] epoch=0/micro_step=3020/global_step=3020, RunningAvgSamplesPerSec=5.011494782563396, CurrSamplesPerSec=5.182315133261153, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.404444444444445e-06, 'epoch': 43.21} [2022-12-20 01:24:28,024] [INFO] [logging.py:68:log_dist] [Rank 0] step=3030, skipped=6, lr=[4.393333333333334e-06], mom=[[0.9, 0.999]] [2022-12-20 01:24:28,025] [INFO] [timer.py:196:stop] epoch=0/micro_step=3030/global_step=3030, RunningAvgSamplesPerSec=5.011850955905219, CurrSamplesPerSec=5.017293001245616, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 01:26:56,035] [INFO] [logging.py:68:log_dist] [Rank 0] step=3040, skipped=6, lr=[4.371111111111112e-06], mom=[[0.9, 0.999]] [2022-12-20 01:26:56,037] [INFO] [timer.py:196:stop] epoch=0/micro_step=3040/global_step=3040, RunningAvgSamplesPerSec=5.012175038461916, CurrSamplesPerSec=4.827043409240733, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 01:29:29,556] [INFO] [logging.py:68:log_dist] [Rank 0] step=3050, skipped=6, lr=[4.348888888888889e-06], mom=[[0.9, 0.999]] [2022-12-20 01:29:29,558] [INFO] [timer.py:196:stop] epoch=0/micro_step=3050/global_step=3050, RunningAvgSamplesPerSec=5.011714249674453, CurrSamplesPerSec=5.013898113078914, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.348888888888889e-06, 'epoch': 43.57} [2022-12-20 01:31:59,512] [INFO] [logging.py:68:log_dist] [Rank 0] step=3060, skipped=6, lr=[4.326666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 01:31:59,514] [INFO] [timer.py:196:stop] epoch=0/micro_step=3060/global_step=3060, RunningAvgSamplesPerSec=5.0118575458496135, CurrSamplesPerSec=5.7010847557137065, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 01:34:29,706] [INFO] [logging.py:68:log_dist] [Rank 0] step=3070, skipped=6, lr=[4.304444444444445e-06], mom=[[0.9, 0.999]] [2022-12-20 01:34:29,708] [INFO] [timer.py:196:stop] epoch=0/micro_step=3070/global_step=3070, RunningAvgSamplesPerSec=5.011864060021234, CurrSamplesPerSec=4.510793917369373, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.2933333333333334e-06, 'epoch': 43.93} [2022-12-20 01:37:10,981] [INFO] [logging.py:68:log_dist] [Rank 0] step=3080, skipped=6, lr=[4.282222222222222e-06], mom=[[0.9, 0.999]] [2022-12-20 01:37:10,982] [INFO] [timer.py:196:stop] epoch=0/micro_step=3080/global_step=3080, RunningAvgSamplesPerSec=5.0104830321936324, CurrSamplesPerSec=4.9590286848937275, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 01:39:46,952] [INFO] [logging.py:68:log_dist] [Rank 0] step=3090, skipped=6, lr=[4.26e-06], mom=[[0.9, 0.999]] [2022-12-20 01:39:46,954] [INFO] [timer.py:196:stop] epoch=0/micro_step=3090/global_step=3090, RunningAvgSamplesPerSec=5.009856571118483, CurrSamplesPerSec=4.942705625853167, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 01:42:15,154] [INFO] [logging.py:68:log_dist] [Rank 0] step=3100, skipped=6, lr=[4.2377777777777775e-06], mom=[[0.9, 0.999]] [2022-12-20 01:42:15,155] [INFO] [timer.py:196:stop] epoch=0/micro_step=3100/global_step=3100, RunningAvgSamplesPerSec=5.010175794540146, CurrSamplesPerSec=5.249672686832617, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.2377777777777775e-06, 'epoch': 44.29} [2022-12-20 01:44:45,522] [INFO] [logging.py:68:log_dist] [Rank 0] step=3110, skipped=6, lr=[4.215555555555556e-06], mom=[[0.9, 0.999]] [2022-12-20 01:44:45,524] [INFO] [timer.py:196:stop] epoch=0/micro_step=3110/global_step=3110, RunningAvgSamplesPerSec=5.010168572741285, CurrSamplesPerSec=4.82225035907906, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 01:47:20,205] [INFO] [logging.py:68:log_dist] [Rank 0] step=3120, skipped=6, lr=[4.1933333333333336e-06], mom=[[0.9, 0.999]] [2022-12-20 01:47:20,207] [INFO] [timer.py:196:stop] epoch=0/micro_step=3120/global_step=3120, RunningAvgSamplesPerSec=5.009661435535492, CurrSamplesPerSec=5.020791655184806, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.182222222222222e-06, 'epoch': 44.64} [2022-12-20 01:49:49,881] [INFO] [logging.py:68:log_dist] [Rank 0] step=3130, skipped=6, lr=[4.171111111111111e-06], mom=[[0.9, 0.999]] [2022-12-20 01:49:49,882] [INFO] [timer.py:196:stop] epoch=0/micro_step=3130/global_step=3130, RunningAvgSamplesPerSec=5.009877115926111, CurrSamplesPerSec=5.246828135694998, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 01:52:15,719] [INFO] [logging.py:68:log_dist] [Rank 0] step=3140, skipped=6, lr=[4.148888888888889e-06], mom=[[0.9, 0.999]] [2022-12-20 01:52:15,720] [INFO] [timer.py:196:stop] epoch=0/micro_step=3140/global_step=3140, RunningAvgSamplesPerSec=5.010416027361077, CurrSamplesPerSec=5.122332311953563, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 01:54:43,394] [INFO] [logging.py:68:log_dist] [Rank 0] step=3150, skipped=6, lr=[4.126666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 01:54:43,396] [INFO] [timer.py:196:stop] epoch=0/micro_step=3150/global_step=3150, RunningAvgSamplesPerSec=5.0107946397971945, CurrSamplesPerSec=5.1281769053064306, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.126666666666667e-06, 'epoch': 45.0} [2022-12-20 01:57:10,843] [INFO] [logging.py:68:log_dist] [Rank 0] step=3160, skipped=6, lr=[4.104444444444445e-06], mom=[[0.9, 0.999]] [2022-12-20 01:57:10,845] [INFO] [timer.py:196:stop] epoch=0/micro_step=3160/global_step=3160, RunningAvgSamplesPerSec=5.0111903358827785, CurrSamplesPerSec=5.168605423919754, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 01:59:37,211] [INFO] [logging.py:68:log_dist] [Rank 0] step=3170, skipped=6, lr=[4.0822222222222225e-06], mom=[[0.9, 0.999]] [2022-12-20 01:59:37,212] [INFO] [timer.py:196:stop] epoch=0/micro_step=3170/global_step=3170, RunningAvgSamplesPerSec=5.011832383634813, CurrSamplesPerSec=5.17065105423624, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.071111111111111e-06, 'epoch': 45.36} [2022-12-20 02:02:02,760] [INFO] [logging.py:68:log_dist] [Rank 0] step=3180, skipped=6, lr=[4.060000000000001e-06], mom=[[0.9, 0.999]] [2022-12-20 02:02:02,761] [INFO] [timer.py:196:stop] epoch=0/micro_step=3180/global_step=3180, RunningAvgSamplesPerSec=5.012414169129676, CurrSamplesPerSec=5.239538770686272, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 02:04:28,303] [INFO] [logging.py:68:log_dist] [Rank 0] step=3190, skipped=6, lr=[4.0377777777777786e-06], mom=[[0.9, 0.999]] [2022-12-20 02:04:28,304] [INFO] [timer.py:196:stop] epoch=0/micro_step=3190/global_step=3190, RunningAvgSamplesPerSec=5.013003951326583, CurrSamplesPerSec=5.178411107945444, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 02:06:56,787] [INFO] [logging.py:68:log_dist] [Rank 0] step=3200, skipped=6, lr=[4.015555555555556e-06], mom=[[0.9, 0.999]] [2022-12-20 02:06:56,789] [INFO] [timer.py:196:stop] epoch=0/micro_step=3200/global_step=3200, RunningAvgSamplesPerSec=5.013173386209827, CurrSamplesPerSec=4.8972863458813185, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.015555555555556e-06, 'epoch': 45.71} [2022-12-20 02:09:29,989] [INFO] [logging.py:68:log_dist] [Rank 0] step=3210, skipped=6, lr=[3.993333333333334e-06], mom=[[0.9, 0.999]] [2022-12-20 02:09:29,990] [INFO] [timer.py:196:stop] epoch=0/micro_step=3210/global_step=3210, RunningAvgSamplesPerSec=5.012791790383584, CurrSamplesPerSec=4.8984594609760315, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 02:12:03,668] [INFO] [logging.py:68:log_dist] [Rank 0] step=3220, skipped=6, lr=[3.971111111111111e-06], mom=[[0.9, 0.999]] [2022-12-20 02:12:03,669] [INFO] [timer.py:196:stop] epoch=0/micro_step=3220/global_step=3220, RunningAvgSamplesPerSec=5.012371857760337, CurrSamplesPerSec=5.132404001446214, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.96e-06, 'epoch': 46.07} [2022-12-20 02:14:35,956] [INFO] [logging.py:68:log_dist] [Rank 0] step=3230, skipped=6, lr=[3.948888888888889e-06], mom=[[0.9, 0.999]] [2022-12-20 02:14:35,957] [INFO] [timer.py:196:stop] epoch=0/micro_step=3230/global_step=3230, RunningAvgSamplesPerSec=5.012138963050575, CurrSamplesPerSec=4.897640088990262, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 02:17:09,501] [INFO] [logging.py:68:log_dist] [Rank 0] step=3240, skipped=6, lr=[3.926666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 02:17:09,503] [INFO] [timer.py:196:stop] epoch=0/micro_step=3240/global_step=3240, RunningAvgSamplesPerSec=5.011795806427403, CurrSamplesPerSec=4.918891450300068, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 02:19:27,538] [INFO] [logging.py:68:log_dist] [Rank 0] step=3250, skipped=6, lr=[3.904444444444444e-06], mom=[[0.9, 0.999]] [2022-12-20 02:19:27,540] [INFO] [timer.py:196:stop] epoch=0/micro_step=3250/global_step=3250, RunningAvgSamplesPerSec=5.013307950894662, CurrSamplesPerSec=5.518680249132624, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.904444444444444e-06, 'epoch': 46.43} [2022-12-20 02:21:50,333] [INFO] [logging.py:68:log_dist] [Rank 0] step=3260, skipped=6, lr=[3.882222222222223e-06], mom=[[0.9, 0.999]] [2022-12-20 02:21:50,334] [INFO] [timer.py:196:stop] epoch=0/micro_step=3260/global_step=3260, RunningAvgSamplesPerSec=5.014276951656808, CurrSamplesPerSec=5.244702226717013, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 02:24:15,987] [INFO] [logging.py:68:log_dist] [Rank 0] step=3270, skipped=6, lr=[3.86e-06], mom=[[0.9, 0.999]] [2022-12-20 02:24:15,989] [INFO] [timer.py:196:stop] epoch=0/micro_step=3270/global_step=3270, RunningAvgSamplesPerSec=5.014870559660052, CurrSamplesPerSec=4.903460097765076, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.848888888888889e-06, 'epoch': 46.79} [2022-12-20 02:26:44,569] [INFO] [logging.py:68:log_dist] [Rank 0] step=3280, skipped=6, lr=[3.837777777777778e-06], mom=[[0.9, 0.999]] [2022-12-20 02:26:44,571] [INFO] [timer.py:196:stop] epoch=0/micro_step=3280/global_step=3280, RunningAvgSamplesPerSec=5.0151254618912855, CurrSamplesPerSec=5.13210128825685, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 02:29:10,914] [INFO] [logging.py:68:log_dist] [Rank 0] step=3290, skipped=6, lr=[3.8155555555555555e-06], mom=[[0.9, 0.999]] [2022-12-20 02:29:10,915] [INFO] [timer.py:196:stop] epoch=0/micro_step=3290/global_step=3290, RunningAvgSamplesPerSec=5.0155869116890734, CurrSamplesPerSec=5.244405589391571, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 02:31:34,598] [INFO] [logging.py:68:log_dist] [Rank 0] step=3300, skipped=6, lr=[3.793333333333334e-06], mom=[[0.9, 0.999]] [2022-12-20 02:31:34,599] [INFO] [timer.py:196:stop] epoch=0/micro_step=3300/global_step=3300, RunningAvgSamplesPerSec=5.016568403527747, CurrSamplesPerSec=5.4285716076894746, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.793333333333334e-06, 'epoch': 47.14} [2022-12-20 02:34:02,010] [INFO] [logging.py:68:log_dist] [Rank 0] step=3310, skipped=6, lr=[3.7711111111111116e-06], mom=[[0.9, 0.999]] [2022-12-20 02:34:02,011] [INFO] [timer.py:196:stop] epoch=0/micro_step=3310/global_step=3310, RunningAvgSamplesPerSec=5.016859073104873, CurrSamplesPerSec=5.228046642973993, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 02:36:36,456] [INFO] [logging.py:68:log_dist] [Rank 0] step=3320, skipped=6, lr=[3.7488888888888892e-06], mom=[[0.9, 0.999]] [2022-12-20 02:36:36,458] [INFO] [timer.py:196:stop] epoch=0/micro_step=3320/global_step=3320, RunningAvgSamplesPerSec=5.016446215770842, CurrSamplesPerSec=4.811722676247159, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.737777777777778e-06, 'epoch': 47.5} [2022-12-20 02:39:04,992] [INFO] [logging.py:68:log_dist] [Rank 0] step=3330, skipped=6, lr=[3.726666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 02:39:04,994] [INFO] [timer.py:196:stop] epoch=0/micro_step=3330/global_step=3330, RunningAvgSamplesPerSec=5.016684021365297, CurrSamplesPerSec=5.210960829388422, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 02:41:33,573] [INFO] [logging.py:68:log_dist] [Rank 0] step=3340, skipped=6, lr=[3.704444444444445e-06], mom=[[0.9, 0.999]] [2022-12-20 02:41:33,575] [INFO] [timer.py:196:stop] epoch=0/micro_step=3340/global_step=3340, RunningAvgSamplesPerSec=5.016909309171439, CurrSamplesPerSec=5.07443598071175, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 02:44:00,851] [INFO] [logging.py:68:log_dist] [Rank 0] step=3350, skipped=6, lr=[3.6822222222222225e-06], mom=[[0.9, 0.999]] [2022-12-20 02:44:00,852] [INFO] [timer.py:196:stop] epoch=0/micro_step=3350/global_step=3350, RunningAvgSamplesPerSec=5.017265218865273, CurrSamplesPerSec=5.0747266527004085, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.6822222222222225e-06, 'epoch': 47.86} [2022-12-20 02:46:27,651] [INFO] [logging.py:68:log_dist] [Rank 0] step=3360, skipped=6, lr=[3.66e-06], mom=[[0.9, 0.999]] [2022-12-20 02:46:27,653] [INFO] [timer.py:196:stop] epoch=0/micro_step=3360/global_step=3360, RunningAvgSamplesPerSec=5.017696493100695, CurrSamplesPerSec=5.445819859803268, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 02:48:54,300] [INFO] [logging.py:68:log_dist] [Rank 0] step=3370, skipped=6, lr=[3.6377777777777777e-06], mom=[[0.9, 0.999]] [2022-12-20 02:48:54,301] [INFO] [timer.py:196:stop] epoch=0/micro_step=3370/global_step=3370, RunningAvgSamplesPerSec=5.018120277766622, CurrSamplesPerSec=5.119012007093434, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.6266666666666674e-06, 'epoch': 48.21} [2022-12-20 02:51:20,831] [INFO] [logging.py:68:log_dist] [Rank 0] step=3380, skipped=6, lr=[3.615555555555556e-06], mom=[[0.9, 0.999]] [2022-12-20 02:51:20,832] [INFO] [timer.py:196:stop] epoch=0/micro_step=3380/global_step=3380, RunningAvgSamplesPerSec=5.018567967339129, CurrSamplesPerSec=5.209512158887764, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 02:53:51,344] [INFO] [logging.py:68:log_dist] [Rank 0] step=3390, skipped=6, lr=[3.593333333333334e-06], mom=[[0.9, 0.999]] [2022-12-20 02:53:51,346] [INFO] [timer.py:196:stop] epoch=0/micro_step=3390/global_step=3390, RunningAvgSamplesPerSec=5.018556949270108, CurrSamplesPerSec=4.892052921959464, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 02:56:19,431] [INFO] [logging.py:68:log_dist] [Rank 0] step=3400, skipped=6, lr=[3.5711111111111114e-06], mom=[[0.9, 0.999]] [2022-12-20 02:56:19,432] [INFO] [timer.py:196:stop] epoch=0/micro_step=3400/global_step=3400, RunningAvgSamplesPerSec=5.018898400832044, CurrSamplesPerSec=5.22343017008523, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.5711111111111114e-06, 'epoch': 48.57} [2022-12-20 02:58:47,138] [INFO] [logging.py:68:log_dist] [Rank 0] step=3410, skipped=6, lr=[3.548888888888889e-06], mom=[[0.9, 0.999]] [2022-12-20 02:58:47,139] [INFO] [timer.py:196:stop] epoch=0/micro_step=3410/global_step=3410, RunningAvgSamplesPerSec=5.019209831341465, CurrSamplesPerSec=5.115540313699374, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 03:01:14,393] [INFO] [logging.py:68:log_dist] [Rank 0] step=3420, skipped=6, lr=[3.526666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 03:01:14,395] [INFO] [timer.py:196:stop] epoch=0/micro_step=3420/global_step=3420, RunningAvgSamplesPerSec=5.019558125349715, CurrSamplesPerSec=5.199672750564973, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.515555555555556e-06, 'epoch': 48.93} [2022-12-20 03:03:40,216] [INFO] [logging.py:68:log_dist] [Rank 0] step=3430, skipped=6, lr=[3.5044444444444447e-06], mom=[[0.9, 0.999]] [2022-12-20 03:03:40,218] [INFO] [timer.py:196:stop] epoch=0/micro_step=3430/global_step=3430, RunningAvgSamplesPerSec=5.020104358212663, CurrSamplesPerSec=5.294184786664645, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 03:06:08,019] [INFO] [logging.py:68:log_dist] [Rank 0] step=3440, skipped=6, lr=[3.4822222222222223e-06], mom=[[0.9, 0.999]] [2022-12-20 03:06:08,020] [INFO] [timer.py:196:stop] epoch=0/micro_step=3440/global_step=3440, RunningAvgSamplesPerSec=5.020483455479745, CurrSamplesPerSec=5.126783095607625, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 03:08:34,247] [INFO] [logging.py:68:log_dist] [Rank 0] step=3450, skipped=6, lr=[3.46e-06], mom=[[0.9, 0.999]] [2022-12-20 03:08:34,248] [INFO] [timer.py:196:stop] epoch=0/micro_step=3450/global_step=3450, RunningAvgSamplesPerSec=5.020923426859924, CurrSamplesPerSec=5.169087042205798, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.46e-06, 'epoch': 49.29} [2022-12-20 03:11:00,700] [INFO] [logging.py:68:log_dist] [Rank 0] step=3460, skipped=6, lr=[3.4377777777777784e-06], mom=[[0.9, 0.999]] [2022-12-20 03:11:00,701] [INFO] [timer.py:196:stop] epoch=0/micro_step=3460/global_step=3460, RunningAvgSamplesPerSec=5.021448536640391, CurrSamplesPerSec=5.173228938874232, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 03:13:27,986] [INFO] [logging.py:68:log_dist] [Rank 0] step=3470, skipped=6, lr=[3.415555555555556e-06], mom=[[0.9, 0.999]] [2022-12-20 03:13:27,987] [INFO] [timer.py:196:stop] epoch=0/micro_step=3470/global_step=3470, RunningAvgSamplesPerSec=5.021813255367895, CurrSamplesPerSec=5.241449759871188, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.404444444444445e-06, 'epoch': 49.64} [2022-12-20 03:15:58,218] [INFO] [logging.py:68:log_dist] [Rank 0] step=3480, skipped=6, lr=[3.3933333333333336e-06], mom=[[0.9, 0.999]] [2022-12-20 03:15:58,220] [INFO] [timer.py:196:stop] epoch=0/micro_step=3480/global_step=3480, RunningAvgSamplesPerSec=5.021783181313551, CurrSamplesPerSec=4.697068445288297, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 03:18:32,363] [INFO] [logging.py:68:log_dist] [Rank 0] step=3490, skipped=6, lr=[3.371111111111111e-06], mom=[[0.9, 0.999]] [2022-12-20 03:18:32,364] [INFO] [timer.py:196:stop] epoch=0/micro_step=3490/global_step=3490, RunningAvgSamplesPerSec=5.0213357193232415, CurrSamplesPerSec=4.850845469838286, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 03:21:05,658] [INFO] [logging.py:68:log_dist] [Rank 0] step=3500, skipped=6, lr=[3.3488888888888892e-06], mom=[[0.9, 0.999]] [2022-12-20 03:21:05,659] [INFO] [timer.py:196:stop] epoch=0/micro_step=3500/global_step=3500, RunningAvgSamplesPerSec=5.021104853037733, CurrSamplesPerSec=5.175201005614641, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.3488888888888892e-06, 'epoch': 50.0} [2022-12-20 03:23:34,614] [INFO] [logging.py:68:log_dist] [Rank 0] step=3510, skipped=6, lr=[3.326666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 03:23:34,616] [INFO] [timer.py:196:stop] epoch=0/micro_step=3510/global_step=3510, RunningAvgSamplesPerSec=5.021260020015854, CurrSamplesPerSec=5.109705019563074, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 03:26:02,060] [INFO] [logging.py:68:log_dist] [Rank 0] step=3520, skipped=6, lr=[3.3044444444444445e-06], mom=[[0.9, 0.999]] [2022-12-20 03:26:02,062] [INFO] [timer.py:196:stop] epoch=0/micro_step=3520/global_step=3520, RunningAvgSamplesPerSec=5.021535536224096, CurrSamplesPerSec=4.963441904576861, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.2933333333333333e-06, 'epoch': 50.36} [2022-12-20 03:28:35,024] [INFO] [logging.py:68:log_dist] [Rank 0] step=3530, skipped=6, lr=[3.282222222222223e-06], mom=[[0.9, 0.999]] [2022-12-20 03:28:35,026] [INFO] [timer.py:196:stop] epoch=0/micro_step=3530/global_step=3530, RunningAvgSamplesPerSec=5.021174952910811, CurrSamplesPerSec=4.771169222501317, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 03:31:03,479] [INFO] [logging.py:68:log_dist] [Rank 0] step=3540, skipped=6, lr=[3.2600000000000006e-06], mom=[[0.9, 0.999]] [2022-12-20 03:31:03,480] [INFO] [timer.py:196:stop] epoch=0/micro_step=3540/global_step=3540, RunningAvgSamplesPerSec=5.021356333675896, CurrSamplesPerSec=5.1336201202148795, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 03:33:37,593] [INFO] [logging.py:68:log_dist] [Rank 0] step=3550, skipped=6, lr=[3.237777777777778e-06], mom=[[0.9, 0.999]] [2022-12-20 03:33:37,594] [INFO] [timer.py:196:stop] epoch=0/micro_step=3550/global_step=3550, RunningAvgSamplesPerSec=5.0209923301417545, CurrSamplesPerSec=4.8032228159450945, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.237777777777778e-06, 'epoch': 50.71} [2022-12-20 03:36:08,250] [INFO] [logging.py:68:log_dist] [Rank 0] step=3560, skipped=6, lr=[3.2155555555555558e-06], mom=[[0.9, 0.999]] [2022-12-20 03:36:08,252] [INFO] [timer.py:196:stop] epoch=0/micro_step=3560/global_step=3560, RunningAvgSamplesPerSec=5.020966895146841, CurrSamplesPerSec=5.417848791827928, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 03:38:35,914] [INFO] [logging.py:68:log_dist] [Rank 0] step=3570, skipped=6, lr=[3.193333333333334e-06], mom=[[0.9, 0.999]] [2022-12-20 03:38:35,916] [INFO] [timer.py:196:stop] epoch=0/micro_step=3570/global_step=3570, RunningAvgSamplesPerSec=5.021199745560539, CurrSamplesPerSec=5.189035747598663, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.1822222222222226e-06, 'epoch': 51.07} [2022-12-20 03:41:09,887] [INFO] [logging.py:68:log_dist] [Rank 0] step=3580, skipped=6, lr=[3.1711111111111114e-06], mom=[[0.9, 0.999]] [2022-12-20 03:41:09,889] [INFO] [timer.py:196:stop] epoch=0/micro_step=3580/global_step=3580, RunningAvgSamplesPerSec=5.020780373418236, CurrSamplesPerSec=4.836567258368214, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 03:43:43,941] [INFO] [logging.py:68:log_dist] [Rank 0] step=3590, skipped=6, lr=[3.148888888888889e-06], mom=[[0.9, 0.999]] [2022-12-20 03:43:43,943] [INFO] [timer.py:196:stop] epoch=0/micro_step=3590/global_step=3590, RunningAvgSamplesPerSec=5.020349086789647, CurrSamplesPerSec=4.811152887097264, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 03:46:14,664] [INFO] [logging.py:68:log_dist] [Rank 0] step=3600, skipped=6, lr=[3.1266666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 03:46:14,666] [INFO] [timer.py:196:stop] epoch=0/micro_step=3600/global_step=3600, RunningAvgSamplesPerSec=5.020307607070033, CurrSamplesPerSec=5.033031110111137, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.1266666666666667e-06, 'epoch': 51.43} [2022-12-20 03:48:44,894] [INFO] [logging.py:68:log_dist] [Rank 0] step=3610, skipped=6, lr=[3.104444444444445e-06], mom=[[0.9, 0.999]] [2022-12-20 03:48:44,895] [INFO] [timer.py:196:stop] epoch=0/micro_step=3610/global_step=3610, RunningAvgSamplesPerSec=5.020289021142404, CurrSamplesPerSec=5.1586213756487345, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 03:51:12,552] [INFO] [logging.py:68:log_dist] [Rank 0] step=3620, skipped=6, lr=[3.0822222222222227e-06], mom=[[0.9, 0.999]] [2022-12-20 03:51:12,554] [INFO] [timer.py:196:stop] epoch=0/micro_step=3620/global_step=3620, RunningAvgSamplesPerSec=5.020543813282841, CurrSamplesPerSec=5.158831748899703, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.0711111111111115e-06, 'epoch': 51.79} [2022-12-20 03:53:38,443] [INFO] [logging.py:68:log_dist] [Rank 0] step=3630, skipped=6, lr=[3.0600000000000003e-06], mom=[[0.9, 0.999]] [2022-12-20 03:53:38,445] [INFO] [timer.py:196:stop] epoch=0/micro_step=3630/global_step=3630, RunningAvgSamplesPerSec=5.021014302786805, CurrSamplesPerSec=5.244716982610335, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 03:56:03,659] [INFO] [logging.py:68:log_dist] [Rank 0] step=3640, skipped=6, lr=[3.037777777777778e-06], mom=[[0.9, 0.999]] [2022-12-20 03:56:03,660] [INFO] [timer.py:196:stop] epoch=0/micro_step=3640/global_step=3640, RunningAvgSamplesPerSec=5.021504442933683, CurrSamplesPerSec=5.442504694249898, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 03:58:30,311] [INFO] [logging.py:68:log_dist] [Rank 0] step=3650, skipped=6, lr=[3.015555555555556e-06], mom=[[0.9, 0.999]] [2022-12-20 03:58:30,312] [INFO] [timer.py:196:stop] epoch=0/micro_step=3650/global_step=3650, RunningAvgSamplesPerSec=5.021887378964869, CurrSamplesPerSec=5.044828980616264, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.015555555555556e-06, 'epoch': 52.14} [2022-12-20 04:00:57,179] [INFO] [logging.py:68:log_dist] [Rank 0] step=3660, skipped=6, lr=[2.9933333333333336e-06], mom=[[0.9, 0.999]] [2022-12-20 04:00:57,181] [INFO] [timer.py:196:stop] epoch=0/micro_step=3660/global_step=3660, RunningAvgSamplesPerSec=5.02226319641877, CurrSamplesPerSec=5.175864284088572, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 04:03:22,926] [INFO] [logging.py:68:log_dist] [Rank 0] step=3670, skipped=6, lr=[2.9711111111111112e-06], mom=[[0.9, 0.999]] [2022-12-20 04:03:22,927] [INFO] [timer.py:196:stop] epoch=0/micro_step=3670/global_step=3670, RunningAvgSamplesPerSec=5.022750665852945, CurrSamplesPerSec=5.266301154382003, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.96e-06, 'epoch': 52.5} [2022-12-20 04:05:48,717] [INFO] [logging.py:68:log_dist] [Rank 0] step=3680, skipped=6, lr=[2.948888888888889e-06], mom=[[0.9, 0.999]] [2022-12-20 04:05:48,718] [INFO] [timer.py:196:stop] epoch=0/micro_step=3680/global_step=3680, RunningAvgSamplesPerSec=5.023179933908105, CurrSamplesPerSec=5.156948914181069, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 04:08:17,201] [INFO] [logging.py:68:log_dist] [Rank 0] step=3690, skipped=6, lr=[2.9266666666666673e-06], mom=[[0.9, 0.999]] [2022-12-20 04:08:17,202] [INFO] [timer.py:196:stop] epoch=0/micro_step=3690/global_step=3690, RunningAvgSamplesPerSec=5.023450580454851, CurrSamplesPerSec=5.209431279570267, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 04:10:41,168] [INFO] [logging.py:68:log_dist] [Rank 0] step=3700, skipped=6, lr=[2.904444444444445e-06], mom=[[0.9, 0.999]] [2022-12-20 04:10:41,170] [INFO] [timer.py:196:stop] epoch=0/micro_step=3700/global_step=3700, RunningAvgSamplesPerSec=5.024155608459135, CurrSamplesPerSec=5.400806545363619, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.904444444444445e-06, 'epoch': 52.86} [2022-12-20 04:13:01,639] [INFO] [logging.py:68:log_dist] [Rank 0] step=3710, skipped=6, lr=[2.8822222222222225e-06], mom=[[0.9, 0.999]] [2022-12-20 04:13:01,640] [INFO] [timer.py:196:stop] epoch=0/micro_step=3710/global_step=3710, RunningAvgSamplesPerSec=5.025176566364872, CurrSamplesPerSec=5.572944957849582, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 04:15:28,880] [INFO] [logging.py:68:log_dist] [Rank 0] step=3720, skipped=6, lr=[2.86e-06], mom=[[0.9, 0.999]] [2022-12-20 04:15:28,882] [INFO] [timer.py:196:stop] epoch=0/micro_step=3720/global_step=3720, RunningAvgSamplesPerSec=5.025486477194859, CurrSamplesPerSec=5.093894625605406, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.8488888888888894e-06, 'epoch': 53.21} [2022-12-20 04:17:49,939] [INFO] [logging.py:68:log_dist] [Rank 0] step=3730, skipped=6, lr=[2.837777777777778e-06], mom=[[0.9, 0.999]] [2022-12-20 04:17:49,941] [INFO] [timer.py:196:stop] epoch=0/micro_step=3730/global_step=3730, RunningAvgSamplesPerSec=5.02646865664269, CurrSamplesPerSec=5.477750536846597, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 04:20:15,258] [INFO] [logging.py:68:log_dist] [Rank 0] step=3740, skipped=6, lr=[2.815555555555556e-06], mom=[[0.9, 0.999]] [2022-12-20 04:20:15,259] [INFO] [timer.py:196:stop] epoch=0/micro_step=3740/global_step=3740, RunningAvgSamplesPerSec=5.026993730402327, CurrSamplesPerSec=5.186795143590519, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 04:22:41,188] [INFO] [logging.py:68:log_dist] [Rank 0] step=3750, skipped=6, lr=[2.7933333333333334e-06], mom=[[0.9, 0.999]] [2022-12-20 04:22:41,190] [INFO] [timer.py:196:stop] epoch=0/micro_step=3750/global_step=3750, RunningAvgSamplesPerSec=5.027447785923126, CurrSamplesPerSec=5.177586687999467, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.7933333333333334e-06, 'epoch': 53.57} [2022-12-20 04:25:07,312] [INFO] [logging.py:68:log_dist] [Rank 0] step=3760, skipped=6, lr=[2.771111111111111e-06], mom=[[0.9, 0.999]] [2022-12-20 04:25:07,314] [INFO] [timer.py:196:stop] epoch=0/micro_step=3760/global_step=3760, RunningAvgSamplesPerSec=5.027896698630529, CurrSamplesPerSec=5.194247237040942, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 04:27:32,511] [INFO] [logging.py:68:log_dist] [Rank 0] step=3770, skipped=6, lr=[2.748888888888889e-06], mom=[[0.9, 0.999]] [2022-12-20 04:27:32,513] [INFO] [timer.py:196:stop] epoch=0/micro_step=3770/global_step=3770, RunningAvgSamplesPerSec=5.028361731968657, CurrSamplesPerSec=5.233565275989457, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.7377777777777783e-06, 'epoch': 53.93} [2022-12-20 04:29:58,724] [INFO] [logging.py:68:log_dist] [Rank 0] step=3780, skipped=6, lr=[2.726666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 04:29:58,725] [INFO] [timer.py:196:stop] epoch=0/micro_step=3780/global_step=3780, RunningAvgSamplesPerSec=5.028746024654157, CurrSamplesPerSec=5.256519744739989, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 04:32:25,797] [INFO] [logging.py:68:log_dist] [Rank 0] step=3790, skipped=6, lr=[2.7044444444444447e-06], mom=[[0.9, 0.999]] [2022-12-20 04:32:25,798] [INFO] [timer.py:196:stop] epoch=0/micro_step=3790/global_step=3790, RunningAvgSamplesPerSec=5.029074062073731, CurrSamplesPerSec=5.087535000375166, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 04:34:52,804] [INFO] [logging.py:68:log_dist] [Rank 0] step=3800, skipped=6, lr=[2.6822222222222223e-06], mom=[[0.9, 0.999]] [2022-12-20 04:34:52,805] [INFO] [timer.py:196:stop] epoch=0/micro_step=3800/global_step=3800, RunningAvgSamplesPerSec=5.029438184394117, CurrSamplesPerSec=5.30480101511469, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.6822222222222223e-06, 'epoch': 54.29} [2022-12-20 04:37:17,870] [INFO] [logging.py:68:log_dist] [Rank 0] step=3810, skipped=6, lr=[2.6600000000000004e-06], mom=[[0.9, 0.999]] [2022-12-20 04:37:17,871] [INFO] [timer.py:196:stop] epoch=0/micro_step=3810/global_step=3810, RunningAvgSamplesPerSec=5.029967397170501, CurrSamplesPerSec=5.061086466083689, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 04:39:46,226] [INFO] [logging.py:68:log_dist] [Rank 0] step=3820, skipped=6, lr=[2.637777777777778e-06], mom=[[0.9, 0.999]] [2022-12-20 04:39:46,227] [INFO] [timer.py:196:stop] epoch=0/micro_step=3820/global_step=3820, RunningAvgSamplesPerSec=5.030143967156818, CurrSamplesPerSec=5.040433735500809, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.6266666666666668e-06, 'epoch': 54.64} [2022-12-20 04:42:13,269] [INFO] [logging.py:68:log_dist] [Rank 0] step=3830, skipped=6, lr=[2.6155555555555556e-06], mom=[[0.9, 0.999]] [2022-12-20 04:42:13,270] [INFO] [timer.py:196:stop] epoch=0/micro_step=3830/global_step=3830, RunningAvgSamplesPerSec=5.030496972716335, CurrSamplesPerSec=5.0892962707836595, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 04:44:34,950] [INFO] [logging.py:68:log_dist] [Rank 0] step=3840, skipped=6, lr=[2.5933333333333336e-06], mom=[[0.9, 0.999]] [2022-12-20 04:44:34,951] [INFO] [timer.py:196:stop] epoch=0/micro_step=3840/global_step=3840, RunningAvgSamplesPerSec=5.031346325629816, CurrSamplesPerSec=5.6023436575508425, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 04:46:57,172] [INFO] [logging.py:68:log_dist] [Rank 0] step=3850, skipped=6, lr=[2.5711111111111112e-06], mom=[[0.9, 0.999]] [2022-12-20 04:46:57,173] [INFO] [timer.py:196:stop] epoch=0/micro_step=3850/global_step=3850, RunningAvgSamplesPerSec=5.032155181340912, CurrSamplesPerSec=5.437804914413287, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.5711111111111112e-06, 'epoch': 55.0} [2022-12-20 04:49:21,798] [INFO] [logging.py:68:log_dist] [Rank 0] step=3860, skipped=6, lr=[2.5488888888888893e-06], mom=[[0.9, 0.999]] [2022-12-20 04:49:21,800] [INFO] [timer.py:196:stop] epoch=0/micro_step=3860/global_step=3860, RunningAvgSamplesPerSec=5.032747341875238, CurrSamplesPerSec=5.111648800094207, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 04:51:46,829] [INFO] [logging.py:68:log_dist] [Rank 0] step=3870, skipped=6, lr=[2.526666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 04:51:46,830] [INFO] [timer.py:196:stop] epoch=0/micro_step=3870/global_step=3870, RunningAvgSamplesPerSec=5.033401435078496, CurrSamplesPerSec=5.408632181237298, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.5155555555555557e-06, 'epoch': 55.36} [2022-12-20 04:54:07,001] [INFO] [logging.py:68:log_dist] [Rank 0] step=3880, skipped=6, lr=[2.504444444444445e-06], mom=[[0.9, 0.999]] [2022-12-20 04:54:07,002] [INFO] [timer.py:196:stop] epoch=0/micro_step=3880/global_step=3880, RunningAvgSamplesPerSec=5.034569704497929, CurrSamplesPerSec=5.424155049886138, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 04:56:26,657] [INFO] [logging.py:68:log_dist] [Rank 0] step=3890, skipped=6, lr=[2.4822222222222225e-06], mom=[[0.9, 0.999]] [2022-12-20 04:56:26,659] [INFO] [timer.py:196:stop] epoch=0/micro_step=3890/global_step=3890, RunningAvgSamplesPerSec=5.035784002353498, CurrSamplesPerSec=5.575746512850949, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 04:58:46,813] [INFO] [logging.py:68:log_dist] [Rank 0] step=3900, skipped=6, lr=[2.46e-06], mom=[[0.9, 0.999]] [2022-12-20 04:58:46,815] [INFO] [timer.py:196:stop] epoch=0/micro_step=3900/global_step=3900, RunningAvgSamplesPerSec=5.036970868267857, CurrSamplesPerSec=5.571676606976653, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.46e-06, 'epoch': 55.71} [2022-12-20 05:01:08,974] [INFO] [logging.py:68:log_dist] [Rank 0] step=3910, skipped=6, lr=[2.437777777777778e-06], mom=[[0.9, 0.999]] [2022-12-20 05:01:08,975] [INFO] [timer.py:196:stop] epoch=0/micro_step=3910/global_step=3910, RunningAvgSamplesPerSec=5.038019236661488, CurrSamplesPerSec=5.640549183284285, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 05:03:28,700] [INFO] [logging.py:68:log_dist] [Rank 0] step=3920, skipped=6, lr=[2.415555555555556e-06], mom=[[0.9, 0.999]] [2022-12-20 05:03:28,701] [INFO] [timer.py:196:stop] epoch=0/micro_step=3920/global_step=3920, RunningAvgSamplesPerSec=5.039116343902467, CurrSamplesPerSec=5.5876312844457, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.4044444444444446e-06, 'epoch': 56.07} [2022-12-20 05:05:52,728] [INFO] [logging.py:68:log_dist] [Rank 0] step=3930, skipped=6, lr=[2.3933333333333334e-06], mom=[[0.9, 0.999]] [2022-12-20 05:05:52,730] [INFO] [timer.py:196:stop] epoch=0/micro_step=3930/global_step=3930, RunningAvgSamplesPerSec=5.039683225300292, CurrSamplesPerSec=5.160122717807519, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 05:08:18,747] [INFO] [logging.py:68:log_dist] [Rank 0] step=3940, skipped=6, lr=[2.371111111111111e-06], mom=[[0.9, 0.999]] [2022-12-20 05:08:18,749] [INFO] [timer.py:196:stop] epoch=0/micro_step=3940/global_step=3940, RunningAvgSamplesPerSec=5.040075715280884, CurrSamplesPerSec=5.238380926506598, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 05:10:44,757] [INFO] [logging.py:68:log_dist] [Rank 0] step=3950, skipped=6, lr=[2.348888888888889e-06], mom=[[0.9, 0.999]] [2022-12-20 05:10:44,759] [INFO] [timer.py:196:stop] epoch=0/micro_step=3950/global_step=3950, RunningAvgSamplesPerSec=5.040560949881336, CurrSamplesPerSec=5.267823659463463, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.348888888888889e-06, 'epoch': 56.43} [2022-12-20 05:13:08,651] [INFO] [logging.py:68:log_dist] [Rank 0] step=3960, skipped=6, lr=[2.3266666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 05:13:08,652] [INFO] [timer.py:196:stop] epoch=0/micro_step=3960/global_step=3960, RunningAvgSamplesPerSec=5.0412194992522865, CurrSamplesPerSec=5.200243387288677, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 05:15:36,139] [INFO] [logging.py:68:log_dist] [Rank 0] step=3970, skipped=6, lr=[2.3044444444444447e-06], mom=[[0.9, 0.999]] [2022-12-20 05:15:36,141] [INFO] [timer.py:196:stop] epoch=0/micro_step=3970/global_step=3970, RunningAvgSamplesPerSec=5.0415317780884195, CurrSamplesPerSec=4.9707619477102325, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.2933333333333335e-06, 'epoch': 56.79} [2022-12-20 05:18:09,664] [INFO] [logging.py:68:log_dist] [Rank 0] step=3980, skipped=6, lr=[2.2822222222222223e-06], mom=[[0.9, 0.999]] [2022-12-20 05:18:09,665] [INFO] [timer.py:196:stop] epoch=0/micro_step=3980/global_step=3980, RunningAvgSamplesPerSec=5.041339650106527, CurrSamplesPerSec=4.984928729113024, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 05:20:42,295] [INFO] [logging.py:68:log_dist] [Rank 0] step=3990, skipped=6, lr=[2.2600000000000004e-06], mom=[[0.9, 0.999]] [2022-12-20 05:20:42,297] [INFO] [timer.py:196:stop] epoch=0/micro_step=3990/global_step=3990, RunningAvgSamplesPerSec=5.041122736363477, CurrSamplesPerSec=5.142247817170418, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 05:23:18,112] [INFO] [logging.py:68:log_dist] [Rank 0] step=4000, skipped=6, lr=[2.237777777777778e-06], mom=[[0.9, 0.999]] [2022-12-20 05:23:18,114] [INFO] [timer.py:196:stop] epoch=0/micro_step=4000/global_step=4000, RunningAvgSamplesPerSec=5.04069535036469, CurrSamplesPerSec=4.9986659756607175, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.237777777777778e-06, 'epoch': 57.14} {'eval_loss': 0.469970703125, 'eval_wer': 23.39362208472156, 'eval_runtime': 827.6996, 'eval_samples_per_second': 2.739, 'eval_steps_per_second': 0.086, 'epoch': 57.14} [2022-12-20 05:37:09,346] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step4000 is begin to save! [2022-12-20 05:37:09,358] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: ./checkpoint-4000/global_step4000/mp_rank_00_model_states.pt [2022-12-20 05:37:09,358] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving ./checkpoint-4000/global_step4000/mp_rank_00_model_states.pt... [2022-12-20 05:37:12,799] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved ./checkpoint-4000/global_step4000/mp_rank_00_model_states.pt. [2022-12-20 05:37:12,801] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving ./checkpoint-4000/global_step4000/zero_pp_rank_0_mp_rank_00_optim_states.pt... [2022-12-20 05:37:30,732] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved ./checkpoint-4000/global_step4000/zero_pp_rank_0_mp_rank_00_optim_states.pt. [2022-12-20 05:37:30,734] [INFO] [engine.py:3394:_save_zero_checkpoint] zero checkpoint saved ./checkpoint-4000/global_step4000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2022-12-20 05:37:30,734] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step4000 is ready now! [2022-12-20 05:41:55,491] [INFO] [stage_1_and_2.py:1767:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536.0, reducing to 65536.0 [2022-12-20 05:42:08,160] [INFO] [stage_1_and_2.py:1767:step] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536.0, reducing to 32768.0 [2022-12-20 05:42:35,713] [INFO] [logging.py:68:log_dist] [Rank 0] step=4010, skipped=8, lr=[2.2200000000000003e-06], mom=[[0.9, 0.999]] [2022-12-20 05:42:35,714] [INFO] [timer.py:196:stop] epoch=0/micro_step=4010/global_step=4010, RunningAvgSamplesPerSec=5.042979461159848, CurrSamplesPerSec=5.696590514613029, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 05:45:04,141] [INFO] [logging.py:68:log_dist] [Rank 0] step=4020, skipped=8, lr=[2.197777777777778e-06], mom=[[0.9, 0.999]] [2022-12-20 05:45:04,143] [INFO] [timer.py:196:stop] epoch=0/micro_step=4020/global_step=4020, RunningAvgSamplesPerSec=5.04338680465847, CurrSamplesPerSec=5.146485423364737, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.1866666666666668e-06, 'epoch': 57.5} [2022-12-20 05:47:36,085] [INFO] [logging.py:68:log_dist] [Rank 0] step=4030, skipped=8, lr=[2.1755555555555556e-06], mom=[[0.9, 0.999]] [2022-12-20 05:47:36,086] [INFO] [timer.py:196:stop] epoch=0/micro_step=4030/global_step=4030, RunningAvgSamplesPerSec=5.0433669842670845, CurrSamplesPerSec=5.079788047157066, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 05:50:10,634] [INFO] [logging.py:68:log_dist] [Rank 0] step=4040, skipped=8, lr=[2.153333333333333e-06], mom=[[0.9, 0.999]] [2022-12-20 05:50:10,636] [INFO] [timer.py:196:stop] epoch=0/micro_step=4040/global_step=4040, RunningAvgSamplesPerSec=5.04310395870082, CurrSamplesPerSec=5.0191439155807736, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 05:52:46,940] [INFO] [logging.py:68:log_dist] [Rank 0] step=4050, skipped=8, lr=[2.1311111111111112e-06], mom=[[0.9, 0.999]] [2022-12-20 05:52:46,941] [INFO] [timer.py:196:stop] epoch=0/micro_step=4050/global_step=4050, RunningAvgSamplesPerSec=5.042768502101079, CurrSamplesPerSec=4.842702171084141, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.1311111111111112e-06, 'epoch': 57.86} [2022-12-20 05:55:21,002] [INFO] [logging.py:68:log_dist] [Rank 0] step=4060, skipped=8, lr=[2.108888888888889e-06], mom=[[0.9, 0.999]] [2022-12-20 05:55:21,004] [INFO] [timer.py:196:stop] epoch=0/micro_step=4060/global_step=4060, RunningAvgSamplesPerSec=5.042487681984381, CurrSamplesPerSec=4.99908525471156, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 05:57:52,840] [INFO] [logging.py:68:log_dist] [Rank 0] step=4070, skipped=8, lr=[2.086666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 05:57:52,842] [INFO] [timer.py:196:stop] epoch=0/micro_step=4070/global_step=4070, RunningAvgSamplesPerSec=5.042344945988743, CurrSamplesPerSec=5.111629527255383, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.0755555555555557e-06, 'epoch': 58.21} [2022-12-20 06:00:27,903] [INFO] [logging.py:68:log_dist] [Rank 0] step=4080, skipped=8, lr=[2.064444444444445e-06], mom=[[0.9, 0.999]] [2022-12-20 06:00:27,905] [INFO] [timer.py:196:stop] epoch=0/micro_step=4080/global_step=4080, RunningAvgSamplesPerSec=5.0419804402842665, CurrSamplesPerSec=4.930916151588083, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 06:02:52,754] [INFO] [logging.py:68:log_dist] [Rank 0] step=4090, skipped=8, lr=[2.0422222222222225e-06], mom=[[0.9, 0.999]] [2022-12-20 06:02:52,756] [INFO] [timer.py:196:stop] epoch=0/micro_step=4090/global_step=4090, RunningAvgSamplesPerSec=5.042556557819047, CurrSamplesPerSec=5.363131248975815, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 06:05:17,984] [INFO] [logging.py:68:log_dist] [Rank 0] step=4100, skipped=8, lr=[2.02e-06], mom=[[0.9, 0.999]] [2022-12-20 06:05:17,986] [INFO] [timer.py:196:stop] epoch=0/micro_step=4100/global_step=4100, RunningAvgSamplesPerSec=5.0431467991488805, CurrSamplesPerSec=5.085720797651708, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.02e-06, 'epoch': 58.57} [2022-12-20 06:07:41,649] [INFO] [logging.py:68:log_dist] [Rank 0] step=4110, skipped=8, lr=[1.9977777777777778e-06], mom=[[0.9, 0.999]] [2022-12-20 06:07:41,650] [INFO] [timer.py:196:stop] epoch=0/micro_step=4110/global_step=4110, RunningAvgSamplesPerSec=5.043948241474721, CurrSamplesPerSec=5.412140246565025, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 06:10:10,464] [INFO] [logging.py:68:log_dist] [Rank 0] step=4120, skipped=8, lr=[1.975555555555556e-06], mom=[[0.9, 0.999]] [2022-12-20 06:10:10,465] [INFO] [timer.py:196:stop] epoch=0/micro_step=4120/global_step=4120, RunningAvgSamplesPerSec=5.044459483834623, CurrSamplesPerSec=5.313749224322798, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.9644444444444446e-06, 'epoch': 58.93} [2022-12-20 06:12:39,140] [INFO] [logging.py:68:log_dist] [Rank 0] step=4130, skipped=8, lr=[1.9533333333333334e-06], mom=[[0.9, 0.999]] [2022-12-20 06:12:39,141] [INFO] [timer.py:196:stop] epoch=0/micro_step=4130/global_step=4130, RunningAvgSamplesPerSec=5.0447642989840675, CurrSamplesPerSec=5.2478424910556045, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 06:15:02,533] [INFO] [logging.py:68:log_dist] [Rank 0] step=4140, skipped=8, lr=[1.9311111111111114e-06], mom=[[0.9, 0.999]] [2022-12-20 06:15:02,535] [INFO] [timer.py:196:stop] epoch=0/micro_step=4140/global_step=4140, RunningAvgSamplesPerSec=5.045575952263303, CurrSamplesPerSec=5.5830596404364705, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 06:17:27,095] [INFO] [logging.py:68:log_dist] [Rank 0] step=4150, skipped=8, lr=[1.908888888888889e-06], mom=[[0.9, 0.999]] [2022-12-20 06:17:27,101] [INFO] [timer.py:196:stop] epoch=0/micro_step=4150/global_step=4150, RunningAvgSamplesPerSec=5.046241123558999, CurrSamplesPerSec=5.299716771107708, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.908888888888889e-06, 'epoch': 59.29} [2022-12-20 06:20:25,241] [INFO] [logging.py:68:log_dist] [Rank 0] step=4160, skipped=8, lr=[1.8866666666666669e-06], mom=[[0.9, 0.999]] [2022-12-20 06:20:25,244] [INFO] [timer.py:196:stop] epoch=0/micro_step=4160/global_step=4160, RunningAvgSamplesPerSec=5.043822466629399, CurrSamplesPerSec=5.56617254195275, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 06:22:50,983] [INFO] [logging.py:68:log_dist] [Rank 0] step=4170, skipped=8, lr=[1.8644444444444445e-06], mom=[[0.9, 0.999]] [2022-12-20 06:22:50,984] [INFO] [timer.py:196:stop] epoch=0/micro_step=4170/global_step=4170, RunningAvgSamplesPerSec=5.044478496015336, CurrSamplesPerSec=5.2003204555095985, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.8533333333333333e-06, 'epoch': 59.64} [2022-12-20 06:25:17,347] [INFO] [logging.py:68:log_dist] [Rank 0] step=4180, skipped=8, lr=[1.8422222222222225e-06], mom=[[0.9, 0.999]] [2022-12-20 06:25:17,348] [INFO] [timer.py:196:stop] epoch=0/micro_step=4180/global_step=4180, RunningAvgSamplesPerSec=5.045012622651632, CurrSamplesPerSec=5.299783527305324, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 06:27:42,485] [INFO] [logging.py:68:log_dist] [Rank 0] step=4190, skipped=8, lr=[1.8200000000000002e-06], mom=[[0.9, 0.999]] [2022-12-20 06:27:42,487] [INFO] [timer.py:196:stop] epoch=0/micro_step=4190/global_step=4190, RunningAvgSamplesPerSec=5.045662826265351, CurrSamplesPerSec=5.721540380522979, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 06:30:02,274] [INFO] [logging.py:68:log_dist] [Rank 0] step=4200, skipped=8, lr=[1.797777777777778e-06], mom=[[0.9, 0.999]] [2022-12-20 06:30:02,276] [INFO] [timer.py:196:stop] epoch=0/micro_step=4200/global_step=4200, RunningAvgSamplesPerSec=5.046791681237073, CurrSamplesPerSec=5.624625885538937, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.797777777777778e-06, 'epoch': 60.0} [2022-12-20 06:32:26,649] [INFO] [logging.py:68:log_dist] [Rank 0] step=4210, skipped=8, lr=[1.7755555555555556e-06], mom=[[0.9, 0.999]] [2022-12-20 06:32:26,651] [INFO] [timer.py:196:stop] epoch=0/micro_step=4210/global_step=4210, RunningAvgSamplesPerSec=5.047454122770001, CurrSamplesPerSec=5.141653397468583, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 06:34:52,412] [INFO] [logging.py:68:log_dist] [Rank 0] step=4220, skipped=8, lr=[1.7533333333333336e-06], mom=[[0.9, 0.999]] [2022-12-20 06:34:52,414] [INFO] [timer.py:196:stop] epoch=0/micro_step=4220/global_step=4220, RunningAvgSamplesPerSec=5.047994245909812, CurrSamplesPerSec=5.319056367099886, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.7422222222222224e-06, 'epoch': 60.36} [2022-12-20 06:37:15,987] [INFO] [logging.py:68:log_dist] [Rank 0] step=4230, skipped=8, lr=[1.7311111111111112e-06], mom=[[0.9, 0.999]] [2022-12-20 06:37:15,989] [INFO] [timer.py:196:stop] epoch=0/micro_step=4230/global_step=4230, RunningAvgSamplesPerSec=5.04866732774536, CurrSamplesPerSec=5.388713208948523, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 06:39:39,883] [INFO] [logging.py:68:log_dist] [Rank 0] step=4240, skipped=8, lr=[1.708888888888889e-06], mom=[[0.9, 0.999]] [2022-12-20 06:39:39,884] [INFO] [timer.py:196:stop] epoch=0/micro_step=4240/global_step=4240, RunningAvgSamplesPerSec=5.049283859017793, CurrSamplesPerSec=5.132029172229295, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 06:42:06,810] [INFO] [logging.py:68:log_dist] [Rank 0] step=4250, skipped=8, lr=[1.6866666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 06:42:06,811] [INFO] [timer.py:196:stop] epoch=0/micro_step=4250/global_step=4250, RunningAvgSamplesPerSec=5.049603812984208, CurrSamplesPerSec=5.245656612106342, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.6866666666666667e-06, 'epoch': 60.71} [2022-12-20 06:44:32,622] [INFO] [logging.py:68:log_dist] [Rank 0] step=4260, skipped=8, lr=[1.6644444444444447e-06], mom=[[0.9, 0.999]] [2022-12-20 06:44:32,623] [INFO] [timer.py:196:stop] epoch=0/micro_step=4260/global_step=4260, RunningAvgSamplesPerSec=5.05001146839292, CurrSamplesPerSec=5.207611653499008, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 06:46:59,516] [INFO] [logging.py:68:log_dist] [Rank 0] step=4270, skipped=8, lr=[1.6422222222222223e-06], mom=[[0.9, 0.999]] [2022-12-20 06:46:59,518] [INFO] [timer.py:196:stop] epoch=0/micro_step=4270/global_step=4270, RunningAvgSamplesPerSec=5.050310918921533, CurrSamplesPerSec=5.3487399204314645, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.6311111111111114e-06, 'epoch': 61.07} [2022-12-20 06:49:25,738] [INFO] [logging.py:68:log_dist] [Rank 0] step=4280, skipped=8, lr=[1.6200000000000002e-06], mom=[[0.9, 0.999]] [2022-12-20 06:49:25,739] [INFO] [timer.py:196:stop] epoch=0/micro_step=4280/global_step=4280, RunningAvgSamplesPerSec=5.050636049742784, CurrSamplesPerSec=5.267817766999322, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 06:51:52,463] [INFO] [logging.py:68:log_dist] [Rank 0] step=4290, skipped=8, lr=[1.5977777777777778e-06], mom=[[0.9, 0.999]] [2022-12-20 06:51:52,464] [INFO] [timer.py:196:stop] epoch=0/micro_step=4290/global_step=4290, RunningAvgSamplesPerSec=5.050927321793066, CurrSamplesPerSec=5.091219627389776, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 06:54:20,387] [INFO] [logging.py:68:log_dist] [Rank 0] step=4300, skipped=8, lr=[1.5755555555555558e-06], mom=[[0.9, 0.999]] [2022-12-20 06:54:20,389] [INFO] [timer.py:196:stop] epoch=0/micro_step=4300/global_step=4300, RunningAvgSamplesPerSec=5.051116554864754, CurrSamplesPerSec=5.131412687451037, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.5755555555555558e-06, 'epoch': 61.43} [2022-12-20 06:56:49,899] [INFO] [logging.py:68:log_dist] [Rank 0] step=4310, skipped=8, lr=[1.5533333333333334e-06], mom=[[0.9, 0.999]] [2022-12-20 06:56:49,901] [INFO] [timer.py:196:stop] epoch=0/micro_step=4310/global_step=4310, RunningAvgSamplesPerSec=5.051115029415557, CurrSamplesPerSec=5.207083942120238, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 06:59:24,979] [INFO] [logging.py:68:log_dist] [Rank 0] step=4320, skipped=8, lr=[1.5311111111111113e-06], mom=[[0.9, 0.999]] [2022-12-20 06:59:24,980] [INFO] [timer.py:196:stop] epoch=0/micro_step=4320/global_step=4320, RunningAvgSamplesPerSec=5.050576559473915, CurrSamplesPerSec=4.720912328409414, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.52e-06, 'epoch': 61.79} [2022-12-20 07:02:01,098] [INFO] [logging.py:68:log_dist] [Rank 0] step=4330, skipped=8, lr=[1.5088888888888889e-06], mom=[[0.9, 0.999]] [2022-12-20 07:02:01,099] [INFO] [timer.py:196:stop] epoch=0/micro_step=4330/global_step=4330, RunningAvgSamplesPerSec=5.0499269186777225, CurrSamplesPerSec=4.822616218008508, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 07:04:34,365] [INFO] [logging.py:68:log_dist] [Rank 0] step=4340, skipped=8, lr=[1.486666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 07:04:34,366] [INFO] [timer.py:196:stop] epoch=0/micro_step=4340/global_step=4340, RunningAvgSamplesPerSec=5.049580360469089, CurrSamplesPerSec=5.267244917024575, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 07:07:06,150] [INFO] [logging.py:68:log_dist] [Rank 0] step=4350, skipped=8, lr=[1.4644444444444445e-06], mom=[[0.9, 0.999]] [2022-12-20 07:07:06,152] [INFO] [timer.py:196:stop] epoch=0/micro_step=4350/global_step=4350, RunningAvgSamplesPerSec=5.04936238047649, CurrSamplesPerSec=4.829278603413176, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.4644444444444445e-06, 'epoch': 62.14} [2022-12-20 07:09:42,081] [INFO] [logging.py:68:log_dist] [Rank 0] step=4360, skipped=8, lr=[1.4422222222222223e-06], mom=[[0.9, 0.999]] [2022-12-20 07:09:42,082] [INFO] [timer.py:196:stop] epoch=0/micro_step=4360/global_step=4360, RunningAvgSamplesPerSec=5.048784732951024, CurrSamplesPerSec=4.751107522016899, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 07:12:19,735] [INFO] [logging.py:68:log_dist] [Rank 0] step=4370, skipped=8, lr=[1.42e-06], mom=[[0.9, 0.999]] [2022-12-20 07:12:19,736] [INFO] [timer.py:196:stop] epoch=0/micro_step=4370/global_step=4370, RunningAvgSamplesPerSec=5.048028850314156, CurrSamplesPerSec=4.923453726629133, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.4088888888888892e-06, 'epoch': 62.5} [2022-12-20 07:14:51,218] [INFO] [logging.py:68:log_dist] [Rank 0] step=4380, skipped=8, lr=[1.397777777777778e-06], mom=[[0.9, 0.999]] [2022-12-20 07:14:51,219] [INFO] [timer.py:196:stop] epoch=0/micro_step=4380/global_step=4380, RunningAvgSamplesPerSec=5.047867689289667, CurrSamplesPerSec=4.890229593461905, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 07:17:28,428] [INFO] [logging.py:68:log_dist] [Rank 0] step=4390, skipped=8, lr=[1.3755555555555556e-06], mom=[[0.9, 0.999]] [2022-12-20 07:17:28,430] [INFO] [timer.py:196:stop] epoch=0/micro_step=4390/global_step=4390, RunningAvgSamplesPerSec=5.047159814157629, CurrSamplesPerSec=4.948499742634576, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 07:20:06,640] [INFO] [logging.py:68:log_dist] [Rank 0] step=4400, skipped=8, lr=[1.3533333333333334e-06], mom=[[0.9, 0.999]] [2022-12-20 07:20:06,642] [INFO] [timer.py:196:stop] epoch=0/micro_step=4400/global_step=4400, RunningAvgSamplesPerSec=5.046366139085683, CurrSamplesPerSec=4.623666005682293, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.3533333333333334e-06, 'epoch': 62.86} [2022-12-20 07:22:42,638] [INFO] [logging.py:68:log_dist] [Rank 0] step=4410, skipped=8, lr=[1.3311111111111113e-06], mom=[[0.9, 0.999]] [2022-12-20 07:22:42,640] [INFO] [timer.py:196:stop] epoch=0/micro_step=4410/global_step=4410, RunningAvgSamplesPerSec=5.045758620454289, CurrSamplesPerSec=5.076388247096151, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 07:25:17,610] [INFO] [logging.py:68:log_dist] [Rank 0] step=4420, skipped=8, lr=[1.308888888888889e-06], mom=[[0.9, 0.999]] [2022-12-20 07:25:17,612] [INFO] [timer.py:196:stop] epoch=0/micro_step=4420/global_step=4420, RunningAvgSamplesPerSec=5.045221775215709, CurrSamplesPerSec=4.847442318721658, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.2977777777777779e-06, 'epoch': 63.21} [2022-12-20 07:27:55,202] [INFO] [logging.py:68:log_dist] [Rank 0] step=4430, skipped=8, lr=[1.286666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 07:27:55,203] [INFO] [timer.py:196:stop] epoch=0/micro_step=4430/global_step=4430, RunningAvgSamplesPerSec=5.0444825303875485, CurrSamplesPerSec=4.71763327003565, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 07:30:32,300] [INFO] [logging.py:68:log_dist] [Rank 0] step=4440, skipped=8, lr=[1.2644444444444445e-06], mom=[[0.9, 0.999]] [2022-12-20 07:30:32,302] [INFO] [timer.py:196:stop] epoch=0/micro_step=4440/global_step=4440, RunningAvgSamplesPerSec=5.043765816300071, CurrSamplesPerSec=4.756459465106135, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 07:33:08,239] [INFO] [logging.py:68:log_dist] [Rank 0] step=4450, skipped=8, lr=[1.2422222222222224e-06], mom=[[0.9, 0.999]] [2022-12-20 07:33:08,241] [INFO] [timer.py:196:stop] epoch=0/micro_step=4450/global_step=4450, RunningAvgSamplesPerSec=5.0432365230532445, CurrSamplesPerSec=5.085449771164148, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.2422222222222224e-06, 'epoch': 63.57} [2022-12-20 07:35:43,710] [INFO] [logging.py:68:log_dist] [Rank 0] step=4460, skipped=8, lr=[1.2200000000000002e-06], mom=[[0.9, 0.999]] [2022-12-20 07:35:43,711] [INFO] [timer.py:196:stop] epoch=0/micro_step=4460/global_step=4460, RunningAvgSamplesPerSec=5.042712521939606, CurrSamplesPerSec=4.778885074217907, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 07:38:17,792] [INFO] [logging.py:68:log_dist] [Rank 0] step=4470, skipped=8, lr=[1.1977777777777778e-06], mom=[[0.9, 0.999]] [2022-12-20 07:38:17,794] [INFO] [timer.py:196:stop] epoch=0/micro_step=4470/global_step=4470, RunningAvgSamplesPerSec=5.042281683097091, CurrSamplesPerSec=5.02905169577589, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.1866666666666668e-06, 'epoch': 63.93} [2022-12-20 07:40:49,311] [INFO] [logging.py:68:log_dist] [Rank 0] step=4480, skipped=8, lr=[1.1755555555555556e-06], mom=[[0.9, 0.999]] [2022-12-20 07:40:49,312] [INFO] [timer.py:196:stop] epoch=0/micro_step=4480/global_step=4480, RunningAvgSamplesPerSec=5.042148780780479, CurrSamplesPerSec=5.1255290111304, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 07:43:11,925] [INFO] [logging.py:68:log_dist] [Rank 0] step=4490, skipped=8, lr=[1.1533333333333334e-06], mom=[[0.9, 0.999]] [2022-12-20 07:43:11,926] [INFO] [timer.py:196:stop] epoch=0/micro_step=4490/global_step=4490, RunningAvgSamplesPerSec=5.042788143087416, CurrSamplesPerSec=5.203591461607821, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 07:45:36,568] [INFO] [logging.py:68:log_dist] [Rank 0] step=4500, skipped=8, lr=[1.131111111111111e-06], mom=[[0.9, 0.999]] [2022-12-20 07:45:36,570] [INFO] [timer.py:196:stop] epoch=0/micro_step=4500/global_step=4500, RunningAvgSamplesPerSec=5.043178181242045, CurrSamplesPerSec=5.263032480306624, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.131111111111111e-06, 'epoch': 64.29} [2022-12-20 07:48:03,486] [INFO] [logging.py:68:log_dist] [Rank 0] step=4510, skipped=8, lr=[1.1088888888888889e-06], mom=[[0.9, 0.999]] [2022-12-20 07:48:03,488] [INFO] [timer.py:196:stop] epoch=0/micro_step=4510/global_step=4510, RunningAvgSamplesPerSec=5.043365421470672, CurrSamplesPerSec=4.939414436159629, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 07:50:29,784] [INFO] [logging.py:68:log_dist] [Rank 0] step=4520, skipped=8, lr=[1.0866666666666667e-06], mom=[[0.9, 0.999]] [2022-12-20 07:50:29,785] [INFO] [timer.py:196:stop] epoch=0/micro_step=4520/global_step=4520, RunningAvgSamplesPerSec=5.043697590182427, CurrSamplesPerSec=5.449464959209118, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.0755555555555557e-06, 'epoch': 64.64} [2022-12-20 07:52:56,799] [INFO] [logging.py:68:log_dist] [Rank 0] step=4530, skipped=8, lr=[1.0644444444444445e-06], mom=[[0.9, 0.999]] [2022-12-20 07:52:56,801] [INFO] [timer.py:196:stop] epoch=0/micro_step=4530/global_step=4530, RunningAvgSamplesPerSec=5.043944921085424, CurrSamplesPerSec=4.848887789663811, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 07:55:33,034] [INFO] [logging.py:68:log_dist] [Rank 0] step=4540, skipped=8, lr=[1.0422222222222221e-06], mom=[[0.9, 0.999]] [2022-12-20 07:55:33,036] [INFO] [timer.py:196:stop] epoch=0/micro_step=4540/global_step=4540, RunningAvgSamplesPerSec=5.043375303491235, CurrSamplesPerSec=4.778829859739575, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 07:58:03,913] [INFO] [logging.py:68:log_dist] [Rank 0] step=4550, skipped=8, lr=[1.02e-06], mom=[[0.9, 0.999]] [2022-12-20 07:58:03,914] [INFO] [timer.py:196:stop] epoch=0/micro_step=4550/global_step=4550, RunningAvgSamplesPerSec=5.043328963013769, CurrSamplesPerSec=5.608318938320819, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.02e-06, 'epoch': 65.0} [2022-12-20 08:00:25,713] [INFO] [logging.py:68:log_dist] [Rank 0] step=4560, skipped=8, lr=[9.97777777777778e-07], mom=[[0.9, 0.999]] [2022-12-20 08:00:25,714] [INFO] [timer.py:196:stop] epoch=0/micro_step=4560/global_step=4560, RunningAvgSamplesPerSec=5.044098201656415, CurrSamplesPerSec=5.341246991766891, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 08:02:46,105] [INFO] [logging.py:68:log_dist] [Rank 0] step=4570, skipped=8, lr=[9.755555555555556e-07], mom=[[0.9, 0.999]] [2022-12-20 08:02:46,106] [INFO] [timer.py:196:stop] epoch=0/micro_step=4570/global_step=4570, RunningAvgSamplesPerSec=5.044950401998271, CurrSamplesPerSec=5.438711647812863, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 9.644444444444444e-07, 'epoch': 65.36} [2022-12-20 08:05:10,797] [INFO] [logging.py:68:log_dist] [Rank 0] step=4580, skipped=8, lr=[9.533333333333335e-07], mom=[[0.9, 0.999]] [2022-12-20 08:05:10,798] [INFO] [timer.py:196:stop] epoch=0/micro_step=4580/global_step=4580, RunningAvgSamplesPerSec=5.045476504759206, CurrSamplesPerSec=5.188928621390404, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 08:07:36,800] [INFO] [logging.py:68:log_dist] [Rank 0] step=4590, skipped=8, lr=[9.311111111111113e-07], mom=[[0.9, 0.999]] [2022-12-20 08:07:36,802] [INFO] [timer.py:196:stop] epoch=0/micro_step=4590/global_step=4590, RunningAvgSamplesPerSec=5.045847427660788, CurrSamplesPerSec=5.12575734566362, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 08:10:02,459] [INFO] [logging.py:68:log_dist] [Rank 0] step=4600, skipped=8, lr=[9.08888888888889e-07], mom=[[0.9, 0.999]] [2022-12-20 08:10:02,460] [INFO] [timer.py:196:stop] epoch=0/micro_step=4600/global_step=4600, RunningAvgSamplesPerSec=5.046269260328224, CurrSamplesPerSec=5.21262751846709, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 9.08888888888889e-07, 'epoch': 65.71} [2022-12-20 08:12:26,062] [INFO] [logging.py:68:log_dist] [Rank 0] step=4610, skipped=8, lr=[8.866666666666668e-07], mom=[[0.9, 0.999]] [2022-12-20 08:12:26,063] [INFO] [timer.py:196:stop] epoch=0/micro_step=4610/global_step=4610, RunningAvgSamplesPerSec=5.046797422211321, CurrSamplesPerSec=5.262579211303562, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 08:14:48,783] [INFO] [logging.py:68:log_dist] [Rank 0] step=4620, skipped=8, lr=[8.644444444444445e-07], mom=[[0.9, 0.999]] [2022-12-20 08:14:48,785] [INFO] [timer.py:196:stop] epoch=0/micro_step=4620/global_step=4620, RunningAvgSamplesPerSec=5.0474143889414895, CurrSamplesPerSec=5.478198811861338, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 8.533333333333334e-07, 'epoch': 66.07} [2022-12-20 08:17:14,091] [INFO] [logging.py:68:log_dist] [Rank 0] step=4630, skipped=8, lr=[8.422222222222224e-07], mom=[[0.9, 0.999]] [2022-12-20 08:17:14,092] [INFO] [timer.py:196:stop] epoch=0/micro_step=4630/global_step=4630, RunningAvgSamplesPerSec=5.047884003074243, CurrSamplesPerSec=5.288036006059478, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 08:19:37,655] [INFO] [logging.py:68:log_dist] [Rank 0] step=4640, skipped=8, lr=[8.200000000000001e-07], mom=[[0.9, 0.999]] [2022-12-20 08:19:37,657] [INFO] [timer.py:196:stop] epoch=0/micro_step=4640/global_step=4640, RunningAvgSamplesPerSec=5.048458691418149, CurrSamplesPerSec=5.257712499872688, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 08:22:00,831] [INFO] [logging.py:68:log_dist] [Rank 0] step=4650, skipped=8, lr=[7.977777777777779e-07], mom=[[0.9, 0.999]] [2022-12-20 08:22:00,833] [INFO] [timer.py:196:stop] epoch=0/micro_step=4650/global_step=4650, RunningAvgSamplesPerSec=5.049032023727786, CurrSamplesPerSec=5.335302655950034, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 7.977777777777779e-07, 'epoch': 66.43} [2022-12-20 08:24:23,381] [INFO] [logging.py:68:log_dist] [Rank 0] step=4660, skipped=8, lr=[7.755555555555556e-07], mom=[[0.9, 0.999]] [2022-12-20 08:24:23,382] [INFO] [timer.py:196:stop] epoch=0/micro_step=4660/global_step=4660, RunningAvgSamplesPerSec=5.049679844711924, CurrSamplesPerSec=5.473512879448466, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 08:26:48,255] [INFO] [logging.py:68:log_dist] [Rank 0] step=4670, skipped=8, lr=[7.533333333333335e-07], mom=[[0.9, 0.999]] [2022-12-20 08:26:48,257] [INFO] [timer.py:196:stop] epoch=0/micro_step=4670/global_step=4670, RunningAvgSamplesPerSec=5.050125782334478, CurrSamplesPerSec=5.377273288074986, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 7.422222222222223e-07, 'epoch': 66.79} [2022-12-20 08:29:15,132] [INFO] [logging.py:68:log_dist] [Rank 0] step=4680, skipped=8, lr=[7.311111111111112e-07], mom=[[0.9, 0.999]] [2022-12-20 08:29:15,134] [INFO] [timer.py:196:stop] epoch=0/micro_step=4680/global_step=4680, RunningAvgSamplesPerSec=5.050512474845127, CurrSamplesPerSec=5.138506888939374, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 08:31:41,228] [INFO] [logging.py:68:log_dist] [Rank 0] step=4690, skipped=8, lr=[7.08888888888889e-07], mom=[[0.9, 0.999]] [2022-12-20 08:31:41,230] [INFO] [timer.py:196:stop] epoch=0/micro_step=4690/global_step=4690, RunningAvgSamplesPerSec=5.050892558272724, CurrSamplesPerSec=5.392371818165035, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 08:34:05,021] [INFO] [logging.py:68:log_dist] [Rank 0] step=4700, skipped=8, lr=[6.866666666666667e-07], mom=[[0.9, 0.999]] [2022-12-20 08:34:05,022] [INFO] [timer.py:196:stop] epoch=0/micro_step=4700/global_step=4700, RunningAvgSamplesPerSec=5.051463391225271, CurrSamplesPerSec=5.1622127629198085, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 6.866666666666667e-07, 'epoch': 67.14} [2022-12-20 08:36:30,437] [INFO] [logging.py:68:log_dist] [Rank 0] step=4710, skipped=8, lr=[6.644444444444446e-07], mom=[[0.9, 0.999]] [2022-12-20 08:36:30,438] [INFO] [timer.py:196:stop] epoch=0/micro_step=4710/global_step=4710, RunningAvgSamplesPerSec=5.051893912055331, CurrSamplesPerSec=5.445218800319529, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 08:38:59,891] [INFO] [logging.py:68:log_dist] [Rank 0] step=4720, skipped=8, lr=[6.422222222222223e-07], mom=[[0.9, 0.999]] [2022-12-20 08:38:59,893] [INFO] [timer.py:196:stop] epoch=0/micro_step=4720/global_step=4720, RunningAvgSamplesPerSec=5.052178801787182, CurrSamplesPerSec=5.115131684190571, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 6.311111111111112e-07, 'epoch': 67.5} [2022-12-20 08:41:26,544] [INFO] [logging.py:68:log_dist] [Rank 0] step=4730, skipped=8, lr=[6.200000000000001e-07], mom=[[0.9, 0.999]] [2022-12-20 08:41:26,545] [INFO] [timer.py:196:stop] epoch=0/micro_step=4730/global_step=4730, RunningAvgSamplesPerSec=5.05254451772132, CurrSamplesPerSec=5.304123565487315, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 08:43:52,849] [INFO] [logging.py:68:log_dist] [Rank 0] step=4740, skipped=8, lr=[5.977777777777778e-07], mom=[[0.9, 0.999]] [2022-12-20 08:43:52,851] [INFO] [timer.py:196:stop] epoch=0/micro_step=4740/global_step=4740, RunningAvgSamplesPerSec=5.053035885215057, CurrSamplesPerSec=5.329902208617103, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 08:46:17,184] [INFO] [logging.py:68:log_dist] [Rank 0] step=4750, skipped=8, lr=[5.755555555555555e-07], mom=[[0.9, 0.999]] [2022-12-20 08:46:17,185] [INFO] [timer.py:196:stop] epoch=0/micro_step=4750/global_step=4750, RunningAvgSamplesPerSec=5.053785097454139, CurrSamplesPerSec=5.481972930186799, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 5.755555555555555e-07, 'epoch': 67.86} [2022-12-20 08:48:33,665] [INFO] [logging.py:68:log_dist] [Rank 0] step=4760, skipped=8, lr=[5.533333333333334e-07], mom=[[0.9, 0.999]] [2022-12-20 08:48:33,666] [INFO] [timer.py:196:stop] epoch=0/micro_step=4760/global_step=4760, RunningAvgSamplesPerSec=5.055487957320154, CurrSamplesPerSec=6.146145092417891, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 08:50:52,761] [INFO] [logging.py:68:log_dist] [Rank 0] step=4770, skipped=8, lr=[5.311111111111111e-07], mom=[[0.9, 0.999]] [2022-12-20 08:50:52,762] [INFO] [timer.py:196:stop] epoch=0/micro_step=4770/global_step=4770, RunningAvgSamplesPerSec=5.056655527182182, CurrSamplesPerSec=5.561989132264451, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 5.2e-07, 'epoch': 68.21} [2022-12-20 08:53:12,380] [INFO] [logging.py:68:log_dist] [Rank 0] step=4780, skipped=8, lr=[5.088888888888889e-07], mom=[[0.9, 0.999]] [2022-12-20 08:53:12,382] [INFO] [timer.py:196:stop] epoch=0/micro_step=4780/global_step=4780, RunningAvgSamplesPerSec=5.057621824492988, CurrSamplesPerSec=5.652548091745247, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 08:55:31,207] [INFO] [logging.py:68:log_dist] [Rank 0] step=4790, skipped=8, lr=[4.866666666666666e-07], mom=[[0.9, 0.999]] [2022-12-20 08:55:31,208] [INFO] [timer.py:196:stop] epoch=0/micro_step=4790/global_step=4790, RunningAvgSamplesPerSec=5.058667203586027, CurrSamplesPerSec=5.664042371231427, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 08:57:50,720] [INFO] [logging.py:68:log_dist] [Rank 0] step=4800, skipped=8, lr=[4.6444444444444446e-07], mom=[[0.9, 0.999]] [2022-12-20 08:57:50,721] [INFO] [timer.py:196:stop] epoch=0/micro_step=4800/global_step=4800, RunningAvgSamplesPerSec=5.059570925383668, CurrSamplesPerSec=5.519459240106216, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.6444444444444446e-07, 'epoch': 68.57} [2022-12-20 09:00:12,990] [INFO] [logging.py:68:log_dist] [Rank 0] step=4810, skipped=8, lr=[4.422222222222223e-07], mom=[[0.9, 0.999]] [2022-12-20 09:00:12,992] [INFO] [timer.py:196:stop] epoch=0/micro_step=4810/global_step=4810, RunningAvgSamplesPerSec=5.060229523268901, CurrSamplesPerSec=5.224763032139902, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 09:02:36,045] [INFO] [logging.py:68:log_dist] [Rank 0] step=4820, skipped=8, lr=[4.2000000000000006e-07], mom=[[0.9, 0.999]] [2022-12-20 09:02:36,046] [INFO] [timer.py:196:stop] epoch=0/micro_step=4820/global_step=4820, RunningAvgSamplesPerSec=5.0608145350618585, CurrSamplesPerSec=5.335017418001835, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 4.0888888888888897e-07, 'epoch': 68.93} [2022-12-20 09:05:03,069] [INFO] [logging.py:68:log_dist] [Rank 0] step=4830, skipped=8, lr=[3.9777777777777783e-07], mom=[[0.9, 0.999]] [2022-12-20 09:05:03,071] [INFO] [timer.py:196:stop] epoch=0/micro_step=4830/global_step=4830, RunningAvgSamplesPerSec=5.061128322114403, CurrSamplesPerSec=5.27507112151907, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 09:07:29,555] [INFO] [logging.py:68:log_dist] [Rank 0] step=4840, skipped=8, lr=[3.755555555555556e-07], mom=[[0.9, 0.999]] [2022-12-20 09:07:29,557] [INFO] [timer.py:196:stop] epoch=0/micro_step=4840/global_step=4840, RunningAvgSamplesPerSec=5.061443297556752, CurrSamplesPerSec=5.330757008706792, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 09:09:54,208] [INFO] [logging.py:68:log_dist] [Rank 0] step=4850, skipped=8, lr=[3.533333333333334e-07], mom=[[0.9, 0.999]] [2022-12-20 09:09:54,210] [INFO] [timer.py:196:stop] epoch=0/micro_step=4850/global_step=4850, RunningAvgSamplesPerSec=5.061879695879943, CurrSamplesPerSec=5.24628147577871, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 3.533333333333334e-07, 'epoch': 69.29} [2022-12-20 09:12:18,737] [INFO] [logging.py:68:log_dist] [Rank 0] step=4860, skipped=8, lr=[3.3111111111111115e-07], mom=[[0.9, 0.999]] [2022-12-20 09:12:18,738] [INFO] [timer.py:196:stop] epoch=0/micro_step=4860/global_step=4860, RunningAvgSamplesPerSec=5.062324568211323, CurrSamplesPerSec=5.235211029343304, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 09:14:42,180] [INFO] [logging.py:68:log_dist] [Rank 0] step=4870, skipped=8, lr=[3.088888888888889e-07], mom=[[0.9, 0.999]] [2022-12-20 09:14:42,181] [INFO] [timer.py:196:stop] epoch=0/micro_step=4870/global_step=4870, RunningAvgSamplesPerSec=5.062819565868312, CurrSamplesPerSec=5.294168184906786, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.977777777777778e-07, 'epoch': 69.64} [2022-12-20 09:17:07,652] [INFO] [logging.py:68:log_dist] [Rank 0] step=4880, skipped=8, lr=[2.866666666666667e-07], mom=[[0.9, 0.999]] [2022-12-20 09:17:07,654] [INFO] [timer.py:196:stop] epoch=0/micro_step=4880/global_step=4880, RunningAvgSamplesPerSec=5.063114334971396, CurrSamplesPerSec=5.268436237516054, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 09:19:33,593] [INFO] [logging.py:68:log_dist] [Rank 0] step=4890, skipped=8, lr=[2.6444444444444447e-07], mom=[[0.9, 0.999]] [2022-12-20 09:19:33,595] [INFO] [timer.py:196:stop] epoch=0/micro_step=4890/global_step=4890, RunningAvgSamplesPerSec=5.063346113868746, CurrSamplesPerSec=5.148231572465386, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 09:21:59,702] [INFO] [logging.py:68:log_dist] [Rank 0] step=4900, skipped=8, lr=[2.4222222222222224e-07], mom=[[0.9, 0.999]] [2022-12-20 09:21:59,704] [INFO] [timer.py:196:stop] epoch=0/micro_step=4900/global_step=4900, RunningAvgSamplesPerSec=5.063642263622397, CurrSamplesPerSec=5.281941504872331, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2.4222222222222224e-07, 'epoch': 70.0} [2022-12-20 09:24:26,305] [INFO] [logging.py:68:log_dist] [Rank 0] step=4910, skipped=8, lr=[2.2e-07], mom=[[0.9, 0.999]] [2022-12-20 09:24:26,307] [INFO] [timer.py:196:stop] epoch=0/micro_step=4910/global_step=4910, RunningAvgSamplesPerSec=5.063868324555161, CurrSamplesPerSec=5.059670421090867, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 09:26:55,101] [INFO] [logging.py:68:log_dist] [Rank 0] step=4920, skipped=8, lr=[1.9777777777777778e-07], mom=[[0.9, 0.999]] [2022-12-20 09:26:55,103] [INFO] [timer.py:196:stop] epoch=0/micro_step=4920/global_step=4920, RunningAvgSamplesPerSec=5.063894210846909, CurrSamplesPerSec=4.78239546982397, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.866666666666667e-07, 'epoch': 70.36} [2022-12-20 09:29:28,157] [INFO] [logging.py:68:log_dist] [Rank 0] step=4930, skipped=8, lr=[1.7555555555555558e-07], mom=[[0.9, 0.999]] [2022-12-20 09:29:28,158] [INFO] [timer.py:196:stop] epoch=0/micro_step=4930/global_step=4930, RunningAvgSamplesPerSec=5.063701372391198, CurrSamplesPerSec=5.047858548350161, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 09:31:48,059] [INFO] [logging.py:68:log_dist] [Rank 0] step=4940, skipped=8, lr=[1.5333333333333333e-07], mom=[[0.9, 0.999]] [2022-12-20 09:31:48,061] [INFO] [timer.py:196:stop] epoch=0/micro_step=4940/global_step=4940, RunningAvgSamplesPerSec=5.064516563171399, CurrSamplesPerSec=5.5484346820325605, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 09:34:08,499] [INFO] [logging.py:68:log_dist] [Rank 0] step=4950, skipped=8, lr=[1.3111111111111113e-07], mom=[[0.9, 0.999]] [2022-12-20 09:34:08,501] [INFO] [timer.py:196:stop] epoch=0/micro_step=4950/global_step=4950, RunningAvgSamplesPerSec=5.065306002780262, CurrSamplesPerSec=5.450649164948605, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 1.3111111111111113e-07, 'epoch': 70.71} [2022-12-20 09:36:28,150] [INFO] [logging.py:68:log_dist] [Rank 0] step=4960, skipped=8, lr=[1.088888888888889e-07], mom=[[0.9, 0.999]] [2022-12-20 09:36:28,152] [INFO] [timer.py:196:stop] epoch=0/micro_step=4960/global_step=4960, RunningAvgSamplesPerSec=5.066117476082817, CurrSamplesPerSec=5.618326183471552, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 09:38:47,744] [INFO] [logging.py:68:log_dist] [Rank 0] step=4970, skipped=8, lr=[8.666666666666668e-08], mom=[[0.9, 0.999]] [2022-12-20 09:38:47,746] [INFO] [timer.py:196:stop] epoch=0/micro_step=4970/global_step=4970, RunningAvgSamplesPerSec=5.066950118398116, CurrSamplesPerSec=5.674350640600091, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 7.555555555555556e-08, 'epoch': 71.07} [2022-12-20 09:41:10,723] [INFO] [logging.py:68:log_dist] [Rank 0] step=4980, skipped=8, lr=[6.444444444444445e-08], mom=[[0.9, 0.999]] [2022-12-20 09:41:10,724] [INFO] [timer.py:196:stop] epoch=0/micro_step=4980/global_step=4980, RunningAvgSamplesPerSec=5.067469981862319, CurrSamplesPerSec=5.1892294486294315, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 09:43:36,226] [INFO] [logging.py:68:log_dist] [Rank 0] step=4990, skipped=8, lr=[4.222222222222222e-08], mom=[[0.9, 0.999]] [2022-12-20 09:43:36,228] [INFO] [timer.py:196:stop] epoch=0/micro_step=4990/global_step=4990, RunningAvgSamplesPerSec=5.067770417735178, CurrSamplesPerSec=5.121883309677963, MemAllocated=1.52GB, MaxMemAllocated=26.06GB [2022-12-20 09:46:01,735] [INFO] [logging.py:68:log_dist] [Rank 0] step=5000, skipped=8, lr=[2e-08], mom=[[0.9, 0.999]] [2022-12-20 09:46:01,736] [INFO] [timer.py:196:stop] epoch=0/micro_step=5000/global_step=5000, RunningAvgSamplesPerSec=5.068131732607621, CurrSamplesPerSec=5.1918555188372055, MemAllocated=1.52GB, MaxMemAllocated=26.06GB {'loss': 0.0001, 'learning_rate': 2e-08, 'epoch': 71.43} {'eval_loss': 0.475341796875, 'eval_wer': 23.453117563065206, 'eval_runtime': 788.365, 'eval_samples_per_second': 2.876, 'eval_steps_per_second': 0.09, 'epoch': 71.43} [2022-12-20 09:59:13,106] [INFO] [logging.py:68:log_dist] [Rank 0] [Torch] Checkpoint global_step5000 is begin to save! [2022-12-20 09:59:13,118] [INFO] [logging.py:68:log_dist] [Rank 0] Saving model checkpoint: ./checkpoint-5000/global_step5000/mp_rank_00_model_states.pt [2022-12-20 09:59:13,119] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving ./checkpoint-5000/global_step5000/mp_rank_00_model_states.pt... [2022-12-20 09:59:15,961] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved ./checkpoint-5000/global_step5000/mp_rank_00_model_states.pt. [2022-12-20 09:59:15,963] [INFO] [torch_checkpoint_engine.py:15:save] [Torch] Saving ./checkpoint-5000/global_step5000/zero_pp_rank_0_mp_rank_00_optim_states.pt... [2022-12-20 09:59:28,039] [INFO] [torch_checkpoint_engine.py:17:save] [Torch] Saved ./checkpoint-5000/global_step5000/zero_pp_rank_0_mp_rank_00_optim_states.pt. [2022-12-20 09:59:28,048] [INFO] [engine.py:3394:_save_zero_checkpoint] zero checkpoint saved ./checkpoint-5000/global_step5000/zero_pp_rank_0_mp_rank_00_optim_states.pt [2022-12-20 09:59:28,048] [INFO] [torch_checkpoint_engine.py:27:commit] [Torch] Checkpoint global_step5000 is ready now! [2022-12-20 10:02:48,229] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed info: version=0.8.0+a25c31b6, git-hash=a25c31b6, git-branch=master [2022-12-20 10:02:48,322] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False Adam Optimizer #1 is created with AVX2 arithmetic capability. Config: alpha=0.000010, betas=(0.900000, 0.999000), weight_decay=0.000000, adam_w=1 [2022-12-20 10:02:50,625] [INFO] [logging.py:68:log_dist] [Rank 0] Using DeepSpeed Optimizer param name adamw as basic optimizer [2022-12-20 10:02:50,781] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Basic Optimizer = DeepSpeedCPUAdam [2022-12-20 10:02:50,781] [INFO] [utils.py:52:is_zero_supported_optimizer] Checking ZeRO support for optimizer=DeepSpeedCPUAdam type= [2022-12-20 10:02:50,781] [INFO] [logging.py:68:log_dist] [Rank 0] Creating fp16 ZeRO stage 2 optimizer [2022-12-20 10:02:50,781] [INFO] [stage_1_and_2.py:141:__init__] Reduce bucket size 200000000 [2022-12-20 10:02:50,781] [INFO] [stage_1_and_2.py:142:__init__] Allgather bucket size 200000000 [2022-12-20 10:02:50,781] [INFO] [stage_1_and_2.py:143:__init__] CPU Offload: True [2022-12-20 10:02:50,781] [INFO] [stage_1_and_2.py:144:__init__] Round robin gradient partitioning: False Rank: 0 partition count [1] and sizes[(763857920, False)] [2022-12-20 10:02:54,600] [INFO] [utils.py:831:see_memory_usage] Before initializing optimizer states [2022-12-20 10:02:54,601] [INFO] [utils.py:832:see_memory_usage] MA 3.04 GB Max_MA 26.06 GB CA 3.05 GB Max_CA 30 GB [2022-12-20 10:02:54,601] [INFO] [utils.py:840:see_memory_usage] CPU Virtual Memory: used = 268.59 GB, percent = 53.3% [2022-12-20 10:02:57,860] [INFO] [utils.py:831:see_memory_usage] After initializing optimizer states [2022-12-20 10:02:57,861] [INFO] [utils.py:832:see_memory_usage] MA 3.04 GB Max_MA 3.04 GB CA 3.05 GB Max_CA 3 GB [2022-12-20 10:02:57,861] [INFO] [utils.py:840:see_memory_usage] CPU Virtual Memory: used = 278.51 GB, percent = 55.3% [2022-12-20 10:02:57,861] [INFO] [stage_1_and_2.py:527:__init__] optimizer state initialized [2022-12-20 10:02:57,938] [INFO] [utils.py:831:see_memory_usage] After initializing ZeRO optimizer [2022-12-20 10:02:57,939] [INFO] [utils.py:832:see_memory_usage] MA 3.04 GB Max_MA 3.04 GB CA 3.05 GB Max_CA 3 GB [2022-12-20 10:02:57,940] [INFO] [utils.py:840:see_memory_usage] CPU Virtual Memory: used = 278.52 GB, percent = 55.3% [2022-12-20 10:02:57,979] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed Final Optimizer = adamw [2022-12-20 10:02:57,980] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed using configured LR scheduler = WarmupDecayLR [2022-12-20 10:02:57,980] [INFO] [logging.py:68:log_dist] [Rank 0] DeepSpeed LR Scheduler = [2022-12-20 10:02:57,980] [INFO] [logging.py:68:log_dist] [Rank 0] step=0, skipped=0, lr=[1e-05], mom=[[0.9, 0.999]] [2022-12-20 10:02:57,982] [INFO] [config.py:1008:print] DeepSpeedEngine configuration: [2022-12-20 10:02:57,982] [INFO] [config.py:1012:print] activation_checkpointing_config { "partition_activations": false, "contiguous_memory_optimization": false, "cpu_checkpointing": false, "number_checkpoints": null, "synchronize_checkpoint_boundary": false, "profile": false } [2022-12-20 10:02:57,982] [INFO] [config.py:1012:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} [2022-12-20 10:02:57,982] [INFO] [config.py:1012:print] amp_enabled .................. False [2022-12-20 10:02:57,982] [INFO] [config.py:1012:print] amp_params ................... False [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] autotuning_config ............ { "enabled": false, "start_step": null, "end_step": null, "metric_path": null, "arg_mappings": null, "metric": "throughput", "model_info": null, "results_dir": "autotuning_results", "exps_dir": "autotuning_exps", "overwrite": true, "fast": true, "start_profile_step": 3, "end_profile_step": 5, "tuner_type": "gridsearch", "tuner_early_stopping": 5, "tuner_num_trials": 50, "model_info_path": null, "mp_size": 1, "max_train_batch_size": null, "min_train_batch_size": 1, "max_train_micro_batch_size_per_gpu": 1.024000e+03, "min_train_micro_batch_size_per_gpu": 1, "num_tuning_micro_batch_sizes": 3 } [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] bfloat16_enabled ............. False [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] checkpoint_parallel_write_pipeline False [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] checkpoint_tag_validation_enabled True [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] checkpoint_tag_validation_fail False [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] comms_config ................. [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] communication_data_type ...... None [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] curriculum_enabled_legacy .... False [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] curriculum_params_legacy ..... False [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] data_efficiency_enabled ...... False [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] dataloader_drop_last ......... False [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] disable_allgather ............ False [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] dump_state ................... False [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 1000, 'delayed_shift': 2, 'min_scale': 1} [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] eigenvalue_enabled ........... False [2022-12-20 10:02:57,983] [INFO] [config.py:1012:print] eigenvalue_gas_boundary_resolution 1 [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] eigenvalue_layer_name ........ bert.encoder.layer [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] eigenvalue_layer_num ......... 0 [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] eigenvalue_max_iter .......... 100 [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] eigenvalue_stability ......... 1e-06 [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] eigenvalue_tol ............... 0.01 [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] eigenvalue_verbose ........... False [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] elasticity_enabled ........... False [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] flops_profiler_config ........ { "enabled": false, "profile_step": 1, "module_depth": -1, "top_modules": 1, "detailed": true, "output_file": null } [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] fp16_auto_cast ............... False [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] fp16_enabled ................. True [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] fp16_master_weights_and_gradients False [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] global_rank .................. 0 [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] grad_accum_dtype ............. None [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] gradient_accumulation_steps .. 1 [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] gradient_clipping ............ 1.0 [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] gradient_predivide_factor .... 1.0 [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] initial_dynamic_scale ........ 65536 [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] load_universal_checkpoint .... False [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] loss_scale ................... 0 [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] memory_breakdown ............. False [2022-12-20 10:02:57,984] [INFO] [config.py:1012:print] monitor_config ............... [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] nebula_config ................ { "enabled": false, "persistent_storage_path": null, "persistent_time_interval": 100, "num_of_version_in_retention": 2, "enable_nebula_load": true, "load_path": null } [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] optimizer_legacy_fusion ...... False [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] optimizer_name ............... adamw [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] optimizer_params ............. {'lr': 1e-05, 'betas': [0.9, 0.999], 'eps': 1e-08, 'weight_decay': 0.0} [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] pld_enabled .................. False [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] pld_params ................... False [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] prescale_gradients ........... False [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] scheduler_name ............... WarmupDecayLR [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] scheduler_params ............. {'last_batch_iteration': -1, 'total_num_steps': 5000, 'warmup_min_lr': 0, 'warmup_max_lr': 1e-05, 'warmup_num_steps': 500} [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] sparse_attention ............. None [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] sparse_gradients_enabled ..... False [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] steps_per_print .............. 10 [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] train_batch_size ............. 64 [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] train_micro_batch_size_per_gpu 64 [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] use_node_local_storage ....... False [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] wall_clock_breakdown ......... False [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] world_size ................... 1 [2022-12-20 10:02:57,985] [INFO] [config.py:1012:print] zero_allow_untested_optimizer False [2022-12-20 10:02:57,986] [INFO] [config.py:1012:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=200000000 allgather_partitions=True allgather_bucket_size=200000000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=None offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='cpu', nvme_path=None, buffer_count=4, pin_memory=True, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=50,000,000 param_persistence_threshold=100,000 model_persistence_threshold=sys.maxsize max_live_parameters=1,000,000,000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False [2022-12-20 10:02:57,986] [INFO] [config.py:1012:print] zero_enabled ................. True [2022-12-20 10:02:57,986] [INFO] [config.py:1012:print] zero_optimization_stage ...... 2 [2022-12-20 10:02:57,986] [INFO] [config.py:997:print_user_config] json = { "fp16": { "enabled": true, "loss_scale": 0, "loss_scale_window": 1000, "initial_scale_power": 16, "hysteresis": 2, "min_loss_scale": 1 }, "optimizer": { "type": "AdamW", "params": { "lr": 1e-05, "betas": [0.9, 0.999], "eps": 1e-08, "weight_decay": 0.0 } }, "scheduler": { "type": "WarmupDecayLR", "params": { "last_batch_iteration": -1, "total_num_steps": 5.000000e+03, "warmup_min_lr": 0, "warmup_max_lr": 1e-05, "warmup_num_steps": 500 } }, "zero_optimization": { "stage": 2, "offload_optimizer": { "device": "cpu", "pin_memory": true }, "allgather_partitions": true, "allgather_bucket_size": 2.000000e+08, "overlap_comm": true, "reduce_scatter": true, "reduce_bucket_size": 2.000000e+08, "contiguous_gradients": true }, "gradient_accumulation_steps": 1, "gradient_clipping": 1.0, "train_batch_size": 64, "train_micro_batch_size_per_gpu": 64 } [2022-12-20 10:02:57,988] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from ./checkpoint-1000/global_step1000/mp_rank_00_model_states.pt... [2022-12-20 10:03:01,242] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from ./checkpoint-1000/global_step1000/mp_rank_00_model_states.pt. [2022-12-20 10:03:01,317] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from ./checkpoint-1000/global_step1000/mp_rank_00_model_states.pt... [2022-12-20 10:03:02,262] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from ./checkpoint-1000/global_step1000/mp_rank_00_model_states.pt. [2022-12-20 10:03:02,628] [INFO] [torch_checkpoint_engine.py:21:load] [Torch] Loading checkpoint from ./checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_optim_states.pt... [2022-12-20 10:03:22,658] [INFO] [torch_checkpoint_engine.py:23:load] [Torch] Loaded checkpoint from ./checkpoint-1000/global_step1000/zero_pp_rank_0_mp_rank_00_optim_states.pt. [2022-12-20 10:03:22,658] [INFO] [engine.py:3020:_get_all_zero_checkpoint_state_dicts] successfully read 1 ZeRO state_dicts for rank 0 [2022-12-20 10:03:23,547] [INFO] [engine.py:2960:_load_zero_checkpoint] loading 1 zero partition checkpoints for rank 0 {'train_runtime': 79780.349, 'train_samples_per_second': 4.011, 'train_steps_per_second': 0.063, 'train_loss': 0.011088945543766021, 'epoch': 71.43}