[2023-04-14 06:57:51,009] [WARNING] [runner.py:190:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. [2023-04-14 06:57:51,024] [INFO] [runner.py:540:main] cmd = /usr/local/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMF19 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None main.py --model_name_or_path facebook/opt-1.3b --gradient_accumulation_steps 2 --lora_dim 128 --zero_stage 0 --deepspeed --output_dir /output/DeepSpeedExamples/applications/DeepSpeed-Chat/output/actor-models/1.3b [2023-04-14 06:57:54,173] [INFO] [launch.py:222:main] 0 NV_LIBNCCL_DEV_PACKAGE=libnccl-dev=2.13.4-1+cuda11.7 [2023-04-14 06:57:54,173] [INFO] [launch.py:222:main] 0 NCCL_VERSION=2.13.4-1 [2023-04-14 06:57:54,173] [INFO] [launch.py:222:main] 0 NV_LIBNCCL_PACKAGE_VERSION=2.13.4-1 [2023-04-14 06:57:54,173] [INFO] [launch.py:222:main] 0 NV_LIBNCCL_PACKAGE=libnccl2=2.13.4-1+cuda11.7 [2023-04-14 06:57:54,173] [INFO] [launch.py:222:main] 0 NV_LIBNCCL_DEV_PACKAGE_NAME=libnccl-dev [2023-04-14 06:57:54,173] [INFO] [launch.py:222:main] 0 NV_LIBNCCL_PACKAGE_NAME=libnccl2 [2023-04-14 06:57:54,173] [INFO] [launch.py:222:main] 0 NV_LIBNCCL_DEV_PACKAGE_VERSION=2.13.4-1 [2023-04-14 06:57:54,173] [INFO] [launch.py:229:main] WORLD INFO DICT: {'localhost': [0]} [2023-04-14 06:57:54,173] [INFO] [launch.py:235:main] nnodes=1, num_local_procs=1, node_rank=0 [2023-04-14 06:57:54,173] [INFO] [launch.py:246:main] global_rank_mapping=defaultdict(, {'localhost': [0]}) [2023-04-14 06:57:54,173] [INFO] [launch.py:247:main] dist_world_size=1 [2023-04-14 06:57:54,173] [INFO] [launch.py:249:main] Setting CUDA_VISIBLE_DEVICES=0 [2023-04-14 06:57:58,372] [INFO] [comm.py:586:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl Downloading (…)okenizer_config.json: 0%| | 0.00/685 [00:00 [2023-04-14 07:00:29,718] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[0.001, 0.001], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:00:29,718] [INFO] [config.py:953:print] DeepSpeedEngine configuration: [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] activation_checkpointing_config { "partition_activations": false, "contiguous_memory_optimization": false, "cpu_checkpointing": false, "number_checkpoints": null, "synchronize_checkpoint_boundary": false, "profile": false } [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] amp_enabled .................. False [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] amp_params ................... False [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] autotuning_config ............ { "enabled": false, "start_step": null, "end_step": null, "metric_path": null, "arg_mappings": null, "metric": "throughput", "model_info": null, "results_dir": "autotuning_results", "exps_dir": "autotuning_exps", "overwrite": true, "fast": true, "start_profile_step": 3, "end_profile_step": 5, "tuner_type": "gridsearch", "tuner_early_stopping": 5, "tuner_num_trials": 50, "model_info_path": null, "mp_size": 1, "max_train_batch_size": null, "min_train_batch_size": 1, "max_train_micro_batch_size_per_gpu": 1.024000e+03, "min_train_micro_batch_size_per_gpu": 1, "num_tuning_micro_batch_sizes": 3 } [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] bfloat16_enabled ............. 
False [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] checkpoint_parallel_write_pipeline False [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] checkpoint_tag_validation_enabled True [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] checkpoint_tag_validation_fail False [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] comms_config ................. [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] communication_data_type ...... None [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] curriculum_enabled_legacy .... False [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] curriculum_params_legacy ..... False [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] data_efficiency_enabled ...... False [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] dataloader_drop_last ......... False [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] disable_allgather ............ False [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] dump_state ................... False [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 100, 'delayed_shift': 2, 'min_scale': 1} [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] eigenvalue_enabled ........... False [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] eigenvalue_gas_boundary_resolution 1 [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] eigenvalue_layer_name ........ bert.encoder.layer [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] eigenvalue_layer_num ......... 0 [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] eigenvalue_max_iter .......... 100 [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] eigenvalue_stability ......... 1e-06 [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] eigenvalue_tol ............... 0.01 [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] eigenvalue_verbose ........... False [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] elasticity_enabled ........... 
False [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] flops_profiler_config ........ { "enabled": false, "profile_step": 1, "module_depth": -1, "top_modules": 1, "detailed": true, "output_file": null } [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] fp16_auto_cast ............... False [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] fp16_enabled ................. True [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] fp16_master_weights_and_gradients False [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] global_rank .................. 0 [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] grad_accum_dtype ............. None [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] gradient_accumulation_steps .. 2 [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] gradient_clipping ............ 1.0 [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] gradient_predivide_factor .... 1.0 [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] initial_dynamic_scale ........ 65536 [2023-04-14 07:00:29,719] [INFO] [config.py:957:print] load_universal_checkpoint .... False [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] loss_scale ................... 0 [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] memory_breakdown ............. False [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] nebula_config ................ { "enabled": false, "persistent_storage_path": null, "persistent_time_interval": 100, "num_of_version_in_retention": 2, "enable_nebula_load": true, "load_path": null } [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] optimizer_legacy_fusion ...... False [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] optimizer_name ............... None [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] optimizer_params ............. None [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] pld_enabled .................. False [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] pld_params ................... False [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] prescale_gradients ........... False [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] scheduler_name ............... None [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] scheduler_params ............. None [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] sparse_attention ............. None [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] sparse_gradients_enabled ..... False [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] steps_per_print .............. 10 [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] train_batch_size ............. 
32 [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] train_micro_batch_size_per_gpu 16 [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] use_node_local_storage ....... False [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] wall_clock_breakdown ......... False [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] world_size ................... 1 [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] zero_allow_untested_optimizer False [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=30000000 param_persistence_threshold=10000 model_persistence_threshold=sys.maxsize max_live_parameters=30000000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False memory_efficient_linear=False [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] zero_enabled ................. False [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] zero_force_ds_cpu_optimizer .. True [2023-04-14 07:00:29,720] [INFO] [config.py:957:print] zero_optimization_stage ...... 0 [2023-04-14 07:00:29,720] [INFO] [config.py:943:print_user_config] json = { "train_batch_size": 32, "train_micro_batch_size_per_gpu": 16, "steps_per_print": 10, "zero_optimization": { "stage": 0, "offload_param": { "device": "none" }, "offload_optimizer": { "device": "none" }, "stage3_param_persistence_threshold": 1.000000e+04, "stage3_max_live_parameters": 3.000000e+07, "stage3_prefetch_bucket_size": 3.000000e+07, "memory_efficient_linear": false }, "fp16": { "enabled": true, "loss_scale_window": 100 }, "gradient_clipping": 1.0, "prescale_gradients": false, "wall_clock_breakdown": false, "hybrid_engine": { "enabled": false, "inference_tp_size": 1, "release_inference_cache": false, "pin_parameters": true, "tp_gather_partition_size": 8 } } Using /root/.cache/torch_extensions/py38_cu117 as PyTorch extensions root... Creating extension directory /root/.cache/torch_extensions/py38_cu117/utils... huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... 
To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) Emitting ninja build file /root/.cache/torch_extensions/py38_cu117/utils/build.ninja... Building extension module utils... Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... To disable this warning, you can either: - Avoid using `tokenizers` before the fork if possible - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) [1/2] c++ -MMD -MF flatten_unflatten.o.d -DTORCH_EXTENSION_NAME=utils -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /usr/local/lib/python3.8/site-packages/torch/include -isystem /usr/local/lib/python3.8/site-packages/torch/include/torch/csrc/api/include -isystem /usr/local/lib/python3.8/site-packages/torch/include/TH -isystem /usr/local/lib/python3.8/site-packages/torch/include/THC -isystem /usr/local/include/python3.8 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -c /usr/local/lib/python3.8/site-packages/deepspeed/ops/csrc/utils/flatten_unflatten.cpp -o flatten_unflatten.o [2/2] c++ flatten_unflatten.o -shared -L/usr/local/lib/python3.8/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o utils.so Loading extension module utils... Time to load utils op: 12.565346956253052 seconds ***** Running training ***** ***** Evaluating perplexity, Epoch 0/1 ***** ppl: 4841.33251953125 Beginning of Epoch 1/1, Total Micro Batches 2860 [2023-04-14 07:01:26,912] [INFO] [fused_optimizer.py:362:_update_scale] Grad overflow on iteration 0 [2023-04-14 07:01:26,912] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0 [2023-04-14 07:01:26,912] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 65536, reducing to 32768.0 [2023-04-14 07:01:28,022] [INFO] [fused_optimizer.py:362:_update_scale] Grad overflow on iteration 1 [2023-04-14 07:01:28,023] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0 [2023-04-14 07:01:28,023] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 32768.0, reducing to 16384.0 [2023-04-14 07:01:29,132] [INFO] [fused_optimizer.py:362:_update_scale] Grad overflow on iteration 2 [2023-04-14 07:01:29,133] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0 [2023-04-14 07:01:29,133] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 16384.0, reducing to 8192.0 [2023-04-14 07:01:30,237] [INFO] [fused_optimizer.py:362:_update_scale] Grad overflow on iteration 3 [2023-04-14 07:01:30,238] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0 [2023-04-14 07:01:30,238] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 8192.0, reducing to 4096.0 [2023-04-14 07:01:37,134] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=4, lr=[0.0009999565625930518, 0.0009999565625930518], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:01:37,138] [INFO] [timer.py:199:stop] epoch=0/micro_step=20/global_step=10, RunningAvgSamplesPerSec=28.141096377600697, CurrSamplesPerSec=28.12801123295279, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:01:48,572] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=4, lr=[0.000999691139103864, 0.000999691139103864], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:01:48,576] [INFO] [timer.py:199:stop] epoch=0/micro_step=40/global_step=20, RunningAvgSamplesPerSec=28.078686785579933, CurrSamplesPerSec=28.071358760840397, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:02:00,002] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=4, lr=[0.0009991845519630679, 0.0009991845519630679], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:02:00,006] [INFO] [timer.py:199:stop] epoch=0/micro_step=60/global_step=30, RunningAvgSamplesPerSec=28.067548117432807, CurrSamplesPerSec=28.11579075481346, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:02:11,418] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=4, lr=[0.0009984370456625003, 0.0009984370456625003], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:02:11,423] [INFO] [timer.py:199:stop] epoch=0/micro_step=80/global_step=40, RunningAvgSamplesPerSec=28.073548066802058, CurrSamplesPerSec=28.17129833011603, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:02:22,838] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=4, lr=[0.0009974489809677126, 0.0009974489809677126], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:02:22,842] [INFO] [timer.py:199:stop] epoch=0/micro_step=100/global_step=50, RunningAvgSamplesPerSec=28.073460515306376, CurrSamplesPerSec=28.103057175257835, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:02:34,274] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=4, lr=[0.0009962208347438538, 0.0009962208347438538], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:02:34,278] [INFO] [timer.py:199:stop] epoch=0/micro_step=120/global_step=60, RunningAvgSamplesPerSec=28.06668333658504, CurrSamplesPerSec=27.944933682231035, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:02:45,732] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=4, lr=[0.0009947531997255255, 0.0009947531997255255], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:02:45,734] [INFO] [timer.py:199:stop] epoch=0/micro_step=140/global_step=70, RunningAvgSamplesPerSec=28.054937933989542, CurrSamplesPerSec=27.874315487266735, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:02:57,224] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=4, lr=[0.0009930467842307117, 0.0009930467842307117], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:02:57,227] [INFO] [timer.py:199:stop] epoch=0/micro_step=160/global_step=80, RunningAvgSamplesPerSec=28.03502976436525, CurrSamplesPerSec=27.943706074371658, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:03:08,725] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=4, lr=[0.0009911024118189266, 0.0009911024118189266], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:03:08,728] [INFO] [timer.py:199:stop] epoch=0/micro_step=180/global_step=90, RunningAvgSamplesPerSec=28.01790135317418, CurrSamplesPerSec=27.889363254819838, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:03:20,199] 
[INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=4, lr=[0.0009889210208937447, 0.0009889210208937447], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:03:20,202] [INFO] [timer.py:199:stop] epoch=0/micro_step=200/global_step=100, RunningAvgSamplesPerSec=28.01027508003987, CurrSamplesPerSec=28.008906574044975, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:03:25,943] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations [2023-04-14 07:03:25,943] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0 [2023-04-14 07:03:31,714] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=4, lr=[0.000986503664249902, 0.000986503664249902], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:03:31,717] [INFO] [timer.py:199:stop] epoch=0/micro_step=220/global_step=110, RunningAvgSamplesPerSec=27.994640314629194, CurrSamplesPerSec=27.73325619189209, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:03:43,229] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=4, lr=[0.000983851508565192, 0.000983851508565192], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:03:43,232] [INFO] [timer.py:199:stop] epoch=0/micro_step=240/global_step=120, RunningAvgSamplesPerSec=27.98217971508587, CurrSamplesPerSec=27.775588813338135, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:03:54,753] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=4, lr=[0.0009809658338373964, 0.0009809658338373964], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:03:54,756] [INFO] [timer.py:199:stop] epoch=0/micro_step=260/global_step=130, RunningAvgSamplesPerSec=27.969826533526934, CurrSamplesPerSec=27.816915092931872, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:04:06,264] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=4, lr=[0.0009778480327665255, 0.0009778480327665255], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:04:06,267] [INFO] [timer.py:199:stop] epoch=0/micro_step=280/global_step=140, RunningAvgSamplesPerSec=27.961798295095445, CurrSamplesPerSec=27.948134115000336, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:04:17,780] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=4, lr=[0.0009744996100826668, 0.0009744996100826668], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:04:17,783] [INFO] [timer.py:199:stop] epoch=0/micro_step=300/global_step=150, RunningAvgSamplesPerSec=27.953563434154024, CurrSamplesPerSec=27.900500251008975, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:04:29,304] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=4, lr=[0.0009709221818197624, 0.0009709221818197624], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:04:29,307] [INFO] [timer.py:199:stop] epoch=0/micro_step=320/global_step=160, RunningAvgSamplesPerSec=27.94473140562776, CurrSamplesPerSec=27.869627240021313, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:04:40,823] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=4, lr=[0.0009671174745356714, 0.0009671174745356714], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:04:40,826] [INFO] [timer.py:199:stop] epoch=0/micro_step=340/global_step=170, RunningAvgSamplesPerSec=27.938436426576004, CurrSamplesPerSec=27.914235169869585, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:04:52,287] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=4, lr=[0.0009630873244788883, 0.0009630873244788883], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:04:52,291] [INFO] [timer.py:199:stop] 
epoch=0/micro_step=360/global_step=180, RunningAvgSamplesPerSec=27.93923052396758, CurrSamplesPerSec=28.012267836208732, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:05:03,761] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=4, lr=[0.0009588336767023232, 0.0009588336767023232], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:05:03,765] [INFO] [timer.py:199:stop] epoch=0/micro_step=380/global_step=190, RunningAvgSamplesPerSec=27.939053853941257, CurrSamplesPerSec=27.900796044680277, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:05:15,227] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=4, lr=[0.0009543585841245694, 0.0009543585841245694], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:05:15,231] [INFO] [timer.py:199:stop] epoch=0/micro_step=400/global_step=200, RunningAvgSamplesPerSec=27.940316446938077, CurrSamplesPerSec=27.99571484352838, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:05:20,945] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations [2023-04-14 07:05:20,945] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8192.0 to 16384.0 [2023-04-14 07:05:26,700] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=4, lr=[0.0009496642065391134, 0.0009496642065391134], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:05:26,703] [INFO] [timer.py:199:stop] epoch=0/micro_step=420/global_step=210, RunningAvgSamplesPerSec=27.9406361563072, CurrSamplesPerSec=27.793493868223226, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:05:38,150] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=4, lr=[0.0009447528095719625, 0.0009447528095719625], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:05:38,154] [INFO] [timer.py:199:stop] epoch=0/micro_step=440/global_step=220, RunningAvgSamplesPerSec=27.94290479002408, CurrSamplesPerSec=28.06016123471817, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:05:49,608] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=4, lr=[0.0009396267635881972, 0.0009396267635881972], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:05:49,612] [INFO] [timer.py:199:stop] epoch=0/micro_step=460/global_step=230, RunningAvgSamplesPerSec=27.943999646258902, CurrSamplesPerSec=27.976067155370266, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:06:01,058] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=4, lr=[0.0009342885425479722, 0.0009342885425479722], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:06:01,062] [INFO] [timer.py:199:stop] epoch=0/micro_step=480/global_step=240, RunningAvgSamplesPerSec=27.945878308209277, CurrSamplesPerSec=27.97900642619156, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:06:12,521] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=4, lr=[0.0009287407228125202, 0.0009287407228125202], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:06:12,525] [INFO] [timer.py:199:stop] epoch=0/micro_step=500/global_step=250, RunningAvgSamplesPerSec=27.94650874790295, CurrSamplesPerSec=27.93969237507434, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:06:23,989] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=4, lr=[0.0009229859819007346, 0.0009229859819007346], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:06:23,994] [INFO] [timer.py:199:stop] epoch=0/micro_step=520/global_step=260, RunningAvgSamplesPerSec=27.946762620632057, CurrSamplesPerSec=27.852716360251165, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:06:35,446] [INFO] 
[logging.py:96:log_dist] [Rank 0] step=270, skipped=4, lr=[0.0009170270971969311, 0.0009170270971969311], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:06:35,450] [INFO] [timer.py:199:stop] epoch=0/micro_step=540/global_step=270, RunningAvgSamplesPerSec=27.947756017554656, CurrSamplesPerSec=27.903829741761506, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:06:46,919] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=4, lr=[0.0009108669446104109, 0.0009108669446104109], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:06:46,923] [INFO] [timer.py:199:stop] epoch=0/micro_step=560/global_step=280, RunningAvgSamplesPerSec=27.947241583138933, CurrSamplesPerSec=27.948896507994682, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:06:58,387] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=4, lr=[0.0009045084971874737, 0.0009045084971874737], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:06:58,391] [INFO] [timer.py:199:stop] epoch=0/micro_step=580/global_step=290, RunningAvgSamplesPerSec=27.94705445274327, CurrSamplesPerSec=27.954659449124346, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:07:09,856] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=4, lr=[0.0008979548236765506, 0.0008979548236765506], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:07:09,860] [INFO] [timer.py:199:stop] epoch=0/micro_step=600/global_step=300, RunningAvgSamplesPerSec=27.94713926621857, CurrSamplesPerSec=27.96594772558736, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:07:15,588] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations [2023-04-14 07:07:15,588] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16384.0 to 32768.0 [2023-04-14 07:07:21,358] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=4, lr=[0.0008912090870471478, 0.0008912090870471478], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:07:21,362] [INFO] [timer.py:199:stop] epoch=0/micro_step=620/global_step=310, RunningAvgSamplesPerSec=27.944936270242373, CurrSamplesPerSec=27.91026475482345, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:07:32,837] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=4, lr=[0.0008842745429633161, 0.0008842745429633161], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:07:32,841] [INFO] [timer.py:199:stop] epoch=0/micro_step=640/global_step=320, RunningAvgSamplesPerSec=27.944254036885653, CurrSamplesPerSec=27.86717377393364, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:07:44,321] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=4, lr=[0.0008771545382123862, 0.0008771545382123862], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:07:44,325] [INFO] [timer.py:199:stop] epoch=0/micro_step=660/global_step=330, RunningAvgSamplesPerSec=27.94347994504524, CurrSamplesPerSec=27.898429870906778, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:07:55,801] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=4, lr=[0.0008698525090897231, 0.0008698525090897231], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:07:55,806] [INFO] [timer.py:199:stop] epoch=0/micro_step=680/global_step=340, RunningAvgSamplesPerSec=27.94270471004713, CurrSamplesPerSec=27.936912546325544, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:08:07,325] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=4, lr=[0.0008623719797402826, 0.0008623719797402826], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:08:07,327] [INFO] [timer.py:199:stop] 
epoch=0/micro_step=700/global_step=350, RunningAvgSamplesPerSec=27.93934291085674, CurrSamplesPerSec=27.81048847374359, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:08:18,851] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=4, lr=[0.0008547165604577695, 0.0008547165604577695], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:08:18,854] [INFO] [timer.py:199:stop] epoch=0/micro_step=720/global_step=360, RunningAvgSamplesPerSec=27.93575047924138, CurrSamplesPerSec=27.804871217472908, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:08:30,744] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=4, lr=[0.0008468899459422181, 0.0008468899459422181], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:08:30,749] [INFO] [timer.py:199:stop] epoch=0/micro_step=740/global_step=370, RunningAvgSamplesPerSec=27.90790507995182, CurrSamplesPerSec=27.907189040202667, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:08:42,219] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=4, lr=[0.0008388959135168359, 0.0008388959135168359], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:08:42,223] [INFO] [timer.py:199:stop] epoch=0/micro_step=760/global_step=380, RunningAvgSamplesPerSec=27.908444913494918, CurrSamplesPerSec=27.887045373418886, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:08:53,681] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=4, lr=[0.0008307383213049714, 0.0008307383213049714], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:08:53,684] [INFO] [timer.py:199:stop] epoch=0/micro_step=780/global_step=390, RunningAvgSamplesPerSec=27.90980129021459, CurrSamplesPerSec=27.966454688659038, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:09:05,184] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=4, lr=[0.0008224211063680853, 0.0008224211063680853], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:09:05,187] [INFO] [timer.py:199:stop] epoch=0/micro_step=800/global_step=400, RunningAvgSamplesPerSec=27.90872959774225, CurrSamplesPerSec=27.84752691848682, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:09:10,935] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations [2023-04-14 07:09:10,935] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32768.0 to 65536.0 [2023-04-14 07:09:16,717] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=4, lr=[0.0008139482828056254, 0.0008139482828056254], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:09:16,720] [INFO] [timer.py:199:stop] epoch=0/micro_step=820/global_step=410, RunningAvgSamplesPerSec=27.90608462810118, CurrSamplesPerSec=27.829552200369534, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:09:28,236] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=4, lr=[0.0008053239398177191, 0.0008053239398177191], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:09:28,239] [INFO] [timer.py:199:stop] epoch=0/micro_step=840/global_step=420, RunningAvgSamplesPerSec=27.904361435920347, CurrSamplesPerSec=27.818385275198793, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:09:39,767] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=4, lr=[0.0007965522397316221, 0.0007965522397316221], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:09:39,770] [INFO] [timer.py:199:stop] epoch=0/micro_step=860/global_step=430, RunningAvgSamplesPerSec=27.90219677277752, CurrSamplesPerSec=27.88143190257329, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:09:51,289] [INFO] 
[logging.py:96:log_dist] [Rank 0] step=440, skipped=4, lr=[0.000787637415992873, 0.000787637415992873], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:09:51,292] [INFO] [timer.py:199:stop] epoch=0/micro_step=880/global_step=440, RunningAvgSamplesPerSec=27.900581766447058, CurrSamplesPerSec=27.8346656745396, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:10:02,815] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=4, lr=[0.000778583771122125, 0.000778583771122125], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:10:02,818] [INFO] [timer.py:199:stop] epoch=0/micro_step=900/global_step=450, RunningAvgSamplesPerSec=27.89877176742672, CurrSamplesPerSec=27.79556597095809, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:10:14,348] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=4, lr=[0.0007693956746386408, 0.0007693956746386408], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:10:14,351] [INFO] [timer.py:199:stop] epoch=0/micro_step=920/global_step=460, RunningAvgSamplesPerSec=27.896732463439644, CurrSamplesPerSec=27.784754146720147, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:10:25,872] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=4, lr=[0.0007600775609514493, 0.0007600775609514493], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:10:25,875] [INFO] [timer.py:199:stop] epoch=0/micro_step=940/global_step=470, RunningAvgSamplesPerSec=27.89516590035489, CurrSamplesPerSec=27.84345993311814, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:10:37,388] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=4, lr=[0.0007506339272191898, 0.0007506339272191898], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:10:37,391] [INFO] [timer.py:199:stop] epoch=0/micro_step=960/global_step=480, RunningAvgSamplesPerSec=27.894055137267724, CurrSamplesPerSec=27.834573315230116, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:10:48,874] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=4, lr=[0.0007410693311796666, 0.0007410693311796666], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:10:48,878] [INFO] [timer.py:199:stop] epoch=0/micro_step=980/global_step=490, RunningAvgSamplesPerSec=27.894183330433044, CurrSamplesPerSec=27.907345711163856, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:11:00,353] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=4, lr=[0.0007313883889501701, 0.0007313883889501701], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:11:00,357] [INFO] [timer.py:199:stop] epoch=0/micro_step=1000/global_step=500, RunningAvgSamplesPerSec=27.894589838673543, CurrSamplesPerSec=27.908755828978105, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:11:06,083] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations [2023-04-14 07:11:06,083] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 65536.0 to 131072.0 [2023-04-14 07:11:07,202] [INFO] [fused_optimizer.py:362:_update_scale] Grad overflow on iteration 505 [2023-04-14 07:11:07,202] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 131072.0 to 65536.0 [2023-04-14 07:11:07,202] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 131072.0, reducing to 65536.0 [2023-04-14 07:11:11,809] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=5, lr=[0.0007225799254574904, 0.0007225799254574904], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:11:11,812] [INFO] [timer.py:199:stop] epoch=0/micro_step=1020/global_step=510, RunningAvgSamplesPerSec=27.89634129666697, CurrSamplesPerSec=27.8067088187211, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:11:23,295] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=5, lr=[0.0007126908421605375, 0.0007126908421605375], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:11:23,299] [INFO] [timer.py:199:stop] epoch=0/micro_step=1040/global_step=520, RunningAvgSamplesPerSec=27.89659063794343, CurrSamplesPerSec=27.815104964319715, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:11:34,808] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=5, lr=[0.0007026991088541184, 0.0007026991088541184], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:11:34,811] [INFO] [timer.py:199:stop] epoch=0/micro_step=1060/global_step=530, RunningAvgSamplesPerSec=27.895834937739032, CurrSamplesPerSec=27.752305959568453, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:11:46,336] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=5, lr=[0.0006926095478028312, 0.0006926095478028312], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:11:46,339] [INFO] [timer.py:199:stop] epoch=0/micro_step=1080/global_step=540, RunningAvgSamplesPerSec=27.894509973359995, CurrSamplesPerSec=27.821614456207993, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:11:57,868] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=5, lr=[0.0006824270284854318, 0.0006824270284854318], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:11:57,871] [INFO] [timer.py:199:stop] epoch=0/micro_step=1100/global_step=550, RunningAvgSamplesPerSec=27.89287517733588, CurrSamplesPerSec=27.847723365469484, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:12:09,396] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=5, lr=[0.0006721564652446986, 0.0006721564652446986], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:12:09,399] [INFO] [timer.py:199:stop] epoch=0/micro_step=1120/global_step=560, RunningAvgSamplesPerSec=27.891494332148227, CurrSamplesPerSec=27.835600847069383, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:12:20,889] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=5, lr=[0.0006618028149156478, 0.0006618028149156478], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:12:20,893] [INFO] [timer.py:199:stop] epoch=0/micro_step=1140/global_step=570, RunningAvgSamplesPerSec=27.891413431856982, CurrSamplesPerSec=27.95477589666847, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:12:32,373] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=5, lr=[0.000651371074433236, 0.000651371074433236], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:12:32,378] [INFO] [timer.py:199:stop] epoch=0/micro_step=1160/global_step=580, RunningAvgSamplesPerSec=27.891824691179327, CurrSamplesPerSec=27.922760285015862, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:12:43,854] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=5, lr=[0.0006408662784207149, 0.0006408662784207149], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:12:43,858] [INFO] [timer.py:199:stop] epoch=0/micro_step=1180/global_step=590, RunningAvgSamplesPerSec=27.892302295009106, CurrSamplesPerSec=27.9054657757225, MemAllocated=8.15GB, 
MaxMemAllocated=44.21GB [2023-04-14 07:12:55,346] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=5, lr=[0.0006302934967597922, 0.0006302934967597922], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:12:55,350] [INFO] [timer.py:199:stop] epoch=0/micro_step=1200/global_step=600, RunningAvgSamplesPerSec=27.8923588635317, CurrSamplesPerSec=27.87588437646747, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:13:03,375] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations [2023-04-14 07:13:03,375] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 65536.0 to 131072.0 [2023-04-14 07:13:04,497] [INFO] [fused_optimizer.py:362:_update_scale] Grad overflow on iteration 607 [2023-04-14 07:13:04,497] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 131072.0 to 65536.0 [2023-04-14 07:13:04,497] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 131072.0, reducing to 65536.0 [2023-04-14 07:13:06,800] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=6, lr=[0.0006207240822732765, 0.0006207240822732765], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:13:06,803] [INFO] [timer.py:199:stop] epoch=0/micro_step=1220/global_step=610, RunningAvgSamplesPerSec=27.894034412979668, CurrSamplesPerSec=27.77138191552118, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:13:18,324] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=6, lr=[0.0006100362109349642, 0.0006100362109349642], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:13:18,327] [INFO] [timer.py:199:stop] epoch=0/micro_step=1240/global_step=620, RunningAvgSamplesPerSec=27.89279294451914, CurrSamplesPerSec=27.905912527559167, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:13:29,816] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=6, lr=[0.0005992952333228728, 0.0005992952333228728], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:13:29,820] [INFO] [timer.py:199:stop] epoch=0/micro_step=1260/global_step=630, RunningAvgSamplesPerSec=27.89272171060291, CurrSamplesPerSec=27.96218977944038, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:13:41,310] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=6, lr=[0.0005885063333059565, 0.0005885063333059565], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:13:41,315] [INFO] [timer.py:199:stop] epoch=0/micro_step=1280/global_step=640, RunningAvgSamplesPerSec=27.89272318651298, CurrSamplesPerSec=27.81198102901513, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:13:52,810] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=6, lr=[0.0005776747178817414, 0.0005776747178817414], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:13:52,814] [INFO] [timer.py:199:stop] epoch=0/micro_step=1300/global_step=650, RunningAvgSamplesPerSec=27.89249637831583, CurrSamplesPerSec=27.907229658431103, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:14:04,303] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=6, lr=[0.0005668056146632947, 0.0005668056146632947], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:14:04,307] [INFO] [timer.py:199:stop] epoch=0/micro_step=1320/global_step=660, RunningAvgSamplesPerSec=27.892518411874637, CurrSamplesPerSec=27.875195432893204, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:14:15,778] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=6, lr=[0.0005559042693562469, 0.0005559042693562469], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 
07:14:15,782] [INFO] [timer.py:199:stop] epoch=0/micro_step=1340/global_step=670, RunningAvgSamplesPerSec=27.893138504741042, CurrSamplesPerSec=27.95382687744928, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:14:27,256] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=6, lr=[0.0005449759432270804, 0.0005449759432270804], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:14:27,260] [INFO] [timer.py:199:stop] epoch=0/micro_step=1360/global_step=680, RunningAvgSamplesPerSec=27.893536617667916, CurrSamplesPerSec=27.96974165373949, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:14:38,770] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=6, lr=[0.0005340259105639084, 0.0005340259105639084], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:14:38,773] [INFO] [timer.py:199:stop] epoch=0/micro_step=1380/global_step=690, RunningAvgSamplesPerSec=27.892900012846674, CurrSamplesPerSec=27.69449210326329, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:14:50,296] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=6, lr=[0.0005230594561309696, 0.0005230594561309696], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:14:50,298] [INFO] [timer.py:199:stop] epoch=0/micro_step=1400/global_step=700, RunningAvgSamplesPerSec=27.891842261813647, CurrSamplesPerSec=27.820172764912243, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:15:00,667] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations [2023-04-14 07:15:00,668] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 65536.0 to 131072.0 [2023-04-14 07:15:01,790] [INFO] [fused_optimizer.py:362:_update_scale] Grad overflow on iteration 709 [2023-04-14 07:15:01,790] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 131072.0 to 65536.0 [2023-04-14 07:15:01,790] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 131072.0, reducing to 65536.0 [2023-04-14 07:15:01,791] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=7, lr=[0.0005131799808136933, 0.0005131799808136933], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:15:01,791] [INFO] [timer.py:199:stop] epoch=0/micro_step=1420/global_step=710, RunningAvgSamplesPerSec=27.891948204255108, CurrSamplesPerSec=28.95740738143756, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:15:13,345] [INFO] [logging.py:96:log_dist] [Rank 0] step=720, skipped=7, lr=[0.000502196910870706, 0.000502196910870706], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:15:13,348] [INFO] [timer.py:199:stop] epoch=0/micro_step=1440/global_step=720, RunningAvgSamplesPerSec=27.890011265201572, CurrSamplesPerSec=27.805216829256672, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:15:24,897] [INFO] [logging.py:96:log_dist] [Rank 0] step=730, skipped=7, lr=[0.000491212780642662, 0.000491212780642662], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:15:24,898] [INFO] [timer.py:199:stop] epoch=0/micro_step=1460/global_step=730, RunningAvgSamplesPerSec=27.888223393268152, CurrSamplesPerSec=27.67360429127508, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:15:36,427] [INFO] [logging.py:96:log_dist] [Rank 0] step=740, skipped=7, lr=[0.00048023289135015165, 0.00048023289135015165], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:15:36,430] [INFO] [timer.py:199:stop] epoch=0/micro_step=1480/global_step=740, RunningAvgSamplesPerSec=27.887041180852446, CurrSamplesPerSec=27.774082919312047, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:15:47,945] [INFO] [logging.py:96:log_dist] [Rank 0] step=750, skipped=7, lr=[0.0004692625421669822, 0.0004692625421669822], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:15:47,948] [INFO] [timer.py:199:stop] epoch=0/micro_step=1500/global_step=750, RunningAvgSamplesPerSec=27.88640588569903, CurrSamplesPerSec=27.785266065711962, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:15:59,438] [INFO] [logging.py:96:log_dist] [Rank 0] step=760, skipped=7, lr=[0.00045830702766266147, 0.00045830702766266147], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:15:59,442] [INFO] [timer.py:199:stop] epoch=0/micro_step=1520/global_step=760, RunningAvgSamplesPerSec=27.886367741109595, CurrSamplesPerSec=27.96386762128507, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:16:10,917] [INFO] [logging.py:96:log_dist] [Rank 0] step=770, skipped=7, lr=[0.0004473716352471042, 0.0004473716352471042], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:16:10,921] [INFO] [timer.py:199:stop] epoch=0/micro_step=1540/global_step=770, RunningAvgSamplesPerSec=27.88680069794766, CurrSamplesPerSec=28.053563105282276, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:16:22,396] [INFO] [logging.py:96:log_dist] [Rank 0] step=780, skipped=7, lr=[0.0004364616426187927, 0.0004364616426187927], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:16:22,398] [INFO] [timer.py:199:stop] epoch=0/micro_step=1560/global_step=780, RunningAvgSamplesPerSec=27.887253810566094, CurrSamplesPerSec=27.89529878735907, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:16:33,902] [INFO] [logging.py:96:log_dist] [Rank 0] step=790, skipped=7, lr=[0.00042558231521762715, 0.00042558231521762715], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:16:33,906] [INFO] [timer.py:199:stop] epoch=0/micro_step=1580/global_step=790, RunningAvgSamplesPerSec=27.886969688438924, CurrSamplesPerSec=27.93280197260283, MemAllocated=8.15GB, 
MaxMemAllocated=44.21GB [2023-04-14 07:16:45,386] [INFO] [logging.py:96:log_dist] [Rank 0] step=800, skipped=7, lr=[0.0004147389036836881, 0.0004147389036836881], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:16:45,390] [INFO] [timer.py:199:stop] epoch=0/micro_step=1600/global_step=800, RunningAvgSamplesPerSec=27.887305876227373, CurrSamplesPerSec=28.029010161575716, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:16:56,887] [INFO] [logging.py:96:log_dist] [Rank 0] step=810, skipped=7, lr=[0.00040393664132314577, 0.00040393664132314577], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:16:56,891] [INFO] [timer.py:199:stop] epoch=0/micro_step=1620/global_step=810, RunningAvgSamplesPerSec=27.887205733734877, CurrSamplesPerSec=27.867405214544448, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:16:58,024] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations [2023-04-14 07:16:58,024] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 65536.0 to 131072.0 [2023-04-14 07:16:59,143] [INFO] [fused_optimizer.py:362:_update_scale] Grad overflow on iteration 811 [2023-04-14 07:16:59,143] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 131072.0 to 65536.0 [2023-04-14 07:16:59,143] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 131072.0, reducing to 65536.0 [2023-04-14 07:17:08,336] [INFO] [logging.py:96:log_dist] [Rank 0] step=820, skipped=8, lr=[0.00039425409710640367, 0.00039425409710640367], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:17:08,340] [INFO] [timer.py:199:stop] epoch=0/micro_step=1640/global_step=820, RunningAvgSamplesPerSec=27.88866067362485, CurrSamplesPerSec=27.88372567949294, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:17:19,826] [INFO] [logging.py:96:log_dist] [Rank 0] step=830, skipped=8, lr=[0.0003835443627787501, 0.0003835443627787501], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:17:19,830] [INFO] [timer.py:199:stop] epoch=0/micro_step=1660/global_step=830, RunningAvgSamplesPerSec=27.888870665784353, CurrSamplesPerSec=27.930802357198182, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:17:31,327] [INFO] [logging.py:96:log_dist] [Rank 0] step=840, skipped=8, lr=[0.00037289083290325663, 0.00037289083290325663], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:17:31,332] [INFO] [timer.py:199:stop] epoch=0/micro_step=1680/global_step=840, RunningAvgSamplesPerSec=27.88868285582777, CurrSamplesPerSec=27.91060138361735, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:17:42,811] [INFO] [logging.py:96:log_dist] [Rank 0] step=850, skipped=8, lr=[0.00036229864914437627, 0.00036229864914437627], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:17:42,815] [INFO] [timer.py:199:stop] epoch=0/micro_step=1700/global_step=850, RunningAvgSamplesPerSec=27.888991103657123, CurrSamplesPerSec=27.99500244663193, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:17:54,311] [INFO] [logging.py:96:log_dist] [Rank 0] step=860, skipped=8, lr=[0.0003517729235593656, 0.0003517729235593656], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:17:54,315] [INFO] [timer.py:199:stop] epoch=0/micro_step=1720/global_step=860, RunningAvgSamplesPerSec=27.888824968158044, CurrSamplesPerSec=27.79561202119477, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:18:05,790] [INFO] [logging.py:96:log_dist] [Rank 0] step=870, skipped=8, lr=[0.0003413187361310768, 0.0003413187361310768], mom=[(0.9, 0.95), (0.9, 0.95)] 
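The fused_optimizer messages above follow the standard dynamic loss-scaling pattern for fp16 training: on a gradient overflow the step is skipped and the scale is halved; after a run of overflow-free iterations equal to the configured window ("loss_scale_window": 100 in the config dump above) the scale is doubled again. The sketch below is illustrative only, assuming that policy as observed in this log; it is not DeepSpeed's actual FusedOptimizer code and all names are hypothetical.

# Minimal sketch of the halve-on-overflow / double-after-window policy
# visible in the log (not DeepSpeed's implementation; names are illustrative).
class DynamicLossScaler:
    def __init__(self, init_scale=65536.0, scale_window=100,
                 scale_factor=2.0, min_scale=1.0):
        self.scale = init_scale
        self.scale_window = scale_window
        self.scale_factor = scale_factor
        self.min_scale = min_scale
        self.good_steps = 0  # iterations since the last overflow

    def update(self, overflow: bool) -> bool:
        """Return True if the optimizer step should be skipped."""
        if overflow:
            # "Grad overflow on iteration N" -> reduce scale, skip the step
            self.scale = max(self.scale / self.scale_factor, self.min_scale)
            self.good_steps = 0
            return True
        self.good_steps += 1
        if self.good_steps % self.scale_window == 0:
            # "No Grad overflow for 100 iterations" -> increase scale
            self.scale *= self.scale_factor
        return False

Starting from init_scale=65536 this reproduces the trajectory seen above: four early overflows drive the scale down to 4096, and each subsequent 100-iteration clean run doubles it until the next overflow pulls it back.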
[2023-04-14 07:18:05,794] [INFO] [timer.py:199:stop] epoch=0/micro_step=1740/global_step=870, RunningAvgSamplesPerSec=27.889214737300335, CurrSamplesPerSec=27.8595326070193, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:18:17,263] [INFO] [logging.py:96:log_dist] [Rank 0] step=880, skipped=8, lr=[0.00033094113231622814, 0.00033094113231622814], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:18:17,267] [INFO] [timer.py:199:stop] epoch=0/micro_step=1760/global_step=880, RunningAvgSamplesPerSec=27.889744389157606, CurrSamplesPerSec=27.939151486387185, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:18:28,755] [INFO] [logging.py:96:log_dist] [Rank 0] step=890, skipped=8, lr=[0.00032064512061033795, 0.00032064512061033795], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:18:28,759] [INFO] [timer.py:199:stop] epoch=0/micro_step=1780/global_step=890, RunningAvgSamplesPerSec=27.889847441807508, CurrSamplesPerSec=27.883441833100377, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:18:40,235] [INFO] [logging.py:96:log_dist] [Rank 0] step=900, skipped=8, lr=[0.0003104356701304984, 0.0003104356701304984], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:18:40,239] [INFO] [timer.py:199:stop] epoch=0/micro_step=1800/global_step=900, RunningAvgSamplesPerSec=27.890308024616438, CurrSamplesPerSec=27.996123612216046, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:18:51,698] [INFO] [logging.py:96:log_dist] [Rank 0] step=910, skipped=8, lr=[0.0003003177082171523, 0.0003003177082171523], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:18:51,703] [INFO] [timer.py:199:stop] epoch=0/micro_step=1820/global_step=910, RunningAvgSamplesPerSec=27.891063811342896, CurrSamplesPerSec=28.02250273037165, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:18:55,133] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations [2023-04-14 07:18:55,133] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 65536.0 to 131072.0 [2023-04-14 07:18:56,258] [INFO] [fused_optimizer.py:362:_update_scale] Grad overflow on iteration 913 [2023-04-14 07:18:56,258] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 131072.0 to 65536.0 [2023-04-14 07:18:56,258] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 131072.0, reducing to 65536.0 [2023-04-14 07:19:03,163] [INFO] [logging.py:96:log_dist] [Rank 0] step=920, skipped=9, lr=[0.0002912938021228969, 0.0002912938021228969], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:19:03,167] [INFO] [timer.py:199:stop] epoch=0/micro_step=1840/global_step=920, RunningAvgSamplesPerSec=27.89179147203074, CurrSamplesPerSec=27.909759826857577, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:19:14,663] [INFO] [logging.py:96:log_dist] [Rank 0] step=930, skipped=9, lr=[0.0002813630832692028, 0.0002813630832692028], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:19:14,666] [INFO] [timer.py:199:stop] epoch=0/micro_step=1860/global_step=930, RunningAvgSamplesPerSec=27.891644633622363, CurrSamplesPerSec=27.93588915038034, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:19:26,134] [INFO] [logging.py:96:log_dist] [Rank 0] step=940, skipped=9, lr=[0.0002715378841517797, 0.0002715378841517797], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:19:26,138] [INFO] [timer.py:199:stop] epoch=0/micro_step=1880/global_step=940, RunningAvgSamplesPerSec=27.892197066996538, CurrSamplesPerSec=27.81900798747771, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:19:37,618] [INFO] [logging.py:96:log_dist] [Rank 0] step=950, skipped=9, lr=[0.0002618229466615909, 0.0002618229466615909], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:19:37,623] [INFO] [timer.py:199:stop] epoch=0/micro_step=1900/global_step=950, RunningAvgSamplesPerSec=27.892464622791415, CurrSamplesPerSec=27.991750406629706, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:19:49,136] [INFO] [logging.py:96:log_dist] [Rank 0] step=960, skipped=9, lr=[0.0002522229594745347, 0.0002522229594745347], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:19:49,139] [INFO] [timer.py:199:stop] epoch=0/micro_step=1920/global_step=960, RunningAvgSamplesPerSec=27.89188688703324, CurrSamplesPerSec=27.798715007397167, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:20:00,657] [INFO] [logging.py:96:log_dist] [Rank 0] step=970, skipped=9, lr=[0.00024274255578856863, 0.00024274255578856863], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:20:00,660] [INFO] [timer.py:199:stop] epoch=0/micro_step=1940/global_step=970, RunningAvgSamplesPerSec=27.891210364911945, CurrSamplesPerSec=27.92338186963334, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:20:12,189] [INFO] [logging.py:96:log_dist] [Rank 0] step=980, skipped=9, lr=[0.00023338631108761243, 0.00023338631108761243], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:20:12,191] [INFO] [timer.py:199:stop] epoch=0/micro_step=1960/global_step=980, RunningAvgSamplesPerSec=27.89030240271364, CurrSamplesPerSec=27.735944052707037, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:20:23,713] [INFO] [logging.py:96:log_dist] [Rank 0] step=990, skipped=9, lr=[0.00022415874093330168, 0.00022415874093330168], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:20:23,716] [INFO] [timer.py:199:stop] epoch=0/micro_step=1980/global_step=990, RunningAvgSamplesPerSec=27.88958828343984, CurrSamplesPerSec=27.900581448647156, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:20:35,240] [INFO] [logging.py:96:log_dist] [Rank 0] step=1000, skipped=9, lr=[0.00021506429878566358, 0.00021506429878566358], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:20:35,243] [INFO] [timer.py:199:stop] epoch=0/micro_step=2000/global_step=1000, RunningAvgSamplesPerSec=27.8888042861563, CurrSamplesPerSec=27.843431052527308, MemAllocated=8.15GB, 
MaxMemAllocated=44.21GB [2023-04-14 07:20:46,760] [INFO] [logging.py:96:log_dist] [Rank 0] step=1010, skipped=9, lr=[0.00020610737385376348, 0.00020610737385376348], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:20:46,763] [INFO] [timer.py:199:stop] epoch=0/micro_step=2020/global_step=1010, RunningAvgSamplesPerSec=27.888221552350714, CurrSamplesPerSec=27.786836451762564, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:20:52,511] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations [2023-04-14 07:20:52,511] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 65536.0 to 131072.0 [2023-04-14 07:20:53,632] [INFO] [fused_optimizer.py:362:_update_scale] Grad overflow on iteration 1015 [2023-04-14 07:20:53,632] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 131072.0 to 65536.0 [2023-04-14 07:20:53,632] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 131072.0, reducing to 65536.0 [2023-04-14 07:20:58,235] [INFO] [logging.py:96:log_dist] [Rank 0] step=1020, skipped=10, lr=[0.00019816729286664797, 0.00019816729286664797], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:20:58,237] [INFO] [timer.py:199:stop] epoch=0/micro_step=2040/global_step=1020, RunningAvgSamplesPerSec=27.88870545989438, CurrSamplesPerSec=27.867642445160214, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:21:09,756] [INFO] [logging.py:96:log_dist] [Rank 0] step=1030, skipped=10, lr=[0.00018948350353219912, 0.00018948350353219912], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:21:09,759] [INFO] [timer.py:199:stop] epoch=0/micro_step=2060/global_step=1030, RunningAvgSamplesPerSec=27.88813161815214, CurrSamplesPerSec=27.848382060873252, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:21:21,272] [INFO] [logging.py:96:log_dist] [Rank 0] step=1040, skipped=10, lr=[0.00018094957735583463, 0.00018094957735583463], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:21:21,275] [INFO] [timer.py:199:stop] epoch=0/micro_step=2080/global_step=1040, RunningAvgSamplesPerSec=27.88764375410306, CurrSamplesPerSec=27.903528082688315, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:21:32,763] [INFO] [logging.py:96:log_dist] [Rank 0] step=1050, skipped=10, lr=[0.0001725696330273575, 0.0001725696330273575], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:21:32,767] [INFO] [timer.py:199:stop] epoch=0/micro_step=2100/global_step=1050, RunningAvgSamplesPerSec=27.887679086609435, CurrSamplesPerSec=28.033693636377496, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:21:44,235] [INFO] [logging.py:96:log_dist] [Rank 0] step=1060, skipped=10, lr=[0.00016434771492101485, 0.00016434771492101485], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:21:44,239] [INFO] [timer.py:199:stop] epoch=0/micro_step=2120/global_step=1060, RunningAvgSamplesPerSec=27.88815801794712, CurrSamplesPerSec=28.001240069947958, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:21:55,712] [INFO] [logging.py:96:log_dist] [Rank 0] step=1070, skipped=10, lr=[0.00015628779114358032, 0.00015628779114358032], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:21:55,716] [INFO] [timer.py:199:stop] epoch=0/micro_step=2140/global_step=1070, RunningAvgSamplesPerSec=27.888536944504718, CurrSamplesPerSec=27.99043112291661, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:22:07,186] [INFO] [logging.py:96:log_dist] [Rank 0] step=1080, skipped=10, lr=[0.00014839375161924446, 0.00014839375161924446], 
mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:22:07,190] [INFO] [timer.py:199:stop] epoch=0/micro_step=2160/global_step=1080, RunningAvgSamplesPerSec=27.88894765269856, CurrSamplesPerSec=27.835323752361386, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:22:18,654] [INFO] [logging.py:96:log_dist] [Rank 0] step=1090, skipped=10, lr=[0.0001406694062122389, 0.0001406694062122389], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:22:18,658] [INFO] [timer.py:199:stop] epoch=0/micro_step=2180/global_step=1090, RunningAvgSamplesPerSec=27.889494217031334, CurrSamplesPerSec=28.003711357844125, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:22:30,143] [INFO] [logging.py:96:log_dist] [Rank 0] step=1100, skipped=10, lr=[0.00013311848288809813, 0.00013311848288809813], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:22:30,146] [INFO] [timer.py:199:stop] epoch=0/micro_step=2200/global_step=1100, RunningAvgSamplesPerSec=27.889578327515956, CurrSamplesPerSec=27.854068937808545, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:22:41,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=1110, skipped=10, lr=[0.0001257446259144494, 0.0001257446259144494], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:22:41,671] [INFO] [timer.py:199:stop] epoch=0/micro_step=2220/global_step=1110, RunningAvgSamplesPerSec=27.888904673878535, CurrSamplesPerSec=27.79085813957533, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:22:49,719] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations [2023-04-14 07:22:49,719] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 65536.0 to 131072.0 [2023-04-14 07:22:50,842] [INFO] [fused_optimizer.py:362:_update_scale] Grad overflow on iteration 1117 [2023-04-14 07:22:50,843] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 131072.0 to 65536.0 [2023-04-14 07:22:50,843] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 131072.0, reducing to 65536.0 [2023-04-14 07:22:53,446] [INFO] [logging.py:96:log_dist] [Rank 0] step=1120, skipped=11, lr=[0.00011926248951860314, 0.00011926248951860314], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:22:53,450] [INFO] [timer.py:199:stop] epoch=0/micro_step=2240/global_step=1120, RunningAvgSamplesPerSec=27.88276514737834, CurrSamplesPerSec=27.959655920841712, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:23:04,931] [INFO] [logging.py:96:log_dist] [Rank 0] step=1130, skipped=11, lr=[0.00011223479112018653, 0.00011223479112018653], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:23:04,935] [INFO] [timer.py:199:stop] epoch=0/micro_step=2260/global_step=1130, RunningAvgSamplesPerSec=27.882905723669616, CurrSamplesPerSec=27.956068529561303, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:23:16,386] [INFO] [logging.py:96:log_dist] [Rank 0] step=1140, skipped=11, lr=[0.00010539423807301218, 0.00010539423807301218], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:23:16,390] [INFO] [timer.py:199:stop] epoch=0/micro_step=2280/global_step=1140, RunningAvgSamplesPerSec=27.883620594264737, CurrSamplesPerSec=28.00500852036866, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:23:27,875] [INFO] [logging.py:96:log_dist] [Rank 0] step=1150, skipped=11, lr=[9.874413180194608e-05, 9.874413180194608e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:23:27,878] [INFO] [timer.py:199:stop] epoch=0/micro_step=2300/global_step=1150, RunningAvgSamplesPerSec=27.88376230362711, CurrSamplesPerSec=27.839694409482334, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:23:39,400] [INFO] [logging.py:96:log_dist] [Rank 0] step=1160, skipped=11, lr=[9.228768181739628e-05, 9.228768181739628e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:23:39,403] [INFO] [timer.py:199:stop] epoch=0/micro_step=2320/global_step=1160, RunningAvgSamplesPerSec=27.88317799150038, CurrSamplesPerSec=27.911019278934415, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:23:50,930] [INFO] [logging.py:96:log_dist] [Rank 0] step=1170, skipped=11, lr=[8.60280041663225e-05, 8.60280041663225e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:23:50,933] [INFO] [timer.py:199:stop] epoch=0/micro_step=2340/global_step=1170, RunningAvgSamplesPerSec=27.882588082067915, CurrSamplesPerSec=27.84352347062886, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:24:02,445] [INFO] [logging.py:96:log_dist] [Rank 0] step=1180, skipped=11, lr=[7.996811992835184e-05, 7.996811992835184e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:24:02,448] [INFO] [timer.py:199:stop] epoch=0/micro_step=2360/global_step=1180, RunningAvgSamplesPerSec=27.882268201762017, CurrSamplesPerSec=27.892591546585386, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:24:13,955] [INFO] [logging.py:96:log_dist] [Rank 0] step=1190, skipped=11, lr=[7.411095375772925e-05, 7.411095375772925e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:24:13,959] [INFO] [timer.py:199:stop] epoch=0/micro_step=2380/global_step=1190, RunningAvgSamplesPerSec=27.88202009403273, CurrSamplesPerSec=27.86537445431461, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:24:25,476] [INFO] [logging.py:96:log_dist] [Rank 0] step=1200, skipped=11, lr=[6.845933247180514e-05, 6.845933247180514e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:24:25,479] [INFO] [timer.py:199:stop] epoch=0/micro_step=2400/global_step=1200, RunningAvgSamplesPerSec=27.881605191374955, CurrSamplesPerSec=27.84837050454499, 
MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:24:37,000] [INFO] [logging.py:96:log_dist] [Rank 0] step=1210, skipped=11, lr=[6.301598368674105e-05, 6.301598368674105e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:24:37,002] [INFO] [timer.py:199:stop] epoch=0/micro_step=2420/global_step=1210, RunningAvgSamplesPerSec=27.88109042500762, CurrSamplesPerSec=27.851387031683306, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:24:47,327] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations [2023-04-14 07:24:47,328] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 65536.0 to 131072.0 [2023-04-14 07:24:48,449] [INFO] [fused_optimizer.py:362:_update_scale] Grad overflow on iteration 1219 [2023-04-14 07:24:48,450] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 131072.0 to 65536.0 [2023-04-14 07:24:48,450] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 131072.0, reducing to 65536.0 [2023-04-14 07:24:48,450] [INFO] [logging.py:96:log_dist] [Rank 0] step=1220, skipped=12, lr=[5.8297216162899295e-05, 5.8297216162899295e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:24:48,451] [INFO] [timer.py:199:stop] epoch=0/micro_step=2440/global_step=1220, RunningAvgSamplesPerSec=27.882041162742162, CurrSamplesPerSec=28.920824294076024, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:24:59,914] [INFO] [logging.py:96:log_dist] [Rank 0] step=1230, skipped=12, lr=[5.325673868567482e-05, 5.325673868567482e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:24:59,918] [INFO] [timer.py:199:stop] epoch=0/micro_step=2460/global_step=1230, RunningAvgSamplesPerSec=27.882582805118812, CurrSamplesPerSec=28.024743707689165, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:25:11,382] [INFO] [logging.py:96:log_dist] [Rank 0] step=1240, skipped=12, lr=[4.843187086769574e-05, 4.843187086769574e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:25:11,386] [INFO] [timer.py:199:stop] epoch=0/micro_step=2480/global_step=1240, RunningAvgSamplesPerSec=27.883115787936894, CurrSamplesPerSec=27.959434594084506, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:25:22,842] [INFO] [logging.py:96:log_dist] [Rank 0] step=1250, skipped=12, lr=[4.38249413128744e-05, 4.38249413128744e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:25:22,846] [INFO] [timer.py:199:stop] epoch=0/micro_step=2500/global_step=1250, RunningAvgSamplesPerSec=27.883773985755955, CurrSamplesPerSec=27.98992912726541, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:25:34,315] [INFO] [logging.py:96:log_dist] [Rank 0] step=1260, skipped=12, lr=[3.9438173442575e-05, 3.9438173442575e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:25:34,319] [INFO] [timer.py:199:stop] epoch=0/micro_step=2520/global_step=1260, RunningAvgSamplesPerSec=27.884217398264887, CurrSamplesPerSec=27.88673248894651, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:25:45,828] [INFO] [logging.py:96:log_dist] [Rank 0] step=1270, skipped=12, lr=[3.5273684422533594e-05, 3.5273684422533594e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:25:45,831] [INFO] [timer.py:199:stop] epoch=0/micro_step=2540/global_step=1270, RunningAvgSamplesPerSec=27.883976240456118, CurrSamplesPerSec=27.828173153796904, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:25:57,331] [INFO] [logging.py:96:log_dist] [Rank 0] step=1280, skipped=12, lr=[3.133348414106035e-05, 
3.133348414106035e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:25:57,334] [INFO] [timer.py:199:stop] epoch=0/micro_step=2560/global_step=1280, RunningAvgSamplesPerSec=27.883867373684225, CurrSamplesPerSec=27.819359716918402, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:26:08,841] [INFO] [logging.py:96:log_dist] [Rank 0] step=1290, skipped=12, lr=[2.7619474239016175e-05, 2.7619474239016175e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:26:08,845] [INFO] [timer.py:199:stop] epoch=0/micro_step=2580/global_step=1290, RunningAvgSamplesPerSec=27.883616888716304, CurrSamplesPerSec=27.80841414561024, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:26:20,301] [INFO] [logging.py:96:log_dist] [Rank 0] step=1300, skipped=12, lr=[2.4133447192032476e-05, 2.4133447192032476e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:26:20,305] [INFO] [timer.py:199:stop] epoch=0/micro_step=2600/global_step=1300, RunningAvgSamplesPerSec=27.88425762253477, CurrSamplesPerSec=27.963587966994822, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:26:31,766] [INFO] [logging.py:96:log_dist] [Rank 0] step=1310, skipped=12, lr=[2.087708544541689e-05, 2.087708544541689e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:26:31,770] [INFO] [timer.py:199:stop] epoch=0/micro_step=2620/global_step=1310, RunningAvgSamplesPerSec=27.884794065846652, CurrSamplesPerSec=27.93517398067774, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:26:43,220] [INFO] [logging.py:96:log_dist] [Rank 0] step=1320, skipped=12, lr=[1.7851960602162432e-05, 1.7851960602162432e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:26:43,224] [INFO] [timer.py:199:stop] epoch=0/micro_step=2640/global_step=1320, RunningAvgSamplesPerSec=27.88553456135489, CurrSamplesPerSec=27.976463687615464, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:26:44,359] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations [2023-04-14 07:26:44,359] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 65536.0 to 131072.0 [2023-04-14 07:26:45,481] [INFO] [fused_optimizer.py:362:_update_scale] Grad overflow on iteration 1321 [2023-04-14 07:26:45,481] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 131072.0 to 65536.0 [2023-04-14 07:26:45,481] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. 
Attempted loss scale: 131072.0, reducing to 65536.0 [2023-04-14 07:26:54,663] [INFO] [logging.py:96:log_dist] [Rank 0] step=1330, skipped=13, lr=[1.53282648048792e-05, 1.53282648048792e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:26:54,667] [INFO] [timer.py:199:stop] epoch=0/micro_step=2660/global_step=1330, RunningAvgSamplesPerSec=27.886483846959727, CurrSamplesPerSec=27.916034999005806, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:27:06,129] [INFO] [logging.py:96:log_dist] [Rank 0] step=1340, skipped=13, lr=[1.2746419577261248e-05, 1.2746419577261248e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:27:06,133] [INFO] [timer.py:199:stop] epoch=0/micro_step=2680/global_step=1340, RunningAvgSamplesPerSec=27.886975829937917, CurrSamplesPerSec=27.917562132811387, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:27:17,598] [INFO] [logging.py:96:log_dist] [Rank 0] step=1350, skipped=13, lr=[1.0399735319127134e-05, 1.0399735319127134e-05], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:27:17,603] [INFO] [timer.py:199:stop] epoch=0/micro_step=2700/global_step=1350, RunningAvgSamplesPerSec=27.887425274088365, CurrSamplesPerSec=27.92047169795621, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:27:29,066] [INFO] [logging.py:96:log_dist] [Rank 0] step=1360, skipped=13, lr=[8.289344599979375e-06, 8.289344599979375e-06], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:27:29,070] [INFO] [timer.py:199:stop] epoch=0/micro_step=2720/global_step=1360, RunningAvgSamplesPerSec=27.887938002301073, CurrSamplesPerSec=27.977962447777546, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:27:40,552] [INFO] [logging.py:96:log_dist] [Rank 0] step=1370, skipped=13, lr=[6.4162659480493935e-06, 6.4162659480493935e-06], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:27:40,555] [INFO] [timer.py:199:stop] epoch=0/micro_step=2740/global_step=1370, RunningAvgSamplesPerSec=27.88807816395258, CurrSamplesPerSec=27.940076244267264, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:27:52,083] [INFO] [logging.py:96:log_dist] [Rank 0] step=1380, skipped=13, lr=[4.781403358729786e-06, 4.781403358729786e-06], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:27:52,086] [INFO] [timer.py:199:stop] epoch=0/micro_step=2760/global_step=1380, RunningAvgSamplesPerSec=27.887456424438863, CurrSamplesPerSec=27.74971246039417, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:28:03,601] [INFO] [logging.py:96:log_dist] [Rank 0] step=1390, skipped=13, lr=[3.3855458582830455e-06, 3.3855458582830455e-06], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:28:03,604] [INFO] [timer.py:199:stop] epoch=0/micro_step=2780/global_step=1390, RunningAvgSamplesPerSec=27.887111940337512, CurrSamplesPerSec=27.871450259874212, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:28:15,108] [INFO] [logging.py:96:log_dist] [Rank 0] step=1400, skipped=13, lr=[2.2293671230376176e-06, 2.2293671230376176e-06], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:28:15,111] [INFO] [timer.py:199:stop] epoch=0/micro_step=2800/global_step=1400, RunningAvgSamplesPerSec=27.8869534453658, CurrSamplesPerSec=27.841652123641264, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:28:26,626] [INFO] [logging.py:96:log_dist] [Rank 0] step=1410, skipped=13, lr=[1.3134251542544773e-06, 1.3134251542544773e-06], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:28:26,628] [INFO] [timer.py:199:stop] epoch=0/micro_step=2820/global_step=1410, RunningAvgSamplesPerSec=27.886597879837947, 
CurrSamplesPerSec=27.783661349068428, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:28:38,133] [INFO] [logging.py:96:log_dist] [Rank 0] step=1420, skipped=13, lr=[6.381620088213858e-07, 6.381620088213858e-07], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:28:38,135] [INFO] [timer.py:199:stop] epoch=0/micro_step=2840/global_step=1420, RunningAvgSamplesPerSec=27.88643605360806, CurrSamplesPerSec=27.833776741626135, MemAllocated=8.15GB, MaxMemAllocated=44.21GB [2023-04-14 07:28:41,567] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations [2023-04-14 07:28:41,567] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 65536.0 to 131072.0 [2023-04-14 07:28:42,688] [INFO] [fused_optimizer.py:362:_update_scale] Grad overflow on iteration 1423 [2023-04-14 07:28:42,688] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 131072.0 to 65536.0 [2023-04-14 07:28:42,688] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 131072.0, reducing to 65536.0 [2023-04-14 07:28:49,374] [INFO] [logging.py:96:log_dist] [Rank 0] step=1430, skipped=14, lr=[2.3647733036713525e-07, 2.3647733036713525e-07], mom=[(0.9, 0.95), (0.9, 0.95)] [2023-04-14 07:28:49,378] [INFO] [timer.py:199:stop] epoch=0/micro_step=2860/global_step=1430, RunningAvgSamplesPerSec=27.890724382161228, CurrSamplesPerSec=33.86488836407768, MemAllocated=7.87GB, MaxMemAllocated=44.21GB ***** Evaluating perplexity, Epoch 1/1 ***** ppl: 2.1566169261932373 saving the final model ... [2023-04-14 07:29:39,262] [INFO] [launch.py:460:main] Process 1097 exits successfully.
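
Note on the loss-scale messages above: the recurring pairs of fused_optimizer lines ("No Grad overflow for 100 iterations" / "Increasing dynamic loss scale", then "Grad overflow on iteration N" / "Reducing dynamic loss scale" and "Overflow detected. Skipping step.") are DeepSpeed's dynamic loss scaling for fp16 training. After a window of overflow-free iterations the scale is doubled (65536.0 -> 131072.0); as soon as a gradient overflow is detected the step is dropped and the scale is halved back, which is why the skipped= counter in the step logs keeps growing. A minimal, self-contained sketch of that control loop (illustrative names only, not DeepSpeed's internal API):

class SimpleDynamicLossScaler:
    # Mirrors the behaviour visible in the log: double the scale after
    # `scale_window` overflow-free steps, halve it and skip the step on overflow.
    def __init__(self, init_scale=65536.0, scale_window=100, min_scale=1.0):
        self.scale = init_scale
        self.scale_window = scale_window
        self.min_scale = min_scale
        self.good_steps = 0

    def update(self, has_overflow: bool) -> bool:
        # Returns True if the optimizer step should be applied.
        if has_overflow:
            self.scale = max(self.scale / 2.0, self.min_scale)  # "Reducing dynamic loss scale"
            self.good_steps = 0
            return False                                        # "Overflow detected. Skipping step."
        self.good_steps += 1
        if self.good_steps % self.scale_window == 0:
            self.scale *= 2.0                                   # "Increasing dynamic loss scale"
        return True

In the training loop the loss is multiplied by the current scale before backward(), the gradients are checked for inf/NaN (that check is the has_overflow flag), they are unscaled before the optimizer step, and a skipped step simply moves on to the next batch.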
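
Note on the lr column: the learning rate decays smoothly from its 1e-3 peak toward zero by roughly step 1430, which is consistent with the cosine decay schedule commonly used by these fine-tuning scripts (whether warmup was used here, and the exact total step count, are assumptions). A generic sketch of such a schedule:

import math

def cosine_lr(step, total_steps, peak_lr=1e-3, warmup_steps=0, min_lr=0.0):
    # Linear warmup to peak_lr, then cosine decay down to min_lr.
    if warmup_steps > 0 and step < warmup_steps:
        return peak_lr * step / warmup_steps
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return min_lr + (peak_lr - min_lr) * 0.5 * (1.0 + math.cos(math.pi * min(1.0, progress)))

With total_steps around 1435 this gives values of the same order as the log, e.g. roughly 3e-4 near step 880 and well below 1e-6 by step 1430.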
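
Note on the final evaluation: "ppl: 2.1566169261932373" is the perplexity of the fine-tuned actor model on the held-out split, i.e. the exponential of the mean token-level cross-entropy loss (a mean loss of about 0.769 corresponds to a perplexity of about 2.157). A small sketch of how such a number is typically computed with a Hugging Face causal-LM model (function name and dataloader format are assumptions, not the exact evaluation code of the script above):

import math
import torch

def evaluate_perplexity(model, dataloader, device="cuda"):
    # Average the per-batch LM loss over the eval set and exponentiate it.
    model.eval()
    losses = []
    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)          # batch includes labels -> outputs.loss
            losses.append(outputs.loss.float())
    mean_loss = torch.stack(losses).mean().item()
    try:
        return math.exp(mean_loss)            # perplexity = exp(mean cross-entropy)
    except OverflowError:
        return float("inf")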