diff --git "a/training.log" "b/training.log" new file mode 100644--- /dev/null +++ "b/training.log" @@ -0,0 +1,1439 @@ +/home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version! + warnings.warn( +[2023-04-14 06:58:31,332] [WARNING] [runner.py:190:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. +[2023-04-14 06:58:32,784] [INFO] [runner.py:540:main] cmd = /home/minutiae/.conda/envs/py39/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgMywgNCwgNSwgNiwgN119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None main.py --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets openai/webgpt_comparisons stanfordnlp/SHP --data_split 2,4,4 --model_name_or_path facebook/opt-1.3b --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --max_seq_len 512 --learning_rate 9.65e-6 --weight_decay 0.1 --num_train_epochs 2 --gradient_accumulation_steps 1 --lr_scheduler_type cosine --num_warmup_steps 0 --seed 1234 --zero_stage 2 --deepspeed --output_dir /lus/grand/projects/BNN-Scale/chatgpt/hf_runs/DeepSpeedExamples/applications/DeepSpeed-Chat/output/actor-models/1.3b +/home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version! + warnings.warn( +[2023-04-14 06:59:25,659] [INFO] [launch.py:229:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]} +[2023-04-14 06:59:25,760] [INFO] [launch.py:235:main] nnodes=1, num_local_procs=8, node_rank=0 +[2023-04-14 06:59:25,760] [INFO] [launch.py:246:main] global_rank_mapping=defaultdict(, {'localhost': [0, 1, 2, 3, 4, 5, 6, 7]}) +[2023-04-14 06:59:25,760] [INFO] [launch.py:247:main] dist_world_size=8 +[2023-04-14 06:59:25,760] [INFO] [launch.py:249:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +/home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version! + warnings.warn( +/home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version! + warnings.warn( +/home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version! + warnings.warn( +/home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version! + warnings.warn( +/home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version! + warnings.warn( +/home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version! + warnings.warn( +/home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version! + warnings.warn( +/home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/requests/__init__.py:109: RequestsDependencyWarning: urllib3 (1.26.9) or chardet (5.0.0)/charset_normalizer (2.0.4) doesn't match a supported version! + warnings.warn( +[2023-04-14 07:04:01,148] [INFO] [comm.py:586:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +Found cached dataset parquet (/grand/projects/BNN-Scale/reward/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec) + 0%| | 0/2 [00:00 +[2023-04-14 07:15:06,557] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.float16 ZeRO stage 2 optimizer +[2023-04-14 07:15:06,557] [INFO] [stage_1_and_2.py:133:__init__] Reduce bucket size 500,000,000 +[2023-04-14 07:15:06,557] [INFO] [stage_1_and_2.py:134:__init__] Allgather bucket size 500,000,000 +[2023-04-14 07:15:06,557] [INFO] [stage_1_and_2.py:135:__init__] CPU Offload: False +[2023-04-14 07:15:06,557] [INFO] [stage_1_and_2.py:136:__init__] Round robin gradient partitioning: False +Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root... +Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root... +Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root... +Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root... +Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root... +Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root... +Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root... +Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root... +huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... +To disable this warning, you can either: + - Avoid using `tokenizers` before the fork if possible + - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) +huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... +To disable this warning, you can either: + - Avoid using `tokenizers` before the fork if possible + - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) +huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... +To disable this warning, you can either: + - Avoid using `tokenizers` before the fork if possible + - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) +Emitting ninja build file /home/minutiae/.cache/torch_extensions/py39_cu113/utils/build.ninja... +Building extension module utils... +Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N) +huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks... +To disable this warning, you can either: + - Avoid using `tokenizers` before the fork if possible + - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false) +[1/2] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx -MMD -MF flatten_unflatten.o.d -DTORCH_EXTENSION_NAME=utils -DTORCH_API_INCLUDE_EXTENSION_H -DPYBIND11_COMPILER_TYPE=\"_gcc\" -DPYBIND11_STDLIB=\"_libstdcpp\" -DPYBIND11_BUILD_ABI=\"_cxxabi1011\" -isystem /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/include -isystem /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/include/torch/csrc/api/include -isystem /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/include/TH -isystem /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/include/THC -isystem /home/minutiae/.conda/envs/py39/include/python3.9 -D_GLIBCXX_USE_CXX11_ABI=0 -fPIC -std=c++14 -c /home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/deepspeed/ops/csrc/utils/flatten_unflatten.cpp -o flatten_unflatten.o +[2/2] /lus/theta-fs0/software/thetagpu/openmpi/openmpi-4.1.4_ucx-1.12.1_gcc-9.4.0/bin/mpicxx flatten_unflatten.o -shared -L/home/minutiae/.conda/envs/py39/lib/python3.9/site-packages/torch/lib -lc10 -ltorch_cpu -ltorch -ltorch_python -o utils.so +Loading extension module utils... +Time to load utils op: 21.48611044883728 seconds +Loading extension module utils... +Loading extension module utils... +Time to load utils op: 21.532369136810303 seconds +Time to load utils op: 21.53208827972412 seconds +Loading extension module utils... +Loading extension module utils... +Time to load utils op: 21.545364141464233 seconds +Loading extension module utils... +Loading extension module utils... +Time to load utils op: 21.54555320739746 seconds +Loading extension module utils... +Time to load utils op: 21.544370412826538 seconds +Time to load utils op: 21.547586917877197 seconds +Time to load utils op: 21.547967672348022 seconds +Rank: 3 partition count [8, 8] and sizes[(164401920, False), (67840, False)] +Rank: 1 partition count [8, 8] and sizes[(164401920, False), (67840, False)] +Rank: 5 partition count [8, 8] and sizes[(164401920, False), (67840, False)] +Rank: 0 partition count [8, 8] and sizes[(164401920, False), (67840, False)] +Rank: 7 partition count [8, 8] and sizes[(164401920, False), (67840, False)] +Rank: 4 partition count [8, 8] and sizes[(164401920, False), (67840, False)] +Rank: 6 partition count [8, 8] and sizes[(164401920, False), (67840, False)] +Rank: 2 partition count [8, 8] and sizes[(164401920, False), (67840, False)] +Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root... +Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0016155242919921875 seconds +Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0008933544158935547 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0008301734924316406 seconds +Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root... +Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root... +Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root... +Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0009446144104003906 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0008039474487304688 seconds +Time to load utils op: 0.0007910728454589844 seconds +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0009191036224365234 seconds +[2023-04-14 07:15:41,001] [INFO] [utils.py:785:see_memory_usage] Before initializing optimizer states +[2023-04-14 07:15:41,002] [INFO] [utils.py:786:see_memory_usage] MA 3.06 GB Max_MA 3.06 GB CA 3.07 GB Max_CA 3 GB +[2023-04-14 07:15:41,002] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory: used = 142.4 GB, percent = 14.1% +[2023-04-14 07:15:41,566] [INFO] [utils.py:785:see_memory_usage] After initializing optimizer states +[2023-04-14 07:15:41,567] [INFO] [utils.py:786:see_memory_usage] MA 4.29 GB Max_MA 4.91 GB CA 4.91 GB Max_CA 5 GB +[2023-04-14 07:15:41,567] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory: used = 142.4 GB, percent = 14.1% +[2023-04-14 07:15:41,567] [INFO] [stage_1_and_2.py:489:__init__] optimizer state initialized +[2023-04-14 07:15:42,109] [INFO] [utils.py:785:see_memory_usage] After initializing ZeRO optimizer +[2023-04-14 07:15:42,109] [INFO] [utils.py:786:see_memory_usage] MA 4.29 GB Max_MA 4.29 GB CA 4.91 GB Max_CA 5 GB +[2023-04-14 07:15:42,110] [INFO] [utils.py:793:see_memory_usage] CPU Virtual Memory: used = 142.4 GB, percent = 14.1% +[2023-04-14 07:15:42,111] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[2023-04-14 07:15:42,112] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[2023-04-14 07:15:42,112] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[2023-04-14 07:15:42,112] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[9.65e-06, 9.65e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:15:42,112] [INFO] [config.py:953:print] DeepSpeedEngine configuration: +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] activation_checkpointing_config { + "partition_activations": false, + "contiguous_memory_optimization": false, + "cpu_checkpointing": false, + "number_checkpoints": null, + "synchronize_checkpoint_boundary": false, + "profile": false +} +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] amp_enabled .................. False +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] amp_params ................... False +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] autotuning_config ............ { + "enabled": false, + "start_step": null, + "end_step": null, + "metric_path": null, + "arg_mappings": null, + "metric": "throughput", + "model_info": null, + "results_dir": "autotuning_results", + "exps_dir": "autotuning_exps", + "overwrite": true, + "fast": true, + "start_profile_step": 3, + "end_profile_step": 5, + "tuner_type": "gridsearch", + "tuner_early_stopping": 5, + "tuner_num_trials": 50, + "model_info_path": null, + "mp_size": 1, + "max_train_batch_size": null, + "min_train_batch_size": 1, + "max_train_micro_batch_size_per_gpu": 1.024000e+03, + "min_train_micro_batch_size_per_gpu": 1, + "num_tuning_micro_batch_sizes": 3 +} +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] bfloat16_enabled ............. False +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] checkpoint_parallel_write_pipeline False +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] checkpoint_tag_validation_enabled True +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] checkpoint_tag_validation_fail False +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] comms_config ................. +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] communication_data_type ...... None +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] curriculum_enabled_legacy .... False +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] curriculum_params_legacy ..... False +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] data_efficiency_enabled ...... False +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] dataloader_drop_last ......... False +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] disable_allgather ............ False +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] dump_state ................... False +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 100, 'delayed_shift': 2, 'min_scale': 1} +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] eigenvalue_enabled ........... False +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] eigenvalue_gas_boundary_resolution 1 +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] eigenvalue_layer_name ........ bert.encoder.layer +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] eigenvalue_layer_num ......... 0 +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] eigenvalue_max_iter .......... 100 +[2023-04-14 07:15:42,113] [INFO] [config.py:957:print] eigenvalue_stability ......... 1e-06 +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] eigenvalue_tol ............... 0.01 +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] eigenvalue_verbose ........... False +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] elasticity_enabled ........... False +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] flops_profiler_config ........ { + "enabled": false, + "profile_step": 1, + "module_depth": -1, + "top_modules": 1, + "detailed": true, + "output_file": null +} +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] fp16_auto_cast ............... False +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] fp16_enabled ................. True +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] fp16_master_weights_and_gradients False +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] global_rank .................. 0 +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] grad_accum_dtype ............. None +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] gradient_accumulation_steps .. 1 +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] gradient_clipping ............ 1.0 +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] gradient_predivide_factor .... 1.0 +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] initial_dynamic_scale ........ 65536 +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] load_universal_checkpoint .... False +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] loss_scale ................... 0 +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] memory_breakdown ............. False +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] nebula_config ................ { + "enabled": false, + "persistent_storage_path": null, + "persistent_time_interval": 100, + "num_of_version_in_retention": 2, + "enable_nebula_load": true, + "load_path": null +} +[2023-04-14 07:15:42,114] [INFO] [config.py:957:print] optimizer_legacy_fusion ...... False +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] optimizer_name ............... None +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] optimizer_params ............. None +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0} +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] pld_enabled .................. False +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] pld_params ................... False +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] prescale_gradients ........... False +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] scheduler_name ............... None +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] scheduler_params ............. None +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] sparse_attention ............. None +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] sparse_gradients_enabled ..... False +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] steps_per_print .............. 10 +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] train_batch_size ............. 64 +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] train_micro_batch_size_per_gpu 8 +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] use_node_local_storage ....... False +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] wall_clock_breakdown ......... False +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] world_size ................... 8 +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] zero_allow_untested_optimizer False +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] zero_config .................. stage=2 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=30000000 param_persistence_threshold=10000 model_persistence_threshold=sys.maxsize max_live_parameters=30000000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False memory_efficient_linear=False +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] zero_enabled ................. True +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] zero_force_ds_cpu_optimizer .. True +[2023-04-14 07:15:42,117] [INFO] [config.py:957:print] zero_optimization_stage ...... 2 +[2023-04-14 07:15:42,117] [INFO] [config.py:943:print_user_config] json = { + "train_batch_size": 64, + "train_micro_batch_size_per_gpu": 8, + "steps_per_print": 10, + "zero_optimization": { + "stage": 2, + "offload_param": { + "device": "none" + }, + "offload_optimizer": { + "device": "none" + }, + "stage3_param_persistence_threshold": 1.000000e+04, + "stage3_max_live_parameters": 3.000000e+07, + "stage3_prefetch_bucket_size": 3.000000e+07, + "memory_efficient_linear": false + }, + "fp16": { + "enabled": true, + "loss_scale_window": 100 + }, + "gradient_clipping": 1.0, + "prescale_gradients": false, + "wall_clock_breakdown": false, + "hybrid_engine": { + "enabled": false, + "inference_tp_size": 1, + "release_inference_cache": false, + "pin_parameters": true, + "tp_gather_partition_size": 8 + } +} +Using /home/minutiae/.cache/torch_extensions/py39_cu113 as PyTorch extensions root... +No modifications detected for re-loaded extension module utils, skipping build step... +Loading extension module utils... +Time to load utils op: 0.0014319419860839844 seconds +***** Running training ***** +***** Evaluating perplexity, Epoch 0/2 ***** +ppl: 3695.26708984375 +Beginning of Epoch 1/2, Total Micro Batches 2065 +[2023-04-14 07:15:58,047] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:15:58,376] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:15:58,708] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384 +[2023-04-14 07:15:59,040] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 16384, reducing to 8192 +[2023-04-14 07:15:59,379] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 8192, reducing to 4096 +[2023-04-14 07:16:00,441] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 4096, reducing to 2048 +[2023-04-14 07:16:01,141] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=6, lr=[9.649977664966322e-06, 9.649977664966322e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:16:01,159] [INFO] [timer.py:199:stop] epoch=0/micro_step=10/global_step=10, RunningAvgSamplesPerSec=184.27451903769784, CurrSamplesPerSec=178.46535809357158, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:16:04,740] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=6, lr=[9.649726398212148e-06, 9.649726398212148e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:16:04,758] [INFO] [timer.py:199:stop] epoch=0/micro_step=20/global_step=20, RunningAvgSamplesPerSec=180.75847659196612, CurrSamplesPerSec=178.6512388465872, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:16:08,330] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=6, lr=[9.6491959604991e-06, 9.6491959604991e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:16:08,348] [INFO] [timer.py:199:stop] epoch=0/micro_step=30/global_step=30, RunningAvgSamplesPerSec=179.94068927773955, CurrSamplesPerSec=178.89566908406053, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:16:11,916] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=6, lr=[9.648386382519654e-06, 9.648386382519654e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:16:12,030] [INFO] [timer.py:199:stop] epoch=0/micro_step=40/global_step=40, RunningAvgSamplesPerSec=178.3468741701633, CurrSamplesPerSec=141.12131202353532, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:16:15,597] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=6, lr=[9.647297711118056e-06, 9.647297711118056e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:16:15,615] [INFO] [timer.py:199:stop] epoch=0/micro_step=50/global_step=50, RunningAvgSamplesPerSec=178.42573529930036, CurrSamplesPerSec=178.5987016711144, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:16:19,183] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=6, lr=[9.645930009287603e-06, 9.645930009287603e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:16:19,201] [INFO] [timer.py:199:stop] epoch=0/micro_step=60/global_step=60, RunningAvgSamplesPerSec=178.4672974294851, CurrSamplesPerSec=178.6980966905942, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:16:22,771] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=6, lr=[9.64428335616701e-06, 9.64428335616701e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:16:22,790] [INFO] [timer.py:199:stop] epoch=0/micro_step=70/global_step=70, RunningAvgSamplesPerSec=178.4811224959432, CurrSamplesPerSec=177.49617712889585, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:16:26,357] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=6, lr=[9.64235784703582e-06, 9.64235784703582e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:16:26,375] [INFO] [timer.py:199:stop] epoch=0/micro_step=80/global_step=80, RunningAvgSamplesPerSec=178.51120530097816, CurrSamplesPerSec=178.62187602723955, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:16:29,941] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=6, lr=[9.640153593308894e-06, 9.640153593308894e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:16:29,959] [INFO] [timer.py:199:stop] epoch=0/micro_step=90/global_step=90, RunningAvgSamplesPerSec=178.54413064796884, CurrSamplesPerSec=178.75604886773934, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:16:33,530] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=6, lr=[9.637670722529972e-06, 9.637670722529972e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:16:33,548] [INFO] [timer.py:199:stop] epoch=0/micro_step=100/global_step=100, RunningAvgSamplesPerSec=178.54529986190954, CurrSamplesPerSec=178.71415769774134, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:16:37,128] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=6, lr=[9.634909378364277e-06, 9.634909378364277e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:16:37,146] [INFO] [timer.py:199:stop] epoch=0/micro_step=110/global_step=110, RunningAvgSamplesPerSec=178.52969180834228, CurrSamplesPerSec=178.78021662582927, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:16:40,715] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=6, lr=[9.631869720590215e-06, 9.631869720590215e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:16:40,733] [INFO] [timer.py:199:stop] epoch=0/micro_step=120/global_step=120, RunningAvgSamplesPerSec=178.53837181922032, CurrSamplesPerSec=178.5123558922577, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:16:44,302] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=6, lr=[9.628551925090132e-06, 9.628551925090132e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:16:44,321] [INFO] [timer.py:199:stop] epoch=0/micro_step=130/global_step=130, RunningAvgSamplesPerSec=178.54282292060213, CurrSamplesPerSec=178.08931950205266, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:16:47,890] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=6, lr=[9.624956183840126e-06, 9.624956183840126e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:16:47,908] [INFO] [timer.py:199:stop] epoch=0/micro_step=140/global_step=140, RunningAvgSamplesPerSec=178.54819277355233, CurrSamplesPerSec=178.83607747829134, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:16:51,476] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=6, lr=[9.621082704898941e-06, 9.621082704898941e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:16:51,494] [INFO] [timer.py:199:stop] epoch=0/micro_step=150/global_step=150, RunningAvgSamplesPerSec=178.55696927444367, CurrSamplesPerSec=178.6183103502426, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:16:55,060] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=6, lr=[9.61693171239594e-06, 9.61693171239594e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:16:55,079] [INFO] [timer.py:199:stop] epoch=0/micro_step=160/global_step=160, RunningAvgSamplesPerSec=178.56857723010597, CurrSamplesPerSec=178.65742172291127, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:16:58,647] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=6, lr=[9.612503446518117e-06, 9.612503446518117e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:16:59,312] [INFO] [timer.py:199:stop] epoch=0/micro_step=170/global_step=170, RunningAvgSamplesPerSec=176.67575781999457, CurrSamplesPerSec=63.69488201868073, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:17:02,881] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=6, lr=[9.60779816349622e-06, 9.60779816349622e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:17:02,899] [INFO] [timer.py:199:stop] epoch=0/micro_step=180/global_step=180, RunningAvgSamplesPerSec=176.78443692615136, CurrSamplesPerSec=178.81511067886143, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:17:06,483] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=6, lr=[9.602816135589906e-06, 9.602816135589906e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:17:06,502] [INFO] [timer.py:199:stop] epoch=0/micro_step=190/global_step=190, RunningAvgSamplesPerSec=176.84287306677768, CurrSamplesPerSec=178.97486815348202, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:17:10,071] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=6, lr=[9.597557651072005e-06, 9.597557651072005e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:17:10,089] [INFO] [timer.py:199:stop] epoch=0/micro_step=200/global_step=200, RunningAvgSamplesPerSec=176.9307743321774, CurrSamplesPerSec=177.29102599242452, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:17:13,657] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=6, lr=[9.59202301421182e-06, 9.59202301421182e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:17:14,241] [INFO] [timer.py:199:stop] epoch=0/micro_step=210/global_step=210, RunningAvgSamplesPerSec=175.69256065589158, CurrSamplesPerSec=69.30462511124324, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:17:17,805] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=6, lr=[9.586212545257542e-06, 9.586212545257542e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:17:17,824] [INFO] [timer.py:199:stop] epoch=0/micro_step=220/global_step=220, RunningAvgSamplesPerSec=175.83511974506592, CurrSamplesPerSec=178.81606360854315, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:17:21,389] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=6, lr=[9.580126580417702e-06, 9.580126580417702e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:17:21,407] [INFO] [timer.py:199:stop] epoch=0/micro_step=230/global_step=230, RunningAvgSamplesPerSec=175.96319916810444, CurrSamplesPerSec=178.90532666237013, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:17:24,973] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=6, lr=[9.573765471841728e-06, 9.573765471841728e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:17:24,991] [INFO] [timer.py:199:stop] epoch=0/micro_step=240/global_step=240, RunningAvgSamplesPerSec=176.07988842213393, CurrSamplesPerSec=178.72843657243604, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:17:28,559] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=6, lr=[9.567129587599567e-06, 9.567129587599567e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:17:29,121] [INFO] [timer.py:199:stop] epoch=0/micro_step=250/global_step=250, RunningAvgSamplesPerSec=175.12587051715494, CurrSamplesPerSec=71.0182969659562, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:17:32,687] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=6, lr=[9.560219311660383e-06, 9.560219311660383e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:17:32,706] [INFO] [timer.py:199:stop] epoch=0/micro_step=260/global_step=260, RunningAvgSamplesPerSec=175.264274676631, CurrSamplesPerSec=178.78593212493382, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:17:36,270] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=6, lr=[9.553035043870342e-06, 9.553035043870342e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:17:36,289] [INFO] [timer.py:199:stop] epoch=0/micro_step=270/global_step=270, RunningAvgSamplesPerSec=175.39492145524426, CurrSamplesPerSec=178.7610485574477, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:17:39,880] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=6, lr=[9.545577199929482e-06, 9.545577199929482e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:17:40,051] [INFO] [timer.py:199:stop] epoch=0/micro_step=280/global_step=280, RunningAvgSamplesPerSec=175.20593388813313, CurrSamplesPerSec=124.95802588672869, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:17:43,616] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=6, lr=[9.537846211367644e-06, 9.537846211367644e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:17:43,635] [INFO] [timer.py:199:stop] epoch=0/micro_step=290/global_step=290, RunningAvgSamplesPerSec=175.3284788028387, CurrSamplesPerSec=178.68037344856828, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:17:47,202] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=6, lr=[9.529842525519525e-06, 9.529842525519525e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:17:47,220] [INFO] [timer.py:199:stop] epoch=0/micro_step=300/global_step=300, RunningAvgSamplesPerSec=175.44043628208146, CurrSamplesPerSec=178.76688188226518, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:17:50,784] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=6, lr=[9.521566605498769e-06, 9.521566605498769e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:17:50,803] [INFO] [timer.py:199:stop] epoch=0/micro_step=310/global_step=310, RunningAvgSamplesPerSec=175.5492533969797, CurrSamplesPerSec=178.95899886865385, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:17:54,367] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=6, lr=[9.513018930171194e-06, 9.513018930171194e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:17:54,385] [INFO] [timer.py:199:stop] epoch=0/micro_step=320/global_step=320, RunningAvgSamplesPerSec=175.65119384058016, CurrSamplesPerSec=178.63352489758543, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:17:57,952] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=6, lr=[9.504199994127064e-06, 9.504199994127064e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:17:57,970] [INFO] [timer.py:199:stop] epoch=0/micro_step=330/global_step=330, RunningAvgSamplesPerSec=175.74385785834335, CurrSamplesPerSec=178.94420597997342, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:18:01,535] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=6, lr=[9.495110307652484e-06, 9.495110307652484e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:18:01,661] [INFO] [timer.py:199:stop] epoch=0/micro_step=340/global_step=340, RunningAvgSamplesPerSec=175.6798375902244, CurrSamplesPerSec=137.64607888683437, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:18:05,224] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=6, lr=[9.485750396699863e-06, 9.485750396699863e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:18:05,243] [INFO] [timer.py:199:stop] epoch=0/micro_step=350/global_step=350, RunningAvgSamplesPerSec=175.7699612480979, CurrSamplesPerSec=178.89829202688185, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:18:08,817] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=6, lr=[9.476120802857493e-06, 9.476120802857493e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:18:08,835] [INFO] [timer.py:199:stop] epoch=0/micro_step=360/global_step=360, RunningAvgSamplesPerSec=175.84087519771325, CurrSamplesPerSec=178.72486663985703, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:18:12,446] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=6, lr=[9.466222083318202e-06, 9.466222083318202e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:18:12,465] [INFO] [timer.py:199:stop] epoch=0/micro_step=370/global_step=370, RunningAvgSamplesPerSec=175.86967098777444, CurrSamplesPerSec=178.5235155708606, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:18:16,031] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=6, lr=[9.456054810847115e-06, 9.456054810847115e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:18:16,050] [INFO] [timer.py:199:stop] epoch=0/micro_step=380/global_step=380, RunningAvgSamplesPerSec=175.94419993377838, CurrSamplesPerSec=178.71142117397363, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:18:19,617] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=6, lr=[9.445619573748516e-06, 9.445619573748516e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:18:19,636] [INFO] [timer.py:199:stop] epoch=0/micro_step=390/global_step=390, RunningAvgSamplesPerSec=176.0133110482754, CurrSamplesPerSec=178.70523455958374, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:18:23,202] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=6, lr=[9.434916975831804e-06, 9.434916975831804e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:18:23,220] [INFO] [timer.py:199:stop] epoch=0/micro_step=400/global_step=400, RunningAvgSamplesPerSec=176.08097753270664, CurrSamplesPerSec=178.43475162690527, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:18:26,785] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=6, lr=[9.423947636376555e-06, 9.423947636376555e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:18:26,804] [INFO] [timer.py:199:stop] epoch=0/micro_step=410/global_step=410, RunningAvgSamplesPerSec=176.14699182385613, CurrSamplesPerSec=178.7862893557617, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:18:30,372] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=6, lr=[9.412712190096692e-06, 9.412712190096692e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:18:30,390] [INFO] [timer.py:199:stop] epoch=0/micro_step=420/global_step=420, RunningAvgSamplesPerSec=176.20662248016495, CurrSamplesPerSec=177.8171626540542, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:18:33,956] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=6, lr=[9.401211287103756e-06, 9.401211287103756e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:18:34,209] [INFO] [timer.py:199:stop] epoch=0/micro_step=430/global_step=430, RunningAvgSamplesPerSec=175.99964533636668, CurrSamplesPerSec=107.99832632751196, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:18:37,775] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=6, lr=[9.389445592869288e-06, 9.389445592869288e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:18:37,793] [INFO] [timer.py:199:stop] epoch=0/micro_step=440/global_step=440, RunningAvgSamplesPerSec=176.0625157704159, CurrSamplesPerSec=178.81391953104185, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:18:41,359] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=6, lr=[9.377415788186326e-06, 9.377415788186326e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:18:41,377] [INFO] [timer.py:199:stop] epoch=0/micro_step=450/global_step=450, RunningAvgSamplesPerSec=176.12252679438848, CurrSamplesPerSec=178.18625571527951, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:18:44,971] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=6, lr=[9.36512256913001e-06, 9.36512256913001e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:18:44,976] [INFO] [timer.py:199:stop] epoch=0/micro_step=460/global_step=460, RunningAvgSamplesPerSec=176.16410267732272, CurrSamplesPerSec=172.87622241929054, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:18:49,280] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=6, lr=[9.352566647017312e-06, 9.352566647017312e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:18:49,298] [INFO] [timer.py:199:stop] epoch=0/micro_step=470/global_step=470, RunningAvgSamplesPerSec=176.2150388079026, CurrSamplesPerSec=177.77206062524587, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:18:52,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=6, lr=[9.339748748365863e-06, 9.339748748365863e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:18:52,881] [INFO] [timer.py:199:stop] epoch=0/micro_step=480/global_step=480, RunningAvgSamplesPerSec=176.26890284120165, CurrSamplesPerSec=178.79676876185604, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:18:56,447] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=6, lr=[9.32666961485193e-06, 9.32666961485193e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:18:56,466] [INFO] [timer.py:199:stop] epoch=0/micro_step=490/global_step=490, RunningAvgSamplesPerSec=176.31898358776255, CurrSamplesPerSec=178.7496211061835, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:19:00,031] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=6, lr=[9.313330003267494e-06, 9.313330003267494e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:19:00,050] [INFO] [timer.py:199:stop] epoch=0/micro_step=500/global_step=500, RunningAvgSamplesPerSec=176.36775252368622, CurrSamplesPerSec=178.95947609933205, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:19:03,246] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:19:03,576] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:19:03,577] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=8, lr=[9.302471287976445e-06, 9.302471287976445e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:19:03,577] [INFO] [timer.py:199:stop] epoch=0/micro_step=510/global_step=510, RunningAvgSamplesPerSec=176.46831239595994, CurrSamplesPerSec=194.13074757947172, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:19:07,143] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=8, lr=[9.288664771061984e-06, 9.288664771061984e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:19:07,162] [INFO] [timer.py:199:stop] epoch=0/micro_step=520/global_step=520, RunningAvgSamplesPerSec=176.512112079308, CurrSamplesPerSec=178.80093704893528, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:19:10,736] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=8, lr=[9.27459997513409e-06, 9.27459997513409e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:19:10,755] [INFO] [timer.py:199:stop] epoch=0/micro_step=530/global_step=530, RunningAvgSamplesPerSec=176.54604142944274, CurrSamplesPerSec=178.6055939282119, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:19:14,329] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=8, lr=[9.260277714017674e-06, 9.260277714017674e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:19:14,348] [INFO] [timer.py:199:stop] epoch=0/micro_step=540/global_step=540, RunningAvgSamplesPerSec=176.5789338082624, CurrSamplesPerSec=178.8626504290071, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:19:17,916] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=8, lr=[9.24569881643525e-06, 9.24569881643525e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:19:17,934] [INFO] [timer.py:199:stop] epoch=0/micro_step=550/global_step=550, RunningAvgSamplesPerSec=176.61660427609561, CurrSamplesPerSec=178.87993304216985, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:19:21,509] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=8, lr=[9.230864125958966e-06, 9.230864125958966e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:19:21,527] [INFO] [timer.py:199:stop] epoch=0/micro_step=560/global_step=560, RunningAvgSamplesPerSec=176.6535866607537, CurrSamplesPerSec=178.80188982755624, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:19:25,094] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=8, lr=[9.2157745009618e-06, 9.2157745009618e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:19:25,113] [INFO] [timer.py:199:stop] epoch=0/micro_step=570/global_step=570, RunningAvgSamplesPerSec=176.68929115450385, CurrSamplesPerSec=178.73938525501623, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:19:28,677] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=8, lr=[9.20043081456789e-06, 9.20043081456789e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:19:28,718] [INFO] [timer.py:199:stop] epoch=0/micro_step=580/global_step=580, RunningAvgSamplesPerSec=176.70698286954553, CurrSamplesPerSec=168.31845859718447, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:19:32,283] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=8, lr=[9.184833954602016e-06, 9.184833954602016e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:19:32,301] [INFO] [timer.py:199:stop] epoch=0/micro_step=590/global_step=590, RunningAvgSamplesPerSec=176.74256784347733, CurrSamplesPerSec=178.552489759864, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:19:35,866] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=8, lr=[9.168984823538221e-06, 9.168984823538221e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:19:35,885] [INFO] [timer.py:199:stop] epoch=0/micro_step=600/global_step=600, RunningAvgSamplesPerSec=176.77672868191746, CurrSamplesPerSec=178.74664544719047, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:19:39,450] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=8, lr=[9.1528843384476e-06, 9.1528843384476e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:19:39,468] [INFO] [timer.py:199:stop] epoch=0/micro_step=610/global_step=610, RunningAvgSamplesPerSec=176.80986188504218, CurrSamplesPerSec=178.62306461787432, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:19:39,797] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:19:40,127] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:19:42,979] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=10, lr=[9.139823601033817e-06, 9.139823601033817e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:19:42,998] [INFO] [timer.py:199:stop] epoch=0/micro_step=620/global_step=620, RunningAvgSamplesPerSec=176.88435580016656, CurrSamplesPerSec=178.80582014670264, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:19:46,564] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=10, lr=[9.123273036104072e-06, 9.123273036104072e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:19:46,582] [INFO] [timer.py:199:stop] epoch=0/micro_step=630/global_step=630, RunningAvgSamplesPerSec=176.9135593339478, CurrSamplesPerSec=178.74938304981814, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:19:50,165] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=10, lr=[9.106473762147958e-06, 9.106473762147958e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:19:50,183] [INFO] [timer.py:199:stop] epoch=0/micro_step=640/global_step=640, RunningAvgSamplesPerSec=176.92975812144076, CurrSamplesPerSec=170.7817615936847, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:19:53,750] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=10, lr=[9.089426751214259e-06, 9.089426751214259e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:19:53,768] [INFO] [timer.py:199:stop] epoch=0/micro_step=650/global_step=650, RunningAvgSamplesPerSec=176.95760803216433, CurrSamplesPerSec=178.90520742665123, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:19:57,334] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=10, lr=[9.072132989686448e-06, 9.072132989686448e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:19:57,353] [INFO] [timer.py:199:stop] epoch=0/micro_step=660/global_step=660, RunningAvgSamplesPerSec=176.9849503754496, CurrSamplesPerSec=178.58943361229257, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:20:00,920] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=10, lr=[9.054593478225617e-06, 9.054593478225617e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:20:00,938] [INFO] [timer.py:199:stop] epoch=0/micro_step=670/global_step=670, RunningAvgSamplesPerSec=177.01061599037078, CurrSamplesPerSec=178.72058290900569, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:20:04,503] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=10, lr=[9.036809231712576e-06, 9.036809231712576e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:20:04,522] [INFO] [timer.py:199:stop] epoch=0/micro_step=680/global_step=680, RunningAvgSamplesPerSec=177.03680424657273, CurrSamplesPerSec=178.7290315750646, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:20:08,087] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=10, lr=[9.018781279189126e-06, 9.018781279189126e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:20:08,105] [INFO] [timer.py:199:stop] epoch=0/micro_step=690/global_step=690, RunningAvgSamplesPerSec=177.06202609817402, CurrSamplesPerSec=178.94420597997342, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:20:11,670] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=10, lr=[9.000510663798525e-06, 9.000510663798525e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:20:11,689] [INFO] [timer.py:199:stop] epoch=0/micro_step=700/global_step=700, RunningAvgSamplesPerSec=177.08680227663214, CurrSamplesPerSec=178.57470175510957, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:20:15,257] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=10, lr=[8.981998442725115e-06, 8.981998442725115e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:20:15,275] [INFO] [timer.py:199:stop] epoch=0/micro_step=710/global_step=710, RunningAvgSamplesPerSec=177.10888762064332, CurrSamplesPerSec=178.93168165565933, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:20:16,325] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:20:16,655] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:20:18,788] [INFO] [logging.py:96:log_dist] [Rank 0] step=720, skipped=12, lr=[8.967015429129005e-06, 8.967015429129005e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:20:18,806] [INFO] [timer.py:199:stop] epoch=0/micro_step=720/global_step=720, RunningAvgSamplesPerSec=177.16786345201925, CurrSamplesPerSec=178.76461993561594, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:20:22,371] [INFO] [logging.py:96:log_dist] [Rank 0] step=730, skipped=12, lr=[8.948071026515447e-06, 8.948071026515447e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:20:22,390] [INFO] [timer.py:199:stop] epoch=0/micro_step=730/global_step=730, RunningAvgSamplesPerSec=177.19013577847855, CurrSamplesPerSec=178.51627349283734, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:20:25,959] [INFO] [logging.py:96:log_dist] [Rank 0] step=740, skipped=12, lr=[8.928888052510068e-06, 8.928888052510068e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:20:25,978] [INFO] [timer.py:199:stop] epoch=0/micro_step=740/global_step=740, RunningAvgSamplesPerSec=177.20932937887963, CurrSamplesPerSec=178.84668187950763, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:20:29,545] [INFO] [logging.py:96:log_dist] [Rank 0] step=750, skipped=12, lr=[8.909467617088604e-06, 8.909467617088604e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:20:29,564] [INFO] [timer.py:199:stop] epoch=0/micro_step=750/global_step=750, RunningAvgSamplesPerSec=177.2288675801064, CurrSamplesPerSec=178.49882368587294, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:20:33,131] [INFO] [logging.py:96:log_dist] [Rank 0] step=760, skipped=12, lr=[8.889810843966922e-06, 8.889810843966922e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:20:33,334] [INFO] [timer.py:199:stop] epoch=0/micro_step=760/global_step=760, RunningAvgSamplesPerSec=177.1287058061632, CurrSamplesPerSec=118.01809254491481, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:20:36,898] [INFO] [logging.py:96:log_dist] [Rank 0] step=770, skipped=12, lr=[8.869918870535976e-06, 8.869918870535976e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:20:36,916] [INFO] [timer.py:199:stop] epoch=0/micro_step=770/global_step=770, RunningAvgSamplesPerSec=177.1510939526829, CurrSamplesPerSec=178.72272474876263, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:20:40,481] [INFO] [logging.py:96:log_dist] [Rank 0] step=780, skipped=12, lr=[8.849792847796023e-06, 8.849792847796023e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:20:40,499] [INFO] [timer.py:199:stop] epoch=0/micro_step=780/global_step=780, RunningAvgSamplesPerSec=177.17231214461074, CurrSamplesPerSec=178.93084676303926, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:20:44,065] [INFO] [logging.py:96:log_dist] [Rank 0] step=790, skipped=12, lr=[8.829433940290002e-06, 8.829433940290002e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:20:44,084] [INFO] [timer.py:199:stop] epoch=0/micro_step=790/global_step=790, RunningAvgSamplesPerSec=177.19218878454006, CurrSamplesPerSec=178.66491308873762, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:20:47,650] [INFO] [logging.py:96:log_dist] [Rank 0] step=800, skipped=12, lr=[8.80884332603616e-06, 8.80884332603616e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:20:47,734] [INFO] [timer.py:199:stop] epoch=0/micro_step=800/global_step=800, RunningAvgSamplesPerSec=177.17148982488303, CurrSamplesPerSec=151.27010431975765, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:20:51,298] [INFO] [logging.py:96:log_dist] [Rank 0] step=810, skipped=12, lr=[8.788022196459883e-06, 8.788022196459883e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:20:51,316] [INFO] [timer.py:199:stop] epoch=0/micro_step=810/global_step=810, RunningAvgSamplesPerSec=177.19195777413094, CurrSamplesPerSec=178.6917920472685, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:20:53,080] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:20:53,410] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:20:54,826] [INFO] [logging.py:96:log_dist] [Rank 0] step=820, skipped=14, lr=[8.771200130921456e-06, 8.771200130921456e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:20:54,845] [INFO] [timer.py:199:stop] epoch=0/micro_step=820/global_step=820, RunningAvgSamplesPerSec=177.24458415963548, CurrSamplesPerSec=178.5890771677012, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:20:58,417] [INFO] [logging.py:96:log_dist] [Rank 0] step=830, skipped=14, lr=[8.749967118687843e-06, 8.749967118687843e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:20:58,482] [INFO] [timer.py:199:stop] epoch=0/micro_step=830/global_step=830, RunningAvgSamplesPerSec=177.23115701762717, CurrSamplesPerSec=157.64661636031235, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:21:02,047] [INFO] [logging.py:96:log_dist] [Rank 0] step=840, skipped=14, lr=[8.728506997859123e-06, 8.728506997859123e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:21:02,065] [INFO] [timer.py:199:stop] epoch=0/micro_step=840/global_step=840, RunningAvgSamplesPerSec=177.25004819026225, CurrSamplesPerSec=178.87921783331512, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:21:05,637] [INFO] [logging.py:96:log_dist] [Rank 0] step=850, skipped=14, lr=[8.706821010172547e-06, 8.706821010172547e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:21:05,655] [INFO] [timer.py:199:stop] epoch=0/micro_step=850/global_step=850, RunningAvgSamplesPerSec=177.26493160277076, CurrSamplesPerSec=178.66301046012293, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:21:09,220] [INFO] [logging.py:96:log_dist] [Rank 0] step=860, skipped=14, lr=[8.684910410434607e-06, 8.684910410434607e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:21:09,238] [INFO] [timer.py:199:stop] epoch=0/micro_step=860/global_step=860, RunningAvgSamplesPerSec=177.2830967271538, CurrSamplesPerSec=178.82976309518497, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:21:12,804] [INFO] [logging.py:96:log_dist] [Rank 0] step=870, skipped=14, lr=[8.662776466448409e-06, 8.662776466448409e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:21:13,441] [INFO] [timer.py:199:stop] epoch=0/micro_step=870/global_step=870, RunningAvgSamplesPerSec=176.95039289595368, CurrSamplesPerSec=65.47383502333844, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:21:17,008] [INFO] [logging.py:96:log_dist] [Rank 0] step=880, skipped=14, lr=[8.640420458940333e-06, 8.640420458940333e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:21:17,026] [INFO] [timer.py:199:stop] epoch=0/micro_step=880/global_step=880, RunningAvgSamplesPerSec=176.97061924586163, CurrSamplesPerSec=178.71499056943566, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:21:20,604] [INFO] [logging.py:96:log_dist] [Rank 0] step=890, skipped=14, lr=[8.61784368148592e-06, 8.61784368148592e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:21:20,622] [INFO] [timer.py:199:stop] epoch=0/micro_step=890/global_step=890, RunningAvgSamplesPerSec=176.98429888790267, CurrSamplesPerSec=178.81761214093817, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:21:24,189] [INFO] [logging.py:96:log_dist] [Rank 0] step=900, skipped=14, lr=[8.595047440435015e-06, 8.595047440435015e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:21:24,207] [INFO] [timer.py:199:stop] epoch=0/micro_step=900/global_step=900, RunningAvgSamplesPerSec=177.00358641934514, CurrSamplesPerSec=178.38115629414915, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:21:27,773] [INFO] [logging.py:96:log_dist] [Rank 0] step=910, skipped=14, lr=[8.5720330548362e-06, 8.5720330548362e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:21:28,170] [INFO] [timer.py:199:stop] epoch=0/micro_step=910/global_step=910, RunningAvgSamplesPerSec=176.81874237945163, CurrSamplesPerSec=86.87026358276972, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:21:30,661] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:21:30,991] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:21:31,692] [INFO] [logging.py:96:log_dist] [Rank 0] step=920, skipped=16, lr=[8.553465376743393e-06, 8.553465376743393e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:21:31,710] [INFO] [timer.py:199:stop] epoch=0/micro_step=920/global_step=920, RunningAvgSamplesPerSec=176.8634759769924, CurrSamplesPerSec=178.69643126987518, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:21:35,276] [INFO] [logging.py:96:log_dist] [Rank 0] step=930, skipped=16, lr=[8.53006169520226e-06, 8.53006169520226e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:21:35,294] [INFO] [timer.py:199:stop] epoch=0/micro_step=930/global_step=930, RunningAvgSamplesPerSec=176.88404984017404, CurrSamplesPerSec=178.86110111646823, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:21:38,860] [INFO] [logging.py:96:log_dist] [Rank 0] step=940, skipped=16, lr=[8.506443629353965e-06, 8.506443629353965e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:21:39,672] [INFO] [timer.py:199:stop] epoch=0/micro_step=940/global_step=940, RunningAvgSamplesPerSec=176.49135742721543, CurrSamplesPerSec=55.57821787262095, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:21:43,236] [INFO] [logging.py:96:log_dist] [Rank 0] step=950, skipped=16, lr=[8.482612545799954e-06, 8.482612545799954e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:21:43,255] [INFO] [timer.py:199:stop] epoch=0/micro_step=950/global_step=950, RunningAvgSamplesPerSec=176.5157562678972, CurrSamplesPerSec=178.76628662759722, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:21:46,822] [INFO] [logging.py:96:log_dist] [Rank 0] step=960, skipped=16, lr=[8.458569823467424e-06, 8.458569823467424e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:21:46,840] [INFO] [timer.py:199:stop] epoch=0/micro_step=960/global_step=960, RunningAvgSamplesPerSec=176.53819457251984, CurrSamplesPerSec=178.78212175159362, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:21:50,407] [INFO] [logging.py:96:log_dist] [Rank 0] step=970, skipped=16, lr=[8.434316853529531e-06, 8.434316853529531e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:21:50,425] [INFO] [timer.py:199:stop] epoch=0/micro_step=970/global_step=970, RunningAvgSamplesPerSec=176.5606108951584, CurrSamplesPerSec=178.83536262008502, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:21:53,990] [INFO] [logging.py:96:log_dist] [Rank 0] step=980, skipped=16, lr=[8.409855039324893e-06, 8.409855039324893e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:21:54,302] [INFO] [timer.py:199:stop] epoch=0/micro_step=980/global_step=980, RunningAvgSamplesPerSec=176.43734210869138, CurrSamplesPerSec=98.3179219713729, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:21:57,870] [INFO] [logging.py:96:log_dist] [Rank 0] step=990, skipped=16, lr=[8.385185796276388e-06, 8.385185796276388e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:21:57,888] [INFO] [timer.py:199:stop] epoch=0/micro_step=990/global_step=990, RunningAvgSamplesPerSec=176.45963943156698, CurrSamplesPerSec=178.5983451895261, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:22:01,456] [INFO] [logging.py:96:log_dist] [Rank 0] step=1000, skipped=16, lr=[8.360310551809257e-06, 8.360310551809257e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:22:01,475] [INFO] [timer.py:199:stop] epoch=0/micro_step=1000/global_step=1000, RunningAvgSamplesPerSec=176.48120845473144, CurrSamplesPerSec=178.57185070863838, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:22:05,074] [INFO] [logging.py:96:log_dist] [Rank 0] step=1010, skipped=16, lr=[8.335230745268505e-06, 8.335230745268505e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:22:06,417] [INFO] [timer.py:199:stop] epoch=0/micro_step=1010/global_step=1010, RunningAvgSamplesPerSec=175.850523599254, CurrSamplesPerSec=38.045886860184666, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:22:09,615] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:22:09,945] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:22:09,945] [INFO] [logging.py:96:log_dist] [Rank 0] step=1020, skipped=18, lr=[8.315020590141294e-06, 8.315020590141294e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:22:09,946] [INFO] [timer.py:199:stop] epoch=0/micro_step=1020/global_step=1020, RunningAvgSamplesPerSec=175.90511823819023, CurrSamplesPerSec=194.32004279688348, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:22:13,517] [INFO] [logging.py:96:log_dist] [Rank 0] step=1030, skipped=18, lr=[8.289576236745907e-06, 8.289576236745907e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:22:13,535] [INFO] [timer.py:199:stop] epoch=0/micro_step=1030/global_step=1030, RunningAvgSamplesPerSec=175.93019839361943, CurrSamplesPerSec=178.57101917056602, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:22:17,102] [INFO] [logging.py:96:log_dist] [Rank 0] step=1040, skipped=18, lr=[8.263931414144542e-06, 8.263931414144542e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:22:17,120] [INFO] [timer.py:199:stop] epoch=0/micro_step=1040/global_step=1040, RunningAvgSamplesPerSec=175.9565444149551, CurrSamplesPerSec=178.83643490953764, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:22:20,689] [INFO] [logging.py:96:log_dist] [Rank 0] step=1050, skipped=18, lr=[8.23808760621196e-06, 8.23808760621196e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:22:20,707] [INFO] [timer.py:199:stop] epoch=0/micro_step=1050/global_step=1050, RunningAvgSamplesPerSec=175.9816878988253, CurrSamplesPerSec=178.86801364121578, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:22:24,292] [INFO] [logging.py:96:log_dist] [Rank 0] step=1060, skipped=18, lr=[8.212046308336714e-06, 8.212046308336714e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:22:24,314] [INFO] [timer.py:199:stop] epoch=0/micro_step=1060/global_step=1060, RunningAvgSamplesPerSec=175.99914816834197, CurrSamplesPerSec=173.468990051407, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:22:27,885] [INFO] [logging.py:96:log_dist] [Rank 0] step=1070, skipped=18, lr=[8.18580902733463e-06, 8.18580902733463e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:22:28,521] [INFO] [timer.py:199:stop] epoch=0/micro_step=1070/global_step=1070, RunningAvgSamplesPerSec=175.74251931623095, CurrSamplesPerSec=65.55826356069652, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:22:32,091] [INFO] [logging.py:96:log_dist] [Rank 0] step=1080, skipped=18, lr=[8.159377281361623e-06, 8.159377281361623e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:22:32,110] [INFO] [timer.py:199:stop] epoch=0/micro_step=1080/global_step=1080, RunningAvgSamplesPerSec=175.7683675804192, CurrSamplesPerSec=178.76378660127955, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:22:35,676] [INFO] [logging.py:96:log_dist] [Rank 0] step=1090, skipped=18, lr=[8.132752599825838e-06, 8.132752599825838e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:22:35,695] [INFO] [timer.py:199:stop] epoch=0/micro_step=1090/global_step=1090, RunningAvgSamplesPerSec=175.79503150136904, CurrSamplesPerSec=178.8808866628727, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:22:39,284] [INFO] [logging.py:96:log_dist] [Rank 0] step=1100, skipped=18, lr=[8.105936523299164e-06, 8.105936523299164e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:22:39,676] [INFO] [timer.py:199:stop] epoch=0/micro_step=1100/global_step=1100, RunningAvgSamplesPerSec=175.64742204710205, CurrSamplesPerSec=87.48952757658316, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:22:43,241] [INFO] [logging.py:96:log_dist] [Rank 0] step=1110, skipped=18, lr=[8.078930603428098e-06, 8.078930603428098e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:22:43,259] [INFO] [timer.py:199:stop] epoch=0/micro_step=1110/global_step=1110, RunningAvgSamplesPerSec=175.6754706265506, CurrSamplesPerSec=178.90198812231708, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:22:46,826] [INFO] [logging.py:96:log_dist] [Rank 0] step=1120, skipped=18, lr=[8.051736402843955e-06, 8.051736402843955e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:22:46,844] [INFO] [timer.py:199:stop] epoch=0/micro_step=1120/global_step=1120, RunningAvgSamplesPerSec=175.70232882911398, CurrSamplesPerSec=178.7610485574477, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:22:47,173] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:22:47,503] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:22:50,352] [INFO] [logging.py:96:log_dist] [Rank 0] step=1130, skipped=20, lr=[8.029846537308772e-06, 8.029846537308772e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:22:50,370] [INFO] [timer.py:199:stop] epoch=0/micro_step=1130/global_step=1130, RunningAvgSamplesPerSec=175.75372135882958, CurrSamplesPerSec=178.71974998518633, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:22:53,937] [INFO] [logging.py:96:log_dist] [Rank 0] step=1140, skipped=20, lr=[8.002317403988813e-06, 8.002317403988813e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:22:54,183] [INFO] [timer.py:199:stop] epoch=0/micro_step=1140/global_step=1140, RunningAvgSamplesPerSec=175.68308387616537, CurrSamplesPerSec=109.34764495712827, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:22:57,749] [INFO] [logging.py:96:log_dist] [Rank 0] step=1150, skipped=20, lr=[7.974604422990641e-06, 7.974604422990641e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:22:57,768] [INFO] [timer.py:199:stop] epoch=0/micro_step=1150/global_step=1150, RunningAvgSamplesPerSec=175.7091736275364, CurrSamplesPerSec=178.7999842804684, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:23:01,335] [INFO] [logging.py:96:log_dist] [Rank 0] step=1160, skipped=20, lr=[7.946709197857927e-06, 7.946709197857927e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:23:01,353] [INFO] [timer.py:199:stop] epoch=0/micro_step=1160/global_step=1160, RunningAvgSamplesPerSec=175.73464020802857, CurrSamplesPerSec=178.70000006657094, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:23:04,923] [INFO] [logging.py:96:log_dist] [Rank 0] step=1170, skipped=20, lr=[7.918633342679455e-06, 7.918633342679455e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:23:04,942] [INFO] [timer.py:199:stop] epoch=0/micro_step=1170/global_step=1170, RunningAvgSamplesPerSec=175.75838485111476, CurrSamplesPerSec=177.91297731577768, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:23:08,508] [INFO] [logging.py:96:log_dist] [Rank 0] step=1180, skipped=20, lr=[7.890378481995714e-06, 7.890378481995714e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:23:08,526] [INFO] [timer.py:199:stop] epoch=0/micro_step=1180/global_step=1180, RunningAvgSamplesPerSec=175.78333066387148, CurrSamplesPerSec=178.63661566923318, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:23:12,104] [INFO] [logging.py:96:log_dist] [Rank 0] step=1190, skipped=20, lr=[7.86194625070492e-06, 7.86194625070492e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:23:12,123] [INFO] [timer.py:199:stop] epoch=0/micro_step=1190/global_step=1190, RunningAvgSamplesPerSec=175.80343741363808, CurrSamplesPerSec=178.8017707296733, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:23:15,689] [INFO] [logging.py:96:log_dist] [Rank 0] step=1200, skipped=20, lr=[7.83333829396839e-06, 7.83333829396839e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:23:15,708] [INFO] [timer.py:199:stop] epoch=0/micro_step=1200/global_step=1200, RunningAvgSamplesPerSec=175.82742671326542, CurrSamplesPerSec=178.64648307907524, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:23:19,273] [INFO] [logging.py:96:log_dist] [Rank 0] step=1210, skipped=20, lr=[7.804556267115377e-06, 7.804556267115377e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:23:19,292] [INFO] [timer.py:199:stop] epoch=0/micro_step=1210/global_step=1210, RunningAvgSamplesPerSec=175.85137826220165, CurrSamplesPerSec=178.77950221413545, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:23:22,858] [INFO] [logging.py:96:log_dist] [Rank 0] step=1220, skipped=20, lr=[7.775601835547265e-06, 7.775601835547265e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:23:22,876] [INFO] [timer.py:199:stop] epoch=0/micro_step=1220/global_step=1220, RunningAvgSamplesPerSec=175.87484477767129, CurrSamplesPerSec=178.807964029975, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:23:23,922] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:23:24,252] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:23:26,383] [INFO] [logging.py:96:log_dist] [Rank 0] step=1230, skipped=22, lr=[7.752315284418645e-06, 7.752315284418645e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:23:26,437] [INFO] [timer.py:199:stop] epoch=0/micro_step=1230/global_step=1230, RunningAvgSamplesPerSec=175.90728000050507, CurrSamplesPerSec=162.7911575566601, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:23:30,013] [INFO] [logging.py:96:log_dist] [Rank 0] step=1240, skipped=22, lr=[7.723054752955656e-06, 7.723054752955656e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:23:30,032] [INFO] [timer.py:199:stop] epoch=0/micro_step=1240/global_step=1240, RunningAvgSamplesPerSec=175.92601105368112, CurrSamplesPerSec=178.79176707389038, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:23:33,598] [INFO] [logging.py:96:log_dist] [Rank 0] step=1250, skipped=22, lr=[7.693626532662776e-06, 7.693626532662776e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:23:33,617] [INFO] [timer.py:199:stop] epoch=0/micro_step=1250/global_step=1250, RunningAvgSamplesPerSec=175.94813989375533, CurrSamplesPerSec=178.71463362347325, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:23:37,180] [INFO] [logging.py:96:log_dist] [Rank 0] step=1260, skipped=22, lr=[7.664032326331793e-06, 7.664032326331793e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:23:37,198] [INFO] [timer.py:199:stop] epoch=0/micro_step=1260/global_step=1260, RunningAvgSamplesPerSec=175.9710919658821, CurrSamplesPerSec=178.94897761233332, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:23:40,762] [INFO] [logging.py:96:log_dist] [Rank 0] step=1270, skipped=22, lr=[7.634273846358865e-06, 7.634273846358865e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:23:41,340] [INFO] [timer.py:199:stop] epoch=0/micro_step=1270/global_step=1270, RunningAvgSamplesPerSec=175.7804087967915, CurrSamplesPerSec=69.74412845774373, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:23:44,929] [INFO] [logging.py:96:log_dist] [Rank 0] step=1280, skipped=22, lr=[7.604352814645445e-06, 7.604352814645445e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:23:44,948] [INFO] [timer.py:199:stop] epoch=0/micro_step=1280/global_step=1280, RunningAvgSamplesPerSec=175.79536151795492, CurrSamplesPerSec=178.78176453742, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:23:48,513] [INFO] [logging.py:96:log_dist] [Rank 0] step=1290, skipped=22, lr=[7.5742709624986415e-06, 7.5742709624986415e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:23:48,531] [INFO] [timer.py:199:stop] epoch=0/micro_step=1290/global_step=1290, RunningAvgSamplesPerSec=175.81815932273898, CurrSamplesPerSec=178.81463421782928, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:23:52,105] [INFO] [logging.py:96:log_dist] [Rank 0] step=1300, skipped=22, lr=[7.544030030531045e-06, 7.544030030531045e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:23:53,121] [INFO] [timer.py:199:stop] epoch=0/micro_step=1300/global_step=1300, RunningAvgSamplesPerSec=175.46707076842276, CurrSamplesPerSec=47.216827655837605, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:23:56,688] [INFO] [logging.py:96:log_dist] [Rank 0] step=1310, skipped=22, lr=[7.513631768560006e-06, 7.513631768560006e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:23:56,706] [INFO] [timer.py:199:stop] epoch=0/micro_step=1310/global_step=1310, RunningAvgSamplesPerSec=175.4917486128255, CurrSamplesPerSec=178.80272351717917, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:24:00,269] [INFO] [logging.py:96:log_dist] [Rank 0] step=1320, skipped=22, lr=[7.483077935506395e-06, 7.483077935506395e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:24:00,287] [INFO] [timer.py:199:stop] epoch=0/micro_step=1320/global_step=1320, RunningAvgSamplesPerSec=175.51708961952028, CurrSamplesPerSec=178.88279393478172, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:24:02,050] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:24:02,380] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:24:03,796] [INFO] [logging.py:96:log_dist] [Rank 0] step=1330, skipped=24, lr=[7.458524045628197e-06, 7.458524045628197e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:24:03,814] [INFO] [timer.py:199:stop] epoch=0/micro_step=1330/global_step=1330, RunningAvgSamplesPerSec=175.56201871334628, CurrSamplesPerSec=178.82511694710433, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:24:07,380] [INFO] [logging.py:96:log_dist] [Rank 0] step=1340, skipped=24, lr=[7.427694645774799e-06, 7.427694645774799e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:24:07,519] [INFO] [timer.py:199:stop] epoch=0/micro_step=1340/global_step=1340, RunningAvgSamplesPerSec=175.54205735012516, CurrSamplesPerSec=133.65324352596807, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:24:11,084] [INFO] [logging.py:96:log_dist] [Rank 0] step=1350, skipped=24, lr=[7.3967146473796505e-06, 7.3967146473796505e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:24:11,102] [INFO] [timer.py:199:stop] epoch=0/micro_step=1350/global_step=1350, RunningAvgSamplesPerSec=175.56613495971234, CurrSamplesPerSec=178.86968226172294, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:24:14,677] [INFO] [logging.py:96:log_dist] [Rank 0] step=1360, skipped=24, lr=[7.365585843024369e-06, 7.365585843024369e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:24:14,694] [INFO] [timer.py:199:stop] epoch=0/micro_step=1360/global_step=1360, RunningAvgSamplesPerSec=175.58664619401273, CurrSamplesPerSec=174.67976201399205, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:24:18,261] [INFO] [logging.py:96:log_dist] [Rank 0] step=1370, skipped=24, lr=[7.334310033900866e-06, 7.334310033900866e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:24:18,280] [INFO] [timer.py:199:stop] epoch=0/micro_step=1370/global_step=1370, RunningAvgSamplesPerSec=175.60935688190173, CurrSamplesPerSec=178.5796913055019, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:24:21,843] [INFO] [logging.py:96:log_dist] [Rank 0] step=1380, skipped=24, lr=[7.3028890297071225e-06, 7.3028890297071225e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:24:21,862] [INFO] [timer.py:199:stop] epoch=0/micro_step=1380/global_step=1380, RunningAvgSamplesPerSec=175.6327009455547, CurrSamplesPerSec=178.87528428684234, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:24:25,428] [INFO] [logging.py:96:log_dist] [Rank 0] step=1390, skipped=24, lr=[7.271324648542479e-06, 7.271324648542479e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:24:25,447] [INFO] [timer.py:199:stop] epoch=0/micro_step=1390/global_step=1390, RunningAvgSamplesPerSec=175.65456528485083, CurrSamplesPerSec=178.85800257191022, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:24:29,011] [INFO] [logging.py:96:log_dist] [Rank 0] step=1400, skipped=24, lr=[7.239618716802426e-06, 7.239618716802426e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:24:29,030] [INFO] [timer.py:199:stop] epoch=0/micro_step=1400/global_step=1400, RunningAvgSamplesPerSec=175.6769881398642, CurrSamplesPerSec=178.86324632559206, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:24:33,140] [INFO] [logging.py:96:log_dist] [Rank 0] step=1410, skipped=24, lr=[7.207773069072936e-06, 7.207773069072936e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:24:33,158] [INFO] [timer.py:199:stop] epoch=0/micro_step=1410/global_step=1410, RunningAvgSamplesPerSec=175.69582774898308, CurrSamplesPerSec=178.68572573500094, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:24:36,722] [INFO] [logging.py:96:log_dist] [Rank 0] step=1420, skipped=24, lr=[7.175789548024305e-06, 7.175789548024305e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:24:36,741] [INFO] [timer.py:199:stop] epoch=0/micro_step=1420/global_step=1420, RunningAvgSamplesPerSec=175.71768714266057, CurrSamplesPerSec=178.86503403917052, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:24:39,219] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:24:39,549] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:24:40,248] [INFO] [logging.py:96:log_dist] [Rank 0] step=1430, skipped=26, lr=[7.150104705765753e-06, 7.150104705765753e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:24:40,266] [INFO] [timer.py:199:stop] epoch=0/micro_step=1430/global_step=1430, RunningAvgSamplesPerSec=175.75838452517974, CurrSamplesPerSec=178.93907675899078, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:24:43,830] [INFO] [logging.py:96:log_dist] [Rank 0] step=1440, skipped=26, lr=[7.117877681668282e-06, 7.117877681668282e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:24:43,849] [INFO] [timer.py:199:stop] epoch=0/micro_step=1440/global_step=1440, RunningAvgSamplesPerSec=175.7795974872835, CurrSamplesPerSec=178.65575706026306, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:24:47,413] [INFO] [logging.py:96:log_dist] [Rank 0] step=1450, skipped=26, lr=[7.085517985827763e-06, 7.085517985827763e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:24:47,480] [INFO] [timer.py:199:stop] epoch=0/micro_step=1450/global_step=1450, RunningAvgSamplesPerSec=175.7844230307227, CurrSamplesPerSec=157.712470094991, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:24:51,064] [INFO] [logging.py:96:log_dist] [Rank 0] step=1460, skipped=26, lr=[7.053027490658626e-06, 7.053027490658626e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:24:51,082] [INFO] [timer.py:199:stop] epoch=0/micro_step=1460/global_step=1460, RunningAvgSamplesPerSec=175.79967979842687, CurrSamplesPerSec=178.92011312336325, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:24:54,647] [INFO] [logging.py:96:log_dist] [Rank 0] step=1470, skipped=26, lr=[7.020408076143678e-06, 7.020408076143678e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:24:54,665] [INFO] [timer.py:199:stop] epoch=0/micro_step=1470/global_step=1470, RunningAvgSamplesPerSec=175.8200781790564, CurrSamplesPerSec=178.86956307351142, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:24:58,229] [INFO] [logging.py:96:log_dist] [Rank 0] step=1480, skipped=26, lr=[6.9876616297253334e-06, 6.9876616297253334e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:24:58,830] [INFO] [timer.py:199:stop] epoch=0/micro_step=1480/global_step=1480, RunningAvgSamplesPerSec=175.65020914397152, CurrSamplesPerSec=68.06405797358683, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:25:02,394] [INFO] [logging.py:96:log_dist] [Rank 0] step=1490, skipped=26, lr=[6.954790046196393e-06, 6.954790046196393e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:25:02,412] [INFO] [timer.py:199:stop] epoch=0/micro_step=1490/global_step=1490, RunningAvgSamplesPerSec=175.6715150828845, CurrSamplesPerSec=179.01580719529818, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:25:05,978] [INFO] [logging.py:96:log_dist] [Rank 0] step=1500, skipped=26, lr=[6.921795227590407e-06, 6.921795227590407e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:25:05,995] [INFO] [timer.py:199:stop] epoch=0/micro_step=1500/global_step=1500, RunningAvgSamplesPerSec=175.69229084349791, CurrSamplesPerSec=178.75735828325037, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:25:09,558] [INFO] [logging.py:96:log_dist] [Rank 0] step=1510, skipped=26, lr=[6.888679083071628e-06, 6.888679083071628e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:25:09,577] [INFO] [timer.py:199:stop] epoch=0/micro_step=1510/global_step=1510, RunningAvgSamplesPerSec=175.7131522246912, CurrSamplesPerSec=178.94563744296195, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:25:13,143] [INFO] [logging.py:96:log_dist] [Rank 0] step=1520, skipped=26, lr=[6.855443528824528e-06, 6.855443528824528e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:25:13,161] [INFO] [timer.py:199:stop] epoch=0/micro_step=1520/global_step=1520, RunningAvgSamplesPerSec=175.73289418522768, CurrSamplesPerSec=178.4576461726755, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:25:16,365] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:25:16,695] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:25:16,695] [INFO] [logging.py:96:log_dist] [Rank 0] step=1530, skipped=28, lr=[6.828770402512193e-06, 6.828770402512193e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:25:16,696] [INFO] [timer.py:199:stop] epoch=0/micro_step=1530/global_step=1530, RunningAvgSamplesPerSec=175.7682110067167, CurrSamplesPerSec=194.31174373926865, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:25:20,260] [INFO] [logging.py:96:log_dist] [Rank 0] step=1540, skipped=28, lr=[6.795324761522614e-06, 6.795324761522614e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:25:20,338] [INFO] [timer.py:199:stop] epoch=0/micro_step=1540/global_step=1540, RunningAvgSamplesPerSec=175.76901609832007, CurrSamplesPerSec=153.30163538218693, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:25:23,932] [INFO] [logging.py:96:log_dist] [Rank 0] step=1550, skipped=28, lr=[6.761765112523549e-06, 6.761765112523549e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:25:23,951] [INFO] [timer.py:199:stop] epoch=0/micro_step=1550/global_step=1550, RunningAvgSamplesPerSec=175.7809835507248, CurrSamplesPerSec=178.71986897382806, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:25:27,514] [INFO] [logging.py:96:log_dist] [Rank 0] step=1560, skipped=28, lr=[6.728093397361774e-06, 6.728093397361774e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:25:27,533] [INFO] [timer.py:199:stop] epoch=0/micro_step=1560/global_step=1560, RunningAvgSamplesPerSec=175.80055933600613, CurrSamplesPerSec=178.89376153759005, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:25:31,097] [INFO] [logging.py:96:log_dist] [Rank 0] step=1570, skipped=28, lr=[6.694311564368495e-06, 6.694311564368495e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:25:31,202] [INFO] [timer.py:199:stop] epoch=0/micro_step=1570/global_step=1570, RunningAvgSamplesPerSec=175.79296083681945, CurrSamplesPerSec=144.04536722424106, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:25:34,774] [INFO] [logging.py:96:log_dist] [Rank 0] step=1580, skipped=28, lr=[6.660421568246617e-06, 6.660421568246617e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:25:34,792] [INFO] [timer.py:199:stop] epoch=0/micro_step=1580/global_step=1580, RunningAvgSamplesPerSec=175.80981411357013, CurrSamplesPerSec=178.86098193969235, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:25:38,364] [INFO] [logging.py:96:log_dist] [Rank 0] step=1590, skipped=28, lr=[6.626425369957642e-06, 6.626425369957642e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:25:38,382] [INFO] [timer.py:199:stop] epoch=0/micro_step=1590/global_step=1590, RunningAvgSamplesPerSec=175.82629120965925, CurrSamplesPerSec=179.0890189699872, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:25:41,948] [INFO] [logging.py:96:log_dist] [Rank 0] step=1600, skipped=28, lr=[6.592324936608196e-06, 6.592324936608196e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:25:41,966] [INFO] [timer.py:199:stop] epoch=0/micro_step=1600/global_step=1600, RunningAvgSamplesPerSec=175.84449990978723, CurrSamplesPerSec=178.92750727046948, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:25:45,531] [INFO] [logging.py:96:log_dist] [Rank 0] step=1610, skipped=28, lr=[6.558122241336213e-06, 6.558122241336213e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:25:45,549] [INFO] [timer.py:199:stop] epoch=0/micro_step=1610/global_step=1610, RunningAvgSamplesPerSec=175.86277910210742, CurrSamplesPerSec=178.84930338602828, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:25:49,117] [INFO] [logging.py:96:log_dist] [Rank 0] step=1620, skipped=28, lr=[6.5238192631967634e-06, 6.5238192631967634e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:25:49,135] [INFO] [timer.py:199:stop] epoch=0/micro_step=1620/global_step=1620, RunningAvgSamplesPerSec=175.88000177776544, CurrSamplesPerSec=178.6326927850309, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:25:52,702] [INFO] [logging.py:96:log_dist] [Rank 0] step=1630, skipped=28, lr=[6.489417987047536e-06, 6.489417987047536e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:25:53,610] [INFO] [timer.py:199:stop] epoch=0/micro_step=1630/global_step=1630, RunningAvgSamplesPerSec=175.63349783197427, CurrSamplesPerSec=51.322381671076165, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:25:53,940] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:25:54,270] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:25:57,122] [INFO] [logging.py:96:log_dist] [Rank 0] step=1640, skipped=30, lr=[6.461827529019496e-06, 6.461827529019496e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:25:57,141] [INFO] [timer.py:199:stop] epoch=0/micro_step=1640/global_step=1640, RunningAvgSamplesPerSec=175.66817066106825, CurrSamplesPerSec=178.60155357505232, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:26:00,706] [INFO] [logging.py:96:log_dist] [Rank 0] step=1650, skipped=30, lr=[6.427254336375904e-06, 6.427254336375904e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:26:00,725] [INFO] [timer.py:199:stop] epoch=0/micro_step=1650/global_step=1650, RunningAvgSamplesPerSec=175.6867269856703, CurrSamplesPerSec=178.66312937322417, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:26:04,292] [INFO] [logging.py:96:log_dist] [Rank 0] step=1660, skipped=30, lr=[6.392588433215169e-06, 6.392588433215169e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:26:04,311] [INFO] [timer.py:199:stop] epoch=0/micro_step=1660/global_step=1660, RunningAvgSamplesPerSec=175.70446587659265, CurrSamplesPerSec=178.780811973269, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:26:07,878] [INFO] [logging.py:96:log_dist] [Rank 0] step=1670, skipped=30, lr=[6.357831825394751e-06, 6.357831825394751e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:26:08,286] [INFO] [timer.py:199:stop] epoch=0/micro_step=1670/global_step=1670, RunningAvgSamplesPerSec=175.60938875878534, CurrSamplesPerSec=85.46477463394133, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:26:11,854] [INFO] [logging.py:96:log_dist] [Rank 0] step=1680, skipped=30, lr=[6.322986524020506e-06, 6.322986524020506e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:26:11,872] [INFO] [timer.py:199:stop] epoch=0/micro_step=1680/global_step=1680, RunningAvgSamplesPerSec=175.62743373248642, CurrSamplesPerSec=178.80605835341638, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:26:15,438] [INFO] [logging.py:96:log_dist] [Rank 0] step=1690, skipped=30, lr=[6.2880545453303324e-06, 6.2880545453303324e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:26:15,456] [INFO] [timer.py:199:stop] epoch=0/micro_step=1690/global_step=1690, RunningAvgSamplesPerSec=175.64584231262822, CurrSamplesPerSec=178.78212175159362, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:26:19,025] [INFO] [logging.py:96:log_dist] [Rank 0] step=1700, skipped=30, lr=[6.253037910577493e-06, 6.253037910577493e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:26:19,044] [INFO] [timer.py:199:stop] epoch=0/micro_step=1700/global_step=1700, RunningAvgSamplesPerSec=175.66305892226524, CurrSamplesPerSec=178.8459669365214, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:26:22,611] [INFO] [logging.py:96:log_dist] [Rank 0] step=1710, skipped=30, lr=[6.217938645913672e-06, 6.217938645913672e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:26:23,206] [INFO] [timer.py:199:stop] epoch=0/micro_step=1710/global_step=1710, RunningAvgSamplesPerSec=175.51776547737788, CurrSamplesPerSec=68.43606165264517, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:26:26,776] [INFO] [logging.py:96:log_dist] [Rank 0] step=1720, skipped=30, lr=[6.182758782271725e-06, 6.182758782271725e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:26:26,793] [INFO] [timer.py:199:stop] epoch=0/micro_step=1720/global_step=1720, RunningAvgSamplesPerSec=175.53570037464488, CurrSamplesPerSec=178.21867012257223, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:26:30,372] [INFO] [logging.py:96:log_dist] [Rank 0] step=1730, skipped=30, lr=[6.14750035524817e-06, 6.14750035524817e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:26:30,390] [INFO] [timer.py:199:stop] epoch=0/micro_step=1730/global_step=1730, RunningAvgSamplesPerSec=175.55045632384926, CurrSamplesPerSec=178.74866888452655, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:26:31,439] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:26:31,769] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:26:33,904] [INFO] [logging.py:96:log_dist] [Rank 0] step=1740, skipped=32, lr=[6.1192384188205335e-06, 6.1192384188205335e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:26:33,922] [INFO] [timer.py:199:stop] epoch=0/micro_step=1740/global_step=1740, RunningAvgSamplesPerSec=175.58311392297878, CurrSamplesPerSec=178.79426788289322, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:26:37,493] [INFO] [logging.py:96:log_dist] [Rank 0] step=1750, skipped=32, lr=[6.083843721848405e-06, 6.083843721848405e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:26:37,887] [INFO] [timer.py:199:stop] epoch=0/micro_step=1750/global_step=1750, RunningAvgSamplesPerSec=175.49623911336855, CurrSamplesPerSec=87.16427386228955, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:26:41,454] [INFO] [logging.py:96:log_dist] [Rank 0] step=1760, skipped=32, lr=[6.048376184972021e-06, 6.048376184972021e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:26:41,473] [INFO] [timer.py:199:stop] epoch=0/micro_step=1760/global_step=1760, RunningAvgSamplesPerSec=175.51410580162064, CurrSamplesPerSec=178.5194789303015, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:26:45,040] [INFO] [logging.py:96:log_dist] [Rank 0] step=1770, skipped=32, lr=[6.012837860433409e-06, 6.012837860433409e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:26:45,058] [INFO] [timer.py:199:stop] epoch=0/micro_step=1770/global_step=1770, RunningAvgSamplesPerSec=175.53183933086976, CurrSamplesPerSec=178.55035199838235, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:26:48,626] [INFO] [logging.py:96:log_dist] [Rank 0] step=1780, skipped=32, lr=[5.977230804570549e-06, 5.977230804570549e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:26:48,644] [INFO] [timer.py:199:stop] epoch=0/micro_step=1780/global_step=1780, RunningAvgSamplesPerSec=175.5492211005764, CurrSamplesPerSec=178.8400093005841, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:26:52,210] [INFO] [logging.py:96:log_dist] [Rank 0] step=1790, skipped=32, lr=[5.9415570776983906e-06, 5.9415570776983906e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:26:53,028] [INFO] [timer.py:199:stop] epoch=0/micro_step=1790/global_step=1790, RunningAvgSamplesPerSec=175.35176024783405, CurrSamplesPerSec=55.283490440040886, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:26:56,593] [INFO] [logging.py:96:log_dist] [Rank 0] step=1800, skipped=32, lr=[5.905818743989637e-06, 5.905818743989637e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:26:56,612] [INFO] [timer.py:199:stop] epoch=0/micro_step=1800/global_step=1800, RunningAvgSamplesPerSec=175.37042440510436, CurrSamplesPerSec=178.75033527908406, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:27:00,179] [INFO] [logging.py:96:log_dist] [Rank 0] step=1810, skipped=32, lr=[5.8700178713553115e-06, 5.8700178713553115e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:27:00,198] [INFO] [timer.py:199:stop] epoch=0/micro_step=1810/global_step=1810, RunningAvgSamplesPerSec=175.38849475223228, CurrSamplesPerSec=178.49644982611528, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:27:03,783] [INFO] [logging.py:96:log_dist] [Rank 0] step=1820, skipped=32, lr=[5.834156531325094e-06, 5.834156531325094e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:27:04,183] [INFO] [timer.py:199:stop] epoch=0/micro_step=1820/global_step=1820, RunningAvgSamplesPerSec=175.300936945648, CurrSamplesPerSec=86.55288250095764, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:27:07,750] [INFO] [logging.py:96:log_dist] [Rank 0] step=1830, skipped=32, lr=[5.798236798927466e-06, 5.798236798927466e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:27:07,769] [INFO] [timer.py:199:stop] epoch=0/micro_step=1830/global_step=1830, RunningAvgSamplesPerSec=175.3191744588234, CurrSamplesPerSec=178.8005797595703, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:27:09,533] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:27:09,863] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:27:11,280] [INFO] [logging.py:96:log_dist] [Rank 0] step=1840, skipped=34, lr=[5.769460367084017e-06, 5.769460367084017e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:27:11,298] [INFO] [timer.py:199:stop] epoch=0/micro_step=1840/global_step=1840, RunningAvgSamplesPerSec=175.35187446297343, CurrSamplesPerSec=178.65872969389764, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:27:14,866] [INFO] [logging.py:96:log_dist] [Rank 0] step=1850, skipped=34, lr=[5.733440768204955e-06, 5.733440768204955e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:27:14,884] [INFO] [timer.py:199:stop] epoch=0/micro_step=1850/global_step=1850, RunningAvgSamplesPerSec=175.36969591513923, CurrSamplesPerSec=178.5388327204354, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:27:18,452] [INFO] [logging.py:96:log_dist] [Rank 0] step=1860, skipped=34, lr=[5.697368604629078e-06, 5.697368604629078e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:27:18,471] [INFO] [timer.py:199:stop] epoch=0/micro_step=1860/global_step=1860, RunningAvgSamplesPerSec=175.38701387548983, CurrSamplesPerSec=178.59941463856026, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:27:22,038] [INFO] [logging.py:96:log_dist] [Rank 0] step=1870, skipped=34, lr=[5.6612459635836515e-06, 5.6612459635836515e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:27:22,056] [INFO] [timer.py:199:stop] epoch=0/micro_step=1870/global_step=1870, RunningAvgSamplesPerSec=175.40436686540264, CurrSamplesPerSec=178.71606141587807, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:27:25,624] [INFO] [logging.py:96:log_dist] [Rank 0] step=1880, skipped=34, lr=[5.625074935216701e-06, 5.625074935216701e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:27:25,642] [INFO] [timer.py:199:stop] epoch=0/micro_step=1880/global_step=1880, RunningAvgSamplesPerSec=175.4215190815781, CurrSamplesPerSec=178.4706975061948, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:27:30,535] [INFO] [logging.py:96:log_dist] [Rank 0] step=1890, skipped=34, lr=[5.588857612476057e-06, 5.588857612476057e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:27:30,553] [INFO] [timer.py:199:stop] epoch=0/micro_step=1890/global_step=1890, RunningAvgSamplesPerSec=175.43789173165723, CurrSamplesPerSec=178.48683633974778, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:27:34,137] [INFO] [logging.py:96:log_dist] [Rank 0] step=1900, skipped=34, lr=[5.552596090988269e-06, 5.552596090988269e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:27:34,155] [INFO] [timer.py:199:stop] epoch=0/micro_step=1900/global_step=1900, RunningAvgSamplesPerSec=175.451194382474, CurrSamplesPerSec=178.74688349626405, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:27:37,723] [INFO] [logging.py:96:log_dist] [Rank 0] step=1910, skipped=34, lr=[5.516292468937331e-06, 5.516292468937331e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:27:37,742] [INFO] [timer.py:199:stop] epoch=0/micro_step=1910/global_step=1910, RunningAvgSamplesPerSec=175.46768670824574, CurrSamplesPerSec=178.9434902570673, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:27:41,308] [INFO] [logging.py:96:log_dist] [Rank 0] step=1920, skipped=34, lr=[5.4799488469432925e-06, 5.4799488469432925e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:27:41,394] [INFO] [timer.py:199:stop] epoch=0/micro_step=1920/global_step=1920, RunningAvgSamplesPerSec=175.46760163103463, CurrSamplesPerSec=150.3866207200237, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:27:44,965] [INFO] [logging.py:96:log_dist] [Rank 0] step=1930, skipped=34, lr=[5.443567327940695e-06, 5.443567327940695e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:27:44,983] [INFO] [timer.py:199:stop] epoch=0/micro_step=1930/global_step=1930, RunningAvgSamplesPerSec=175.48314373151004, CurrSamplesPerSec=178.98739852441582, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:27:47,463] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:27:47,792] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:27:48,491] [INFO] [logging.py:96:log_dist] [Rank 0] step=1940, skipped=36, lr=[5.414436241467469e-06, 5.414436241467469e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:27:48,509] [INFO] [timer.py:199:stop] epoch=0/micro_step=1940/global_step=1940, RunningAvgSamplesPerSec=175.5141214632341, CurrSamplesPerSec=178.80975063864085, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:27:52,077] [INFO] [logging.py:96:log_dist] [Rank 0] step=1950, skipped=36, lr=[5.377991814166224e-06, 5.377991814166224e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:27:52,095] [INFO] [timer.py:199:stop] epoch=0/micro_step=1950/global_step=1950, RunningAvgSamplesPerSec=175.53004768602787, CurrSamplesPerSec=178.80772581818377, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:27:55,662] [INFO] [logging.py:96:log_dist] [Rank 0] step=1960, skipped=36, lr=[5.341515389350037e-06, 5.341515389350037e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:27:57,392] [INFO] [timer.py:199:stop] epoch=0/micro_step=1960/global_step=1960, RunningAvgSamplesPerSec=175.12620566362656, CurrSamplesPerSec=30.91901141341344, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:28:00,985] [INFO] [logging.py:96:log_dist] [Rank 0] step=1970, skipped=36, lr=[5.3050090776377606e-06, 5.3050090776377606e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:28:01,004] [INFO] [timer.py:199:stop] epoch=0/micro_step=1970/global_step=1970, RunningAvgSamplesPerSec=175.13762928757012, CurrSamplesPerSec=178.7959351277616, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:28:04,570] [INFO] [logging.py:96:log_dist] [Rank 0] step=1980, skipped=36, lr=[5.268474991377581e-06, 5.268474991377581e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:28:04,588] [INFO] [timer.py:199:stop] epoch=0/micro_step=1980/global_step=1980, RunningAvgSamplesPerSec=175.15561464513814, CurrSamplesPerSec=178.71070731199075, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:28:08,161] [INFO] [logging.py:96:log_dist] [Rank 0] step=1990, skipped=36, lr=[5.2319152445247865e-06, 5.2319152445247865e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:28:08,291] [INFO] [timer.py:199:stop] epoch=0/micro_step=1990/global_step=1990, RunningAvgSamplesPerSec=175.14474558349036, CurrSamplesPerSec=136.1650930786645, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:28:11,859] [INFO] [logging.py:96:log_dist] [Rank 0] step=2000, skipped=36, lr=[5.195331952519456e-06, 5.195331952519456e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:28:11,878] [INFO] [timer.py:199:stop] epoch=0/micro_step=2000/global_step=2000, RunningAvgSamplesPerSec=175.16204847473117, CurrSamplesPerSec=178.88529729088546, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:28:15,447] [INFO] [logging.py:96:log_dist] [Rank 0] step=2010, skipped=36, lr=[5.158727232164049e-06, 5.158727232164049e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:28:15,465] [INFO] [timer.py:199:stop] epoch=0/micro_step=2010/global_step=2010, RunningAvgSamplesPerSec=175.17886260449808, CurrSamplesPerSec=178.6885804188775, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:28:19,032] [INFO] [logging.py:96:log_dist] [Rank 0] step=2020, skipped=36, lr=[5.122103201500922e-06, 5.122103201500922e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:28:19,050] [INFO] [timer.py:199:stop] epoch=0/micro_step=2020/global_step=2020, RunningAvgSamplesPerSec=175.19616499152494, CurrSamplesPerSec=178.7133248338099, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:28:22,615] [INFO] [logging.py:96:log_dist] [Rank 0] step=2030, skipped=36, lr=[5.08546197968978e-06, 5.08546197968978e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:28:22,634] [INFO] [timer.py:199:stop] epoch=0/micro_step=2030/global_step=2030, RunningAvgSamplesPerSec=175.21354965143257, CurrSamplesPerSec=178.7542633320481, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:28:25,830] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:28:26,160] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:28:26,161] [INFO] [logging.py:96:log_dist] [Rank 0] step=2040, skipped=38, lr=[5.056138049327779e-06, 5.056138049327779e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:28:26,161] [INFO] [timer.py:199:stop] epoch=0/micro_step=2040/global_step=2040, RunningAvgSamplesPerSec=175.2440247805813, CurrSamplesPerSec=194.14408593284978, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:28:29,726] [INFO] [logging.py:96:log_dist] [Rank 0] step=2050, skipped=38, lr=[5.019471226829501e-06, 5.019471226829501e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:28:30,566] [INFO] [timer.py:199:stop] epoch=0/micro_step=2050/global_step=2050, RunningAvgSamplesPerSec=175.0687293497253, CurrSamplesPerSec=54.22730359844773, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:28:34,133] [INFO] [logging.py:96:log_dist] [Rank 0] step=2060, skipped=38, lr=[4.982793151730683e-06, 4.982793151730683e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:28:34,152] [INFO] [timer.py:199:stop] epoch=0/micro_step=2060/global_step=2060, RunningAvgSamplesPerSec=175.08610889281422, CurrSamplesPerSec=178.700237991419, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +***** Evaluating perplexity, Epoch 1/2 ***** +ppl: 2.7634823322296143 +Beginning of Epoch 2/2, Total Micro Batches 2065 +[2023-04-14 07:28:53,389] [INFO] [logging.py:96:log_dist] [Rank 0] step=2070, skipped=38, lr=[4.94610594631818e-06, 4.94610594631818e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:28:53,408] [INFO] [timer.py:199:stop] epoch=1/micro_step=5/global_step=2070, RunningAvgSamplesPerSec=175.10978625182273, CurrSamplesPerSec=178.66431851294246, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:28:56,974] [INFO] [logging.py:96:log_dist] [Rank 0] step=2080, skipped=38, lr=[4.909411733407139e-06, 4.909411733407139e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:28:56,992] [INFO] [timer.py:199:stop] epoch=1/micro_step=15/global_step=2080, RunningAvgSamplesPerSec=175.12691316302684, CurrSamplesPerSec=178.96341334954724, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:29:00,557] [INFO] [logging.py:96:log_dist] [Rank 0] step=2090, skipped=38, lr=[4.872712636218195e-06, 4.872712636218195e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:29:00,576] [INFO] [timer.py:199:stop] epoch=1/micro_step=25/global_step=2090, RunningAvgSamplesPerSec=175.14409156626672, CurrSamplesPerSec=178.7361719156269, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:29:04,141] [INFO] [logging.py:96:log_dist] [Rank 0] step=2100, skipped=38, lr=[4.836010778254584e-06, 4.836010778254584e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:29:04,858] [INFO] [timer.py:199:stop] epoch=1/micro_step=35/global_step=2100, RunningAvgSamplesPerSec=175.00172873537667, CurrSamplesPerSec=60.62637382244194, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:29:08,428] [INFO] [logging.py:96:log_dist] [Rank 0] step=2110, skipped=38, lr=[4.799308283179299e-06, 4.799308283179299e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:29:08,446] [INFO] [timer.py:199:stop] epoch=1/micro_step=45/global_step=2110, RunningAvgSamplesPerSec=175.0182309329687, CurrSamplesPerSec=178.1292633562712, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:29:12,012] [INFO] [logging.py:96:log_dist] [Rank 0] step=2120, skipped=38, lr=[4.762607274692188e-06, 4.762607274692188e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:29:12,031] [INFO] [timer.py:199:stop] epoch=1/micro_step=55/global_step=2120, RunningAvgSamplesPerSec=175.03546481848542, CurrSamplesPerSec=178.65670829226346, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:29:15,619] [INFO] [logging.py:96:log_dist] [Rank 0] step=2130, skipped=38, lr=[4.7259098764070906e-06, 4.7259098764070906e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:29:15,790] [INFO] [timer.py:199:stop] epoch=1/micro_step=65/global_step=2130, RunningAvgSamplesPerSec=175.01350003146337, CurrSamplesPerSec=120.98837511177777, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:29:19,361] [INFO] [logging.py:96:log_dist] [Rank 0] step=2140, skipped=38, lr=[4.689218211728945e-06, 4.689218211728945e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:29:19,379] [INFO] [timer.py:199:stop] epoch=1/micro_step=75/global_step=2140, RunningAvgSamplesPerSec=175.0296261699975, CurrSamplesPerSec=178.83452862606734, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:29:19,708] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:29:20,038] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:29:22,891] [INFO] [logging.py:96:log_dist] [Rank 0] step=2150, skipped=40, lr=[4.659870434902894e-06, 4.659870434902894e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:29:22,909] [INFO] [timer.py:199:stop] epoch=1/micro_step=85/global_step=2150, RunningAvgSamplesPerSec=175.05878946957924, CurrSamplesPerSec=177.7715897065037, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:29:26,475] [INFO] [logging.py:96:log_dist] [Rank 0] step=2160, skipped=40, lr=[4.623194440560303e-06, 4.623194440560303e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:29:26,493] [INFO] [timer.py:199:stop] epoch=1/micro_step=95/global_step=2160, RunningAvgSamplesPerSec=175.07566740932268, CurrSamplesPerSec=178.8376263573442, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:29:30,058] [INFO] [logging.py:96:log_dist] [Rank 0] step=2170, skipped=40, lr=[4.586530123201418e-06, 4.586530123201418e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:29:30,076] [INFO] [timer.py:199:stop] epoch=1/micro_step=105/global_step=2170, RunningAvgSamplesPerSec=175.0925687002495, CurrSamplesPerSec=178.91248109636737, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:29:33,641] [INFO] [logging.py:96:log_dist] [Rank 0] step=2180, skipped=40, lr=[4.549879604317032e-06, 4.549879604317032e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:29:33,659] [INFO] [timer.py:199:stop] epoch=1/micro_step=115/global_step=2180, RunningAvgSamplesPerSec=175.10938262451967, CurrSamplesPerSec=178.7955778583865, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:29:37,225] [INFO] [logging.py:96:log_dist] [Rank 0] step=2190, skipped=40, lr=[4.513245004599523e-06, 4.513245004599523e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:29:37,243] [INFO] [timer.py:199:stop] epoch=1/micro_step=125/global_step=2190, RunningAvgSamplesPerSec=175.1258089758605, CurrSamplesPerSec=178.86968226172294, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:29:40,809] [INFO] [logging.py:96:log_dist] [Rank 0] step=2200, skipped=40, lr=[4.476628443820144e-06, 4.476628443820144e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:29:40,828] [INFO] [timer.py:199:stop] epoch=1/micro_step=135/global_step=2200, RunningAvgSamplesPerSec=175.1418964890195, CurrSamplesPerSec=178.87719143928476, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:29:44,395] [INFO] [logging.py:96:log_dist] [Rank 0] step=2210, skipped=40, lr=[4.440032040706374e-06, 4.440032040706374e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:29:44,413] [INFO] [timer.py:199:stop] epoch=1/micro_step=145/global_step=2210, RunningAvgSamplesPerSec=175.15764796644672, CurrSamplesPerSec=178.68655834170954, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:29:47,993] [INFO] [logging.py:96:log_dist] [Rank 0] step=2220, skipped=40, lr=[4.403457912819315e-06, 4.403457912819315e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:29:48,737] [INFO] [timer.py:199:stop] epoch=1/micro_step=155/global_step=2220, RunningAvgSamplesPerSec=175.01494195219564, CurrSamplesPerSec=58.85921524980288, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:29:52,304] [INFO] [logging.py:96:log_dist] [Rank 0] step=2230, skipped=40, lr=[4.366908176431164e-06, 4.366908176431164e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:29:52,326] [INFO] [timer.py:199:stop] epoch=1/micro_step=165/global_step=2230, RunningAvgSamplesPerSec=175.03115883461155, CurrSamplesPerSec=178.6057127648957, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:29:55,905] [INFO] [logging.py:96:log_dist] [Rank 0] step=2240, skipped=40, lr=[4.33038494640277e-06, 4.33038494640277e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:29:55,923] [INFO] [timer.py:199:stop] epoch=1/micro_step=175/global_step=2240, RunningAvgSamplesPerSec=175.04554169354392, CurrSamplesPerSec=178.53538910120236, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:29:56,970] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:29:57,300] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:29:59,432] [INFO] [logging.py:96:log_dist] [Rank 0] step=2250, skipped=42, lr=[4.301186867169562e-06, 4.301186867169562e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:29:59,450] [INFO] [timer.py:199:stop] epoch=1/micro_step=185/global_step=2250, RunningAvgSamplesPerSec=175.07396049006897, CurrSamplesPerSec=178.84894590334105, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:30:03,017] [INFO] [logging.py:96:log_dist] [Rank 0] step=2260, skipped=42, lr=[4.264716673064343e-06, 4.264716673064343e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:30:03,035] [INFO] [timer.py:199:stop] epoch=1/micro_step=195/global_step=2260, RunningAvgSamplesPerSec=175.08982791017425, CurrSamplesPerSec=178.79402970759315, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:30:06,599] [INFO] [logging.py:96:log_dist] [Rank 0] step=2270, skipped=42, lr=[4.228278898379567e-06, 4.228278898379567e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:30:06,618] [INFO] [timer.py:199:stop] epoch=1/micro_step=205/global_step=2270, RunningAvgSamplesPerSec=175.10596201054403, CurrSamplesPerSec=178.55569649807, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:30:10,183] [INFO] [logging.py:96:log_dist] [Rank 0] step=2280, skipped=42, lr=[4.191875651497689e-06, 4.191875651497689e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:30:10,202] [INFO] [timer.py:199:stop] epoch=1/micro_step=215/global_step=2280, RunningAvgSamplesPerSec=175.12173813018043, CurrSamplesPerSec=178.71046935926378, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:30:13,767] [INFO] [logging.py:96:log_dist] [Rank 0] step=2290, skipped=42, lr=[4.1555090388033026e-06, 4.1555090388033026e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:30:13,785] [INFO] [timer.py:199:stop] epoch=1/micro_step=225/global_step=2290, RunningAvgSamplesPerSec=175.13748612725948, CurrSamplesPerSec=178.80617745701124, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:30:17,352] [INFO] [logging.py:96:log_dist] [Rank 0] step=2300, skipped=42, lr=[4.119181164561248e-06, 4.119181164561248e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:30:17,370] [INFO] [timer.py:199:stop] epoch=1/micro_step=235/global_step=2300, RunningAvgSamplesPerSec=175.15276366645972, CurrSamplesPerSec=178.60488091142994, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:30:20,940] [INFO] [logging.py:96:log_dist] [Rank 0] step=2310, skipped=42, lr=[4.082894130794863e-06, 4.082894130794863e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:30:20,959] [INFO] [timer.py:199:stop] epoch=1/micro_step=245/global_step=2310, RunningAvgSamplesPerSec=175.167338632962, CurrSamplesPerSec=178.79295792659534, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:30:24,541] [INFO] [logging.py:96:log_dist] [Rank 0] step=2320, skipped=42, lr=[4.046650037164352e-06, 4.046650037164352e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:30:24,559] [INFO] [timer.py:199:stop] epoch=1/micro_step=255/global_step=2320, RunningAvgSamplesPerSec=175.17925048441862, CurrSamplesPerSec=178.78235989516907, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:30:28,125] [INFO] [logging.py:96:log_dist] [Rank 0] step=2330, skipped=42, lr=[4.010450980845293e-06, 4.010450980845293e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:30:28,143] [INFO] [timer.py:199:stop] epoch=1/micro_step=265/global_step=2330, RunningAvgSamplesPerSec=175.19435811191732, CurrSamplesPerSec=178.71856010748328, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:30:31,709] [INFO] [logging.py:96:log_dist] [Rank 0] step=2340, skipped=42, lr=[3.97429905640729e-06, 3.97429905640729e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:30:32,034] [INFO] [timer.py:199:stop] epoch=1/micro_step=275/global_step=2340, RunningAvgSamplesPerSec=175.14646783144545, CurrSamplesPerSec=96.31139345326895, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:30:33,797] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:30:34,127] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:30:35,542] [INFO] [logging.py:96:log_dist] [Rank 0] step=2350, skipped=44, lr=[3.945412857624087e-06, 3.945412857624087e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:30:35,560] [INFO] [timer.py:199:stop] epoch=1/micro_step=285/global_step=2350, RunningAvgSamplesPerSec=175.1733533615671, CurrSamplesPerSec=178.96293609786994, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:30:39,129] [INFO] [logging.py:96:log_dist] [Rank 0] step=2360, skipped=44, lr=[3.9093510401049834e-06, 3.9093510401049834e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:30:39,147] [INFO] [timer.py:199:stop] epoch=1/micro_step=295/global_step=2360, RunningAvgSamplesPerSec=175.1876835817042, CurrSamplesPerSec=178.28140300355187, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:30:42,714] [INFO] [logging.py:96:log_dist] [Rank 0] step=2370, skipped=44, lr=[3.873342204367026e-06, 3.873342204367026e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:30:42,732] [INFO] [timer.py:199:stop] epoch=1/micro_step=305/global_step=2370, RunningAvgSamplesPerSec=175.20236064859748, CurrSamplesPerSec=178.6569461018463, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:30:46,300] [INFO] [logging.py:96:log_dist] [Rank 0] step=2380, skipped=44, lr=[3.837388433973173e-06, 3.837388433973173e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:30:46,318] [INFO] [timer.py:199:stop] epoch=1/micro_step=315/global_step=2380, RunningAvgSamplesPerSec=175.21672138982777, CurrSamplesPerSec=178.91713178532248, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:30:49,882] [INFO] [logging.py:96:log_dist] [Rank 0] step=2390, skipped=44, lr=[3.8014918093001603e-06, 3.8014918093001603e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:30:49,901] [INFO] [timer.py:199:stop] epoch=1/micro_step=325/global_step=2390, RunningAvgSamplesPerSec=175.23154086765612, CurrSamplesPerSec=178.92190197387978, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:30:53,469] [INFO] [logging.py:96:log_dist] [Rank 0] step=2400, skipped=44, lr=[3.7656544074181273e-06, 3.7656544074181273e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:30:53,488] [INFO] [timer.py:199:stop] epoch=1/micro_step=335/global_step=2400, RunningAvgSamplesPerSec=175.24549762328286, CurrSamplesPerSec=178.8898273514674, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:30:57,076] [INFO] [logging.py:96:log_dist] [Rank 0] step=2410, skipped=44, lr=[3.729878301970432e-06, 3.729878301970432e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:30:57,094] [INFO] [timer.py:199:stop] epoch=1/micro_step=345/global_step=2410, RunningAvgSamplesPerSec=175.25544619086466, CurrSamplesPerSec=178.92619536106457, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:31:00,665] [INFO] [logging.py:96:log_dist] [Rank 0] step=2420, skipped=44, lr=[3.694165563053662e-06, 3.694165563053662e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:31:00,683] [INFO] [timer.py:199:stop] epoch=1/micro_step=355/global_step=2420, RunningAvgSamplesPerSec=175.26864835098078, CurrSamplesPerSec=178.78712289991174, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:31:04,250] [INFO] [logging.py:96:log_dist] [Rank 0] step=2430, skipped=44, lr=[3.658518257097859e-06, 3.658518257097859e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:31:04,269] [INFO] [timer.py:199:stop] epoch=1/micro_step=365/global_step=2430, RunningAvgSamplesPerSec=175.282534164942, CurrSamplesPerSec=178.76711998524235, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:31:07,834] [INFO] [logging.py:96:log_dist] [Rank 0] step=2440, skipped=44, lr=[3.6229384467469454e-06, 3.6229384467469454e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:31:07,853] [INFO] [timer.py:199:stop] epoch=1/micro_step=375/global_step=2440, RunningAvgSamplesPerSec=175.29656329491405, CurrSamplesPerSec=178.71856010748328, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:31:10,332] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:31:10,661] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:31:11,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=2450, skipped=46, lr=[3.594524578908245e-06, 3.594524578908245e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:31:11,379] [INFO] [timer.py:199:stop] epoch=1/micro_step=385/global_step=2450, RunningAvgSamplesPerSec=175.32183611448855, CurrSamplesPerSec=178.94873902467154, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:31:14,945] [INFO] [logging.py:96:log_dist] [Rank 0] step=2460, skipped=46, lr=[3.559071445966512e-06, 3.559071445966512e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:31:14,963] [INFO] [timer.py:199:stop] epoch=1/micro_step=395/global_step=2460, RunningAvgSamplesPerSec=175.33557211722362, CurrSamplesPerSec=178.78783737251226, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:31:18,528] [INFO] [logging.py:96:log_dist] [Rank 0] step=2470, skipped=46, lr=[3.5236915628754707e-06, 3.5236915628754707e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:31:18,982] [INFO] [timer.py:199:stop] epoch=1/micro_step=405/global_step=2470, RunningAvgSamplesPerSec=175.26470969536572, CurrSamplesPerSec=80.68512460808711, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:31:22,547] [INFO] [logging.py:96:log_dist] [Rank 0] step=2480, skipped=46, lr=[3.488386976805277e-06, 3.488386976805277e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:31:22,566] [INFO] [timer.py:199:stop] epoch=1/micro_step=415/global_step=2480, RunningAvgSamplesPerSec=175.27864350504987, CurrSamplesPerSec=178.65147664160952, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:31:26,133] [INFO] [logging.py:96:log_dist] [Rank 0] step=2490, skipped=46, lr=[3.453159730569205e-06, 3.453159730569205e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:31:26,152] [INFO] [timer.py:199:stop] epoch=1/micro_step=425/global_step=2490, RunningAvgSamplesPerSec=175.29207803621034, CurrSamplesPerSec=178.9493354950189, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:31:29,744] [INFO] [logging.py:96:log_dist] [Rank 0] step=2500, skipped=46, lr=[3.418011862505455e-06, 3.418011862505455e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:31:29,762] [INFO] [timer.py:199:stop] epoch=1/micro_step=435/global_step=2500, RunningAvgSamplesPerSec=175.30084217285292, CurrSamplesPerSec=178.80510553036953, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:31:33,332] [INFO] [logging.py:96:log_dist] [Rank 0] step=2510, skipped=46, lr=[3.3829454063591996e-06, 3.3829454063591996e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:31:33,350] [INFO] [timer.py:199:stop] epoch=1/micro_step=445/global_step=2510, RunningAvgSamplesPerSec=175.31354465261091, CurrSamplesPerSec=178.44673197769845, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:31:36,916] [INFO] [logging.py:96:log_dist] [Rank 0] step=2520, skipped=46, lr=[3.347962391164914e-06, 3.347962391164914e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:31:36,934] [INFO] [timer.py:199:stop] epoch=1/micro_step=455/global_step=2520, RunningAvgSamplesPerSec=175.32701777650846, CurrSamplesPerSec=178.82356828472626, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:31:40,500] [INFO] [logging.py:96:log_dist] [Rank 0] step=2530, skipped=46, lr=[3.3130648411289645e-06, 3.3130648411289645e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:31:40,519] [INFO] [timer.py:199:stop] epoch=1/micro_step=465/global_step=2530, RunningAvgSamplesPerSec=175.34033348472275, CurrSamplesPerSec=178.69869149123133, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:31:44,086] [INFO] [logging.py:96:log_dist] [Rank 0] step=2540, skipped=46, lr=[3.2782547755124867e-06, 3.2782547755124867e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:31:44,104] [INFO] [timer.py:199:stop] epoch=1/micro_step=475/global_step=2540, RunningAvgSamplesPerSec=175.35327148969282, CurrSamplesPerSec=178.97630010707823, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:31:47,302] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:31:47,631] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:31:47,632] [INFO] [logging.py:96:log_dist] [Rank 0] step=2550, skipped=48, lr=[3.2504710655161594e-06, 3.2504710655161594e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:31:47,632] [INFO] [timer.py:199:stop] epoch=1/micro_step=485/global_step=2550, RunningAvgSamplesPerSec=175.3769870154989, CurrSamplesPerSec=194.22710571823424, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:31:51,197] [INFO] [logging.py:96:log_dist] [Rank 0] step=2560, skipped=48, lr=[3.2158235441602494e-06, 3.2158235441602494e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:31:51,215] [INFO] [timer.py:199:stop] epoch=1/micro_step=495/global_step=2560, RunningAvgSamplesPerSec=175.39017311744595, CurrSamplesPerSec=178.87433072587132, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:31:54,783] [INFO] [logging.py:96:log_dist] [Rank 0] step=2570, skipped=48, lr=[3.1812691338529457e-06, 3.1812691338529457e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:31:54,801] [INFO] [timer.py:199:stop] epoch=1/micro_step=505/global_step=2570, RunningAvgSamplesPerSec=175.40277191372112, CurrSamplesPerSec=178.79033807158396, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:31:58,370] [INFO] [logging.py:96:log_dist] [Rank 0] step=2580, skipped=48, lr=[3.146809834000446e-06, 3.146809834000446e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:31:58,388] [INFO] [timer.py:199:stop] epoch=1/micro_step=515/global_step=2580, RunningAvgSamplesPerSec=175.41501739895196, CurrSamplesPerSec=178.5154424722852, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:32:01,961] [INFO] [logging.py:96:log_dist] [Rank 0] step=2590, skipped=48, lr=[3.1124476385056108e-06, 3.1124476385056108e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:32:01,979] [INFO] [timer.py:199:stop] epoch=1/micro_step=525/global_step=2590, RunningAvgSamplesPerSec=175.42714048299047, CurrSamplesPerSec=178.73414876121103, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:32:05,557] [INFO] [logging.py:96:log_dist] [Rank 0] step=2600, skipped=48, lr=[3.0781845356526032e-06, 3.0781845356526032e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:32:05,942] [INFO] [timer.py:199:stop] epoch=1/micro_step=535/global_step=2600, RunningAvgSamplesPerSec=175.37037443121025, CurrSamplesPerSec=88.39919015434225, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:32:09,508] [INFO] [logging.py:96:log_dist] [Rank 0] step=2610, skipped=48, lr=[3.044022507991825e-06, 3.044022507991825e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:32:09,526] [INFO] [timer.py:199:stop] epoch=1/micro_step=545/global_step=2610, RunningAvgSamplesPerSec=175.38325990769613, CurrSamplesPerSec=178.74533618861935, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:32:13,091] [INFO] [logging.py:96:log_dist] [Rank 0] step=2620, skipped=48, lr=[3.009963532225215e-06, 3.009963532225215e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:32:13,109] [INFO] [timer.py:199:stop] epoch=1/micro_step=555/global_step=2620, RunningAvgSamplesPerSec=175.3961343995998, CurrSamplesPerSec=178.8903042132965, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:32:16,675] [INFO] [logging.py:96:log_dist] [Rank 0] step=2630, skipped=48, lr=[2.976009579091864e-06, 2.976009579091864e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:32:16,693] [INFO] [timer.py:199:stop] epoch=1/micro_step=565/global_step=2630, RunningAvgSamplesPerSec=175.40866552656433, CurrSamplesPerSec=178.56032865661078, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:32:20,262] [INFO] [logging.py:96:log_dist] [Rank 0] step=2640, skipped=48, lr=[2.9421626132539906e-06, 2.9421626132539906e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:32:20,280] [INFO] [timer.py:199:stop] epoch=1/micro_step=575/global_step=2640, RunningAvgSamplesPerSec=175.42070540723367, CurrSamplesPerSec=178.75938196313675, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:32:23,847] [INFO] [logging.py:96:log_dist] [Rank 0] step=2650, skipped=48, lr=[2.9084245931832494e-06, 2.9084245931832494e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:32:23,865] [INFO] [timer.py:199:stop] epoch=1/micro_step=585/global_step=2650, RunningAvgSamplesPerSec=175.43293698784552, CurrSamplesPerSec=178.44566435639746, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:32:24,195] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:32:24,525] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:32:27,379] [INFO] [logging.py:96:log_dist] [Rank 0] step=2660, skipped=50, lr=[2.8815139301504362e-06, 2.8815139301504362e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:32:27,397] [INFO] [timer.py:199:stop] epoch=1/micro_step=595/global_step=2660, RunningAvgSamplesPerSec=175.45468178576562, CurrSamplesPerSec=177.80526668591975, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:32:30,964] [INFO] [logging.py:96:log_dist] [Rank 0] step=2670, skipped=50, lr=[2.84797692761501e-06, 2.84797692761501e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:32:30,983] [INFO] [timer.py:199:stop] epoch=1/micro_step=605/global_step=2670, RunningAvgSamplesPerSec=175.4666354866578, CurrSamplesPerSec=178.16863385141875, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:32:34,552] [INFO] [logging.py:96:log_dist] [Rank 0] step=2680, skipped=50, lr=[2.8145543206703957e-06, 2.8145543206703957e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:32:34,570] [INFO] [timer.py:199:stop] epoch=1/micro_step=615/global_step=2680, RunningAvgSamplesPerSec=175.47824417344052, CurrSamplesPerSec=178.4462574777653, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:32:38,156] [INFO] [logging.py:96:log_dist] [Rank 0] step=2690, skipped=50, lr=[2.781248043233765e-06, 2.781248043233765e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:32:38,175] [INFO] [timer.py:199:stop] epoch=1/micro_step=625/global_step=2690, RunningAvgSamplesPerSec=175.48719903140406, CurrSamplesPerSec=178.54405777571736, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:32:41,746] [INFO] [logging.py:96:log_dist] [Rank 0] step=2700, skipped=50, lr=[2.7480600224911686e-06, 2.7480600224911686e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:32:41,765] [INFO] [timer.py:199:stop] epoch=1/micro_step=635/global_step=2700, RunningAvgSamplesPerSec=175.49811177277874, CurrSamplesPerSec=176.11875817246468, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:32:45,331] [INFO] [logging.py:96:log_dist] [Rank 0] step=2710, skipped=50, lr=[2.7149921787860233e-06, 2.7149921787860233e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:32:45,349] [INFO] [timer.py:199:stop] epoch=1/micro_step=645/global_step=2710, RunningAvgSamplesPerSec=175.50988480424456, CurrSamplesPerSec=178.64945540409775, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:32:48,915] [INFO] [logging.py:96:log_dist] [Rank 0] step=2720, skipped=50, lr=[2.6820464255079945e-06, 2.6820464255079945e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:32:48,933] [INFO] [timer.py:199:stop] epoch=1/micro_step=655/global_step=2720, RunningAvgSamplesPerSec=175.52162162243437, CurrSamplesPerSec=178.68108706823332, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:32:52,498] [INFO] [logging.py:96:log_dist] [Rank 0] step=2730, skipped=50, lr=[2.6492246689822873e-06, 2.6492246689822873e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:32:52,622] [INFO] [timer.py:199:stop] epoch=1/micro_step=665/global_step=2730, RunningAvgSamplesPerSec=175.51485903564813, CurrSamplesPerSec=138.2088541867787, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:32:56,188] [INFO] [logging.py:96:log_dist] [Rank 0] step=2740, skipped=50, lr=[2.6165288083593353e-06, 2.6165288083593353e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:32:56,206] [INFO] [timer.py:199:stop] epoch=1/micro_step=675/global_step=2740, RunningAvgSamplesPerSec=175.5264846827877, CurrSamplesPerSec=178.60820837178602, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:32:59,773] [INFO] [logging.py:96:log_dist] [Rank 0] step=2750, skipped=50, lr=[2.5839607355049186e-06, 2.5839607355049186e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:32:59,791] [INFO] [timer.py:199:stop] epoch=1/micro_step=685/global_step=2750, RunningAvgSamplesPerSec=175.53795196845613, CurrSamplesPerSec=178.7278415737691, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:33:00,838] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:33:01,168] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:33:03,300] [INFO] [logging.py:96:log_dist] [Rank 0] step=2760, skipped=52, lr=[2.557999551030911e-06, 2.557999551030911e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:33:03,319] [INFO] [timer.py:199:stop] epoch=1/micro_step=695/global_step=2760, RunningAvgSamplesPerSec=175.5592871224166, CurrSamplesPerSec=178.57731529435088, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:33:06,888] [INFO] [logging.py:96:log_dist] [Rank 0] step=2770, skipped=52, lr=[2.5256662399899455e-06, 2.5256662399899455e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:33:06,992] [INFO] [timer.py:199:stop] epoch=1/micro_step=705/global_step=2770, RunningAvgSamplesPerSec=175.55524692161492, CurrSamplesPerSec=144.33651129241005, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:33:10,570] [INFO] [logging.py:96:log_dist] [Rank 0] step=2780, skipped=52, lr=[2.4934659742571634e-06, 2.4934659742571634e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:33:10,589] [INFO] [timer.py:199:stop] epoch=1/micro_step=715/global_step=2780, RunningAvgSamplesPerSec=175.56442067977278, CurrSamplesPerSec=178.4378355238493, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:33:14,156] [INFO] [logging.py:96:log_dist] [Rank 0] step=2790, skipped=52, lr=[2.4614006170219634e-06, 2.4614006170219634e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:33:14,174] [INFO] [timer.py:199:stop] epoch=1/micro_step=725/global_step=2790, RunningAvgSamplesPerSec=175.57554153789843, CurrSamplesPerSec=178.7296265816548, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:33:17,745] [INFO] [logging.py:96:log_dist] [Rank 0] step=2800, skipped=52, lr=[2.4294720236675865e-06, 2.4294720236675865e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:33:18,918] [INFO] [timer.py:199:stop] epoch=1/micro_step=735/global_step=2800, RunningAvgSamplesPerSec=175.38735135314874, CurrSamplesPerSec=42.32136656624815, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:33:22,486] [INFO] [logging.py:96:log_dist] [Rank 0] step=2810, skipped=52, lr=[2.3976820416637756e-06, 2.3976820416637756e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:33:22,504] [INFO] [timer.py:199:stop] epoch=1/micro_step=745/global_step=2810, RunningAvgSamplesPerSec=175.3987857581585, CurrSamplesPerSec=178.78843277070814, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:33:26,072] [INFO] [logging.py:96:log_dist] [Rank 0] step=2820, skipped=52, lr=[2.366032510459863e-06, 2.366032510459863e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:33:26,090] [INFO] [timer.py:199:stop] epoch=1/micro_step=755/global_step=2820, RunningAvgSamplesPerSec=175.41022244608368, CurrSamplesPerSec=178.58349305485794, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:33:29,658] [INFO] [logging.py:96:log_dist] [Rank 0] step=2830, skipped=52, lr=[2.334525261378342e-06, 2.334525261378342e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:33:29,676] [INFO] [timer.py:199:stop] epoch=1/micro_step=765/global_step=2830, RunningAvgSamplesPerSec=175.42150031688445, CurrSamplesPerSec=178.74581228042607, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:33:33,245] [INFO] [logging.py:96:log_dist] [Rank 0] step=2840, skipped=52, lr=[2.3031621175089e-06, 2.3031621175089e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:33:33,296] [INFO] [timer.py:199:stop] epoch=1/micro_step=775/global_step=2840, RunningAvgSamplesPerSec=175.427072808489, CurrSamplesPerSec=163.60613429777504, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:33:36,863] [INFO] [logging.py:96:log_dist] [Rank 0] step=2850, skipped=52, lr=[2.2719448936029273e-06, 2.2719448936029273e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:33:36,881] [INFO] [timer.py:199:stop] epoch=1/micro_step=785/global_step=2850, RunningAvgSamplesPerSec=175.43835386741222, CurrSamplesPerSec=178.45503613503087, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:33:38,645] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:33:38,975] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:33:40,417] [INFO] [logging.py:96:log_dist] [Rank 0] step=2860, skipped=54, lr=[2.247077390977253e-06, 2.247077390977253e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:33:40,435] [INFO] [timer.py:199:stop] epoch=1/micro_step=795/global_step=2860, RunningAvgSamplesPerSec=175.454944810705, CurrSamplesPerSec=167.13194021152722, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:33:44,001] [INFO] [logging.py:96:log_dist] [Rank 0] step=2870, skipped=54, lr=[2.216127369161282e-06, 2.216127369161282e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:33:44,638] [INFO] [timer.py:199:stop] epoch=1/micro_step=805/global_step=2870, RunningAvgSamplesPerSec=175.36257053963806, CurrSamplesPerSec=65.5379360571931, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:33:48,203] [INFO] [logging.py:96:log_dist] [Rank 0] step=2880, skipped=54, lr=[2.185328303361015e-06, 2.185328303361015e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:33:48,222] [INFO] [timer.py:199:stop] epoch=1/micro_step=815/global_step=2880, RunningAvgSamplesPerSec=175.37415763068918, CurrSamplesPerSec=178.64137091138988, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:33:51,790] [INFO] [logging.py:96:log_dist] [Rank 0] step=2890, skipped=54, lr=[2.154681975688849e-06, 2.154681975688849e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:33:51,808] [INFO] [timer.py:199:stop] epoch=1/micro_step=825/global_step=2890, RunningAvgSamplesPerSec=175.3853843990023, CurrSamplesPerSec=178.78557489553347, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:33:55,375] [INFO] [logging.py:96:log_dist] [Rank 0] step=2900, skipped=54, lr=[2.1241901594193676e-06, 2.1241901594193676e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:33:55,394] [INFO] [timer.py:199:stop] epoch=1/micro_step=835/global_step=2900, RunningAvgSamplesPerSec=175.39659597200853, CurrSamplesPerSec=178.64933650919784, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:33:58,960] [INFO] [logging.py:96:log_dist] [Rank 0] step=2910, skipped=54, lr=[2.0938546188867244e-06, 2.0938546188867244e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:33:59,168] [INFO] [timer.py:199:stop] epoch=1/micro_step=845/global_step=2910, RunningAvgSamplesPerSec=175.37653463776212, CurrSamplesPerSec=116.68382468962682, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:34:02,739] [INFO] [logging.py:96:log_dist] [Rank 0] step=2920, skipped=54, lr=[2.063677109382567e-06, 2.063677109382567e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:34:02,757] [INFO] [timer.py:199:stop] epoch=1/micro_step=855/global_step=2920, RunningAvgSamplesPerSec=175.38733130588002, CurrSamplesPerSec=177.43329184516165, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:34:06,327] [INFO] [logging.py:96:log_dist] [Rank 0] step=2930, skipped=54, lr=[2.033659377054463e-06, 2.033659377054463e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:34:06,346] [INFO] [timer.py:199:stop] epoch=1/micro_step=865/global_step=2930, RunningAvgSamplesPerSec=175.39784304260147, CurrSamplesPerSec=178.5852751805908, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:34:09,914] [INFO] [logging.py:96:log_dist] [Rank 0] step=2940, skipped=54, lr=[2.0038031588048647e-06, 2.0038031588048647e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:34:09,932] [INFO] [timer.py:199:stop] epoch=1/micro_step=875/global_step=2940, RunningAvgSamplesPerSec=175.40881130164325, CurrSamplesPerSec=178.51639221069067, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:34:13,535] [INFO] [logging.py:96:log_dist] [Rank 0] step=2950, skipped=54, lr=[1.974110182190611e-06, 1.974110182190611e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:34:13,553] [INFO] [timer.py:199:stop] epoch=1/micro_step=885/global_step=2950, RunningAvgSamplesPerSec=175.41395737386372, CurrSamplesPerSec=178.79664967079518, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:34:16,033] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:34:16,363] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:34:17,062] [INFO] [logging.py:96:log_dist] [Rank 0] step=2960, skipped=56, lr=[1.9504744897673342e-06, 1.9504744897673342e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:34:17,081] [INFO] [timer.py:199:stop] epoch=1/micro_step=895/global_step=2960, RunningAvgSamplesPerSec=175.43427517257751, CurrSamplesPerSec=178.68370372244047, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:34:20,647] [INFO] [logging.py:96:log_dist] [Rank 0] step=2970, skipped=56, lr=[1.921079671326169e-06, 1.921079671326169e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:34:20,665] [INFO] [timer.py:199:stop] epoch=1/micro_step=905/global_step=2970, RunningAvgSamplesPerSec=175.44529569632783, CurrSamplesPerSec=178.91021546369217, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:34:24,236] [INFO] [logging.py:96:log_dist] [Rank 0] step=2980, skipped=56, lr=[1.8918528811120437e-06, 1.8918528811120437e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:34:24,254] [INFO] [timer.py:199:stop] epoch=1/micro_step=915/global_step=2980, RunningAvgSamplesPerSec=175.45543086745442, CurrSamplesPerSec=178.82618911369485, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:34:27,821] [INFO] [logging.py:96:log_dist] [Rank 0] step=2990, skipped=56, lr=[1.8627958102614874e-06, 1.8627958102614874e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:34:27,839] [INFO] [timer.py:199:stop] epoch=1/micro_step=925/global_step=2990, RunningAvgSamplesPerSec=175.46618127319772, CurrSamplesPerSec=178.60880257418205, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:34:31,404] [INFO] [logging.py:96:log_dist] [Rank 0] step=3000, skipped=56, lr=[1.8339101400906334e-06, 1.8339101400906334e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:34:31,423] [INFO] [timer.py:199:stop] epoch=1/micro_step=935/global_step=3000, RunningAvgSamplesPerSec=175.47708892554837, CurrSamplesPerSec=178.75283492917754, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:34:34,988] [INFO] [logging.py:96:log_dist] [Rank 0] step=3010, skipped=56, lr=[1.8051975419979348e-06, 1.8051975419979348e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:34:35,162] [INFO] [timer.py:199:stop] epoch=1/micro_step=945/global_step=3010, RunningAvgSamplesPerSec=175.46297801867968, CurrSamplesPerSec=124.53338900409643, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:34:38,728] [INFO] [logging.py:96:log_dist] [Rank 0] step=3020, skipped=56, lr=[1.7766596773674558e-06, 1.7766596773674558e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:34:38,746] [INFO] [timer.py:199:stop] epoch=1/micro_step=955/global_step=3020, RunningAvgSamplesPerSec=175.47380767691158, CurrSamplesPerSec=178.84191570089908, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:34:42,312] [INFO] [logging.py:96:log_dist] [Rank 0] step=3030, skipped=56, lr=[1.7482981974727318e-06, 1.7482981974727318e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:34:42,330] [INFO] [timer.py:199:stop] epoch=1/micro_step=965/global_step=3030, RunningAvgSamplesPerSec=175.48456409782185, CurrSamplesPerSec=178.80915509845195, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:34:45,898] [INFO] [logging.py:96:log_dist] [Rank 0] step=3040, skipped=56, lr=[1.7201147433812318e-06, 1.7201147433812318e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:34:45,917] [INFO] [timer.py:199:stop] epoch=1/micro_step=975/global_step=3040, RunningAvgSamplesPerSec=175.4948000849635, CurrSamplesPerSec=178.41992668733778, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:34:49,487] [INFO] [logging.py:96:log_dist] [Rank 0] step=3050, skipped=56, lr=[1.6921109458593997e-06, 1.6921109458593997e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:34:49,635] [INFO] [timer.py:199:stop] epoch=1/micro_step=985/global_step=3050, RunningAvgSamplesPerSec=175.48414631982206, CurrSamplesPerSec=131.24092142034377, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:34:52,831] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:34:53,161] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:34:53,161] [INFO] [logging.py:96:log_dist] [Rank 0] step=3060, skipped=58, lr=[1.6698383498131253e-06, 1.6698383498131253e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:34:53,162] [INFO] [timer.py:199:stop] epoch=1/micro_step=995/global_step=3060, RunningAvgSamplesPerSec=175.50384313667155, CurrSamplesPerSec=194.1741390447518, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:34:56,729] [INFO] [logging.py:96:log_dist] [Rank 0] step=3070, skipped=58, lr=[1.6421620104061834e-06, 1.6421620104061834e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:34:56,748] [INFO] [timer.py:199:stop] epoch=1/micro_step=1005/global_step=3070, RunningAvgSamplesPerSec=175.51393037047305, CurrSamplesPerSec=178.72831757238572, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:35:00,313] [INFO] [logging.py:96:log_dist] [Rank 0] step=3080, skipped=58, lr=[1.6146698381125982e-06, 1.6146698381125982e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:35:00,332] [INFO] [timer.py:199:stop] epoch=1/micro_step=1015/global_step=3080, RunningAvgSamplesPerSec=175.52440789455224, CurrSamplesPerSec=178.7981978669608, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:35:03,897] [INFO] [logging.py:96:log_dist] [Rank 0] step=3090, skipped=58, lr=[1.5873634236994883e-06, 1.5873634236994883e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:35:03,916] [INFO] [timer.py:199:stop] epoch=1/micro_step=1025/global_step=3090, RunningAvgSamplesPerSec=175.5347092537403, CurrSamplesPerSec=178.62187602723955, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:35:07,481] [INFO] [logging.py:96:log_dist] [Rank 0] step=3100, skipped=58, lr=[1.5602443471855431e-06, 1.5602443471855431e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:35:07,499] [INFO] [timer.py:199:stop] epoch=1/micro_step=1035/global_step=3100, RunningAvgSamplesPerSec=175.54498128830141, CurrSamplesPerSec=178.66051332156175, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:35:11,064] [INFO] [logging.py:96:log_dist] [Rank 0] step=3110, skipped=58, lr=[1.5333141777496092e-06, 1.5333141777496092e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:35:11,082] [INFO] [timer.py:199:stop] epoch=1/micro_step=1045/global_step=3110, RunningAvgSamplesPerSec=175.5553423085148, CurrSamplesPerSec=178.91653552963697, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:35:14,648] [INFO] [logging.py:96:log_dist] [Rank 0] step=3120, skipped=58, lr=[1.5065744736398855e-06, 1.5065744736398855e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:35:14,666] [INFO] [timer.py:199:stop] epoch=1/micro_step=1055/global_step=3120, RunningAvgSamplesPerSec=175.56548720176968, CurrSamplesPerSec=178.64648307907524, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:35:18,252] [INFO] [logging.py:96:log_dist] [Rank 0] step=3130, skipped=58, lr=[1.4800267820837643e-06, 1.4800267820837643e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:35:18,270] [INFO] [timer.py:199:stop] epoch=1/micro_step=1065/global_step=3130, RunningAvgSamplesPerSec=175.5725685751029, CurrSamplesPerSec=178.70333107209834, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:35:21,837] [INFO] [logging.py:96:log_dist] [Rank 0] step=3140, skipped=58, lr=[1.453672639198298e-06, 1.453672639198298e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:35:21,855] [INFO] [timer.py:199:stop] epoch=1/micro_step=1075/global_step=3140, RunningAvgSamplesPerSec=175.5825168965636, CurrSamplesPerSec=178.63697230361345, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:35:25,420] [INFO] [logging.py:96:log_dist] [Rank 0] step=3150, skipped=58, lr=[1.4275135699013245e-06, 1.4275135699013245e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:35:25,439] [INFO] [timer.py:199:stop] epoch=1/micro_step=1085/global_step=3150, RunningAvgSamplesPerSec=175.59247424365415, CurrSamplesPerSec=178.92583757092933, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:35:29,005] [INFO] [logging.py:96:log_dist] [Rank 0] step=3160, skipped=58, lr=[1.4015510878232252e-06, 1.4015510878232252e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:35:29,023] [INFO] [timer.py:199:stop] epoch=1/micro_step=1095/global_step=3160, RunningAvgSamplesPerSec=175.60235538918772, CurrSamplesPerSec=178.8674177128651, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:35:29,352] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:35:29,682] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:35:32,531] [INFO] [logging.py:96:log_dist] [Rank 0] step=3170, skipped=60, lr=[1.3809236548583506e-06, 1.3809236548583506e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:35:32,549] [INFO] [timer.py:199:stop] epoch=1/micro_step=1105/global_step=3170, RunningAvgSamplesPerSec=175.62097677864833, CurrSamplesPerSec=178.90115350677254, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:35:36,116] [INFO] [logging.py:96:log_dist] [Rank 0] step=3180, skipped=60, lr=[1.3553188077581583e-06, 1.3553188077581583e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:35:36,134] [INFO] [timer.py:199:stop] epoch=1/micro_step=1115/global_step=3180, RunningAvgSamplesPerSec=175.63058967345353, CurrSamplesPerSec=178.74283674824227, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:35:39,700] [INFO] [logging.py:96:log_dist] [Rank 0] step=3190, skipped=60, lr=[1.3299147252496713e-06, 1.3299147252496713e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:35:39,718] [INFO] [timer.py:199:stop] epoch=1/micro_step=1125/global_step=3190, RunningAvgSamplesPerSec=175.64024402411613, CurrSamplesPerSec=178.7962923985645, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:35:43,284] [INFO] [logging.py:96:log_dist] [Rank 0] step=3200, skipped=60, lr=[1.3047128772778128e-06, 1.3047128772778128e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:35:43,303] [INFO] [timer.py:199:stop] epoch=1/micro_step=1135/global_step=3200, RunningAvgSamplesPerSec=175.64981655047444, CurrSamplesPerSec=178.61652756512754, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:35:46,868] [INFO] [logging.py:96:log_dist] [Rank 0] step=3210, skipped=60, lr=[1.2797147220857014e-06, 1.2797147220857014e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:35:46,904] [INFO] [timer.py:199:stop] epoch=1/micro_step=1145/global_step=3210, RunningAvgSamplesPerSec=175.65689906485014, CurrSamplesPerSec=170.79719176260545, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:35:50,470] [INFO] [logging.py:96:log_dist] [Rank 0] step=3220, skipped=60, lr=[1.2549217061302684e-06, 1.2549217061302684e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:35:50,488] [INFO] [timer.py:199:stop] epoch=1/micro_step=1155/global_step=3220, RunningAvgSamplesPerSec=175.66633005275074, CurrSamplesPerSec=178.5919287642725, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:35:54,069] [INFO] [logging.py:96:log_dist] [Rank 0] step=3230, skipped=60, lr=[1.230335263998571e-06, 1.230335263998571e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:35:54,087] [INFO] [timer.py:199:stop] epoch=1/micro_step=1165/global_step=3230, RunningAvgSamplesPerSec=175.67364144636696, CurrSamplesPerSec=178.7172512603095, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:35:57,653] [INFO] [logging.py:96:log_dist] [Rank 0] step=3240, skipped=60, lr=[1.2059568183247775e-06, 1.2059568183247775e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:35:57,671] [INFO] [timer.py:199:stop] epoch=1/micro_step=1175/global_step=3240, RunningAvgSamplesPerSec=175.68298935478492, CurrSamplesPerSec=178.74188459886602, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:36:01,238] [INFO] [logging.py:96:log_dist] [Rank 0] step=3250, skipped=60, lr=[1.1817877797078513e-06, 1.1817877797078513e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:36:01,256] [INFO] [timer.py:199:stop] epoch=1/micro_step=1185/global_step=3250, RunningAvgSamplesPerSec=175.6922230151946, CurrSamplesPerSec=178.70273624057506, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:36:04,822] [INFO] [logging.py:96:log_dist] [Rank 0] step=3260, skipped=60, lr=[1.1578295466299286e-06, 1.1578295466299286e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:36:04,840] [INFO] [timer.py:199:stop] epoch=1/micro_step=1195/global_step=3260, RunningAvgSamplesPerSec=175.70147847047284, CurrSamplesPerSec=178.82166227554484, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:36:05,885] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:36:06,215] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:36:08,348] [INFO] [logging.py:96:log_dist] [Rank 0] step=3270, skipped=62, lr=[1.1388156721506803e-06, 1.1388156721506803e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:36:08,366] [INFO] [timer.py:199:stop] epoch=1/micro_step=1205/global_step=3270, RunningAvgSamplesPerSec=175.7192787546277, CurrSamplesPerSec=178.77759714419867, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:36:11,932] [INFO] [logging.py:96:log_dist] [Rank 0] step=3280, skipped=62, lr=[1.1152403742315206e-06, 1.1152403742315206e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:36:11,950] [INFO] [timer.py:199:stop] epoch=1/micro_step=1215/global_step=3280, RunningAvgSamplesPerSec=175.72840071389794, CurrSamplesPerSec=178.6379233355893, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:36:15,517] [INFO] [logging.py:96:log_dist] [Rank 0] step=3290, skipped=62, lr=[1.0918797324537545e-06, 1.0918797324537545e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:36:15,535] [INFO] [timer.py:199:stop] epoch=1/micro_step=1225/global_step=3290, RunningAvgSamplesPerSec=175.73736903068976, CurrSamplesPerSec=178.68739095617744, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:36:19,110] [INFO] [logging.py:96:log_dist] [Rank 0] step=3300, skipped=62, lr=[1.0687350985236195e-06, 1.0687350985236195e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:36:19,129] [INFO] [timer.py:199:stop] epoch=1/micro_step=1235/global_step=3300, RunningAvgSamplesPerSec=175.7451463025297, CurrSamplesPerSec=178.86598750102448, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:36:22,725] [INFO] [logging.py:96:log_dist] [Rank 0] step=3310, skipped=62, lr=[1.0458078116485878e-06, 1.0458078116485878e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:36:23,034] [INFO] [timer.py:199:stop] epoch=1/micro_step=1245/global_step=3310, RunningAvgSamplesPerSec=175.70731281811084, CurrSamplesPerSec=94.57194092782409, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:36:26,604] [INFO] [logging.py:96:log_dist] [Rank 0] step=3320, skipped=62, lr=[1.0230991984598799e-06, 1.0230991984598799e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:36:26,622] [INFO] [timer.py:199:stop] epoch=1/micro_step=1255/global_step=3320, RunningAvgSamplesPerSec=175.7158832909153, CurrSamplesPerSec=178.79641148914942, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:36:30,188] [INFO] [logging.py:96:log_dist] [Rank 0] step=3330, skipped=62, lr=[1.0006105729356949e-06, 1.0006105729356949e-06], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:36:30,206] [INFO] [timer.py:199:stop] epoch=1/micro_step=1265/global_step=3330, RunningAvgSamplesPerSec=175.72488541329272, CurrSamplesPerSec=178.92500273284506, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:36:33,776] [INFO] [logging.py:96:log_dist] [Rank 0] step=3340, skipped=62, lr=[9.783432363251897e-07, 9.783432363251897e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:36:33,795] [INFO] [timer.py:199:stop] epoch=1/micro_step=1275/global_step=3340, RunningAvgSamplesPerSec=175.73323905420526, CurrSamplesPerSec=178.81129906169772, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:36:37,360] [INFO] [logging.py:96:log_dist] [Rank 0] step=3350, skipped=62, lr=[9.562984770731798e-07, 9.562984770731798e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:36:37,379] [INFO] [timer.py:199:stop] epoch=1/micro_step=1285/global_step=3350, RunningAvgSamplesPerSec=175.7421410961989, CurrSamplesPerSec=178.8356009055188, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:36:40,949] [INFO] [logging.py:96:log_dist] [Rank 0] step=3360, skipped=62, lr=[9.344775707455875e-07, 9.344775707455875e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:36:40,967] [INFO] [timer.py:199:stop] epoch=1/micro_step=1295/global_step=3360, RunningAvgSamplesPerSec=175.75038366750013, CurrSamplesPerSec=178.6311474537111, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:36:42,732] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:36:43,061] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:36:44,477] [INFO] [logging.py:96:log_dist] [Rank 0] step=3370, skipped=64, lr=[9.171828687022605e-07, 9.171828687022605e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:36:44,495] [INFO] [timer.py:199:stop] epoch=1/micro_step=1305/global_step=3370, RunningAvgSamplesPerSec=175.76720278910642, CurrSamplesPerSec=178.87099334253335, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:36:48,061] [INFO] [logging.py:96:log_dist] [Rank 0] step=3380, skipped=64, lr=[8.957680706730655e-07, 8.957680706730655e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:36:48,080] [INFO] [timer.py:199:stop] epoch=1/micro_step=1315/global_step=3380, RunningAvgSamplesPerSec=175.77592748114708, CurrSamplesPerSec=178.6441052902683, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:36:51,645] [INFO] [logging.py:96:log_dist] [Rank 0] step=3390, skipped=64, lr=[8.745806280117845e-07, 8.745806280117845e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:36:51,663] [INFO] [timer.py:199:stop] epoch=1/micro_step=1325/global_step=3390, RunningAvgSamplesPerSec=175.78468344555955, CurrSamplesPerSec=178.7374810399496, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:36:55,237] [INFO] [logging.py:96:log_dist] [Rank 0] step=3400, skipped=64, lr=[8.536217666778187e-07, 8.536217666778187e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:36:55,256] [INFO] [timer.py:199:stop] epoch=1/micro_step=1335/global_step=3400, RunningAvgSamplesPerSec=175.79218042116347, CurrSamplesPerSec=178.70963652970954, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:36:58,850] [INFO] [logging.py:96:log_dist] [Rank 0] step=3410, skipped=64, lr=[8.328926994042706e-07, 8.328926994042706e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:36:58,868] [INFO] [timer.py:199:stop] epoch=1/micro_step=1345/global_step=3410, RunningAvgSamplesPerSec=175.79677523969337, CurrSamplesPerSec=178.68215750841873, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:37:02,433] [INFO] [logging.py:96:log_dist] [Rank 0] step=3420, skipped=64, lr=[8.123946256277734e-07, 8.123946256277734e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:37:02,451] [INFO] [timer.py:199:stop] epoch=1/micro_step=1355/global_step=3420, RunningAvgSamplesPerSec=175.80553617521295, CurrSamplesPerSec=178.99408210635673, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:37:06,018] [INFO] [logging.py:96:log_dist] [Rank 0] step=3430, skipped=64, lr=[7.921287314190879e-07, 7.921287314190879e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:37:06,036] [INFO] [timer.py:199:stop] epoch=1/micro_step=1365/global_step=3430, RunningAvgSamplesPerSec=175.8139041977077, CurrSamplesPerSec=178.86527240368105, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:37:09,601] [INFO] [logging.py:96:log_dist] [Rank 0] step=3440, skipped=64, lr=[7.72096189414476e-07, 7.72096189414476e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:37:09,620] [INFO] [timer.py:199:stop] epoch=1/micro_step=1375/global_step=3440, RunningAvgSamplesPerSec=175.82238973708252, CurrSamplesPerSec=178.74486009934878, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:37:13,185] [INFO] [logging.py:96:log_dist] [Rank 0] step=3450, skipped=64, lr=[7.522981587478424e-07, 7.522981587478424e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:37:13,513] [INFO] [timer.py:199:stop] epoch=1/micro_step=1385/global_step=3450, RunningAvgSamplesPerSec=175.7875644859922, CurrSamplesPerSec=95.88994260223218, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:37:17,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=3460, skipped=64, lr=[7.327357849836707e-07, 7.327357849836707e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:37:17,097] [INFO] [timer.py:199:stop] epoch=1/micro_step=1395/global_step=3460, RunningAvgSamplesPerSec=175.7960306370693, CurrSamplesPerSec=178.84382214185825, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:37:19,578] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:37:19,908] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:37:20,607] [INFO] [logging.py:96:log_dist] [Rank 0] step=3470, skipped=66, lr=[7.172563200580809e-07, 7.172563200580809e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:37:20,625] [INFO] [timer.py:199:stop] epoch=1/micro_step=1405/global_step=3470, RunningAvgSamplesPerSec=175.812259600064, CurrSamplesPerSec=178.76926294057907, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:37:24,189] [INFO] [logging.py:96:log_dist] [Rank 0] step=3480, skipped=66, lr=[6.98120971973826e-07, 6.98120971973826e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:37:24,208] [INFO] [timer.py:199:stop] epoch=1/micro_step=1415/global_step=3480, RunningAvgSamplesPerSec=175.82085948930464, CurrSamplesPerSec=178.77628743216854, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:37:27,778] [INFO] [logging.py:96:log_dist] [Rank 0] step=3490, skipped=66, lr=[6.7922441562206e-07, 6.7922441562206e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:37:27,796] [INFO] [timer.py:199:stop] epoch=1/micro_step=1425/global_step=3490, RunningAvgSamplesPerSec=175.82856518442705, CurrSamplesPerSec=177.72098048770152, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:37:31,377] [INFO] [logging.py:96:log_dist] [Rank 0] step=3500, skipped=66, lr=[6.605677444056651e-07, 6.605677444056651e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:37:31,396] [INFO] [timer.py:199:stop] epoch=1/micro_step=1435/global_step=3500, RunningAvgSamplesPerSec=175.83476895287086, CurrSamplesPerSec=178.83321807975548, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:37:34,960] [INFO] [logging.py:96:log_dist] [Rank 0] step=3510, skipped=66, lr=[6.421520378471632e-07, 6.421520378471632e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:37:34,978] [INFO] [timer.py:199:stop] epoch=1/micro_step=1445/global_step=3510, RunningAvgSamplesPerSec=175.8431805617849, CurrSamplesPerSec=178.71415769774134, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:37:38,542] [INFO] [logging.py:96:log_dist] [Rank 0] step=3520, skipped=66, lr=[6.239783615262409e-07, 6.239783615262409e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:37:38,560] [INFO] [timer.py:199:stop] epoch=1/micro_step=1455/global_step=3520, RunningAvgSamplesPerSec=175.85165884204355, CurrSamplesPerSec=178.9496933791359, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:37:42,125] [INFO] [logging.py:96:log_dist] [Rank 0] step=3530, skipped=66, lr=[6.060477670181025e-07, 6.060477670181025e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:37:42,144] [INFO] [timer.py:199:stop] epoch=1/micro_step=1465/global_step=3530, RunningAvgSamplesPerSec=175.85990020293528, CurrSamplesPerSec=178.91951684780653, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:37:45,709] [INFO] [logging.py:96:log_dist] [Rank 0] step=3540, skipped=66, lr=[5.8836129183262e-07, 5.8836129183262e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:37:45,727] [INFO] [timer.py:199:stop] epoch=1/micro_step=1475/global_step=3540, RunningAvgSamplesPerSec=175.86804151824174, CurrSamplesPerSec=178.73402975355407, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:37:49,292] [INFO] [logging.py:96:log_dist] [Rank 0] step=3550, skipped=66, lr=[5.709199593542966e-07, 5.709199593542966e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:37:49,311] [INFO] [timer.py:199:stop] epoch=1/micro_step=1485/global_step=3550, RunningAvgSamplesPerSec=175.87619072531527, CurrSamplesPerSec=178.93382855817313, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:37:52,879] [INFO] [logging.py:96:log_dist] [Rank 0] step=3560, skipped=66, lr=[5.537247787830584e-07, 5.537247787830584e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:37:52,898] [INFO] [timer.py:199:stop] epoch=1/micro_step=1495/global_step=3560, RunningAvgSamplesPerSec=175.88375406261935, CurrSamplesPerSec=178.63946878414765, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:37:56,093] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:37:56,423] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:37:56,423] [INFO] [logging.py:96:log_dist] [Rank 0] step=3570, skipped=68, lr=[5.401465327902731e-07, 5.401465327902731e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:37:56,424] [INFO] [timer.py:199:stop] epoch=1/micro_step=1505/global_step=3570, RunningAvgSamplesPerSec=175.89960025969916, CurrSamplesPerSec=194.2723680439559, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:37:59,988] [INFO] [logging.py:96:log_dist] [Rank 0] step=3580, skipped=68, lr=[5.233969233352374e-07, 5.233969233352374e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:38:00,006] [INFO] [timer.py:199:stop] epoch=1/micro_step=1515/global_step=3580, RunningAvgSamplesPerSec=175.90765412373196, CurrSamplesPerSec=178.7022603782076, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:38:03,576] [INFO] [logging.py:96:log_dist] [Rank 0] step=3590, skipped=68, lr=[5.068962155911986e-07, 5.068962155911986e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:38:03,594] [INFO] [timer.py:199:stop] epoch=1/micro_step=1525/global_step=3590, RunningAvgSamplesPerSec=175.91515414209272, CurrSamplesPerSec=179.00446651844956, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:38:07,165] [INFO] [logging.py:96:log_dist] [Rank 0] step=3600, skipped=68, lr=[4.906453643311369e-07, 4.906453643311369e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:38:07,194] [INFO] [timer.py:199:stop] epoch=1/micro_step=1535/global_step=3600, RunningAvgSamplesPerSec=175.92089500386905, CurrSamplesPerSec=174.03717186753397, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:38:10,761] [INFO] [logging.py:96:log_dist] [Rank 0] step=3610, skipped=68, lr=[4.746453098707016e-07, 4.746453098707016e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:38:10,778] [INFO] [timer.py:199:stop] epoch=1/micro_step=1545/global_step=3610, RunningAvgSamplesPerSec=175.92859339754335, CurrSamplesPerSec=177.6647556770423, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:38:14,359] [INFO] [logging.py:96:log_dist] [Rank 0] step=3620, skipped=68, lr=[4.5889697801379865e-07, 4.5889697801379865e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:38:14,377] [INFO] [timer.py:199:stop] epoch=1/micro_step=1555/global_step=3620, RunningAvgSamplesPerSec=175.93450804866072, CurrSamplesPerSec=178.78569397184165, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:38:17,940] [INFO] [logging.py:96:log_dist] [Rank 0] step=3630, skipped=68, lr=[4.434012799990236e-07, 4.434012799990236e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:38:17,958] [INFO] [timer.py:199:stop] epoch=1/micro_step=1565/global_step=3630, RunningAvgSamplesPerSec=175.94264220760496, CurrSamplesPerSec=179.04947956444164, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:38:21,521] [INFO] [logging.py:96:log_dist] [Rank 0] step=3640, skipped=68, lr=[4.2815911244693217e-07, 4.2815911244693217e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:38:21,539] [INFO] [timer.py:199:stop] epoch=1/micro_step=1575/global_step=3640, RunningAvgSamplesPerSec=175.95064921013778, CurrSamplesPerSec=178.83786464881055, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:38:25,103] [INFO] [logging.py:96:log_dist] [Rank 0] step=3650, skipped=68, lr=[4.1317135730816497e-07, 4.1317135730816497e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:38:25,121] [INFO] [timer.py:199:stop] epoch=1/micro_step=1585/global_step=3650, RunningAvgSamplesPerSec=175.9585946677489, CurrSamplesPerSec=178.9635326628643, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:38:26,526] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 32768, reducing to 16384 +[2023-04-14 07:38:28,659] [INFO] [logging.py:96:log_dist] [Rank 0] step=3660, skipped=69, lr=[3.99900617366988e-07, 3.99900617366988e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:38:28,677] [INFO] [timer.py:199:stop] epoch=1/micro_step=1595/global_step=3660, RunningAvgSamplesPerSec=175.96991011336877, CurrSamplesPerSec=178.45645978246344, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:38:32,256] [INFO] [logging.py:96:log_dist] [Rank 0] step=3670, skipped=69, lr=[3.853986228240133e-07, 3.853986228240133e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:38:32,274] [INFO] [timer.py:199:stop] epoch=1/micro_step=1605/global_step=3670, RunningAvgSamplesPerSec=175.9764080603274, CurrSamplesPerSec=178.99384339840674, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:38:35,840] [INFO] [logging.py:96:log_dist] [Rank 0] step=3680, skipped=69, lr=[3.711535149252111e-07, 3.711535149252111e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:38:35,858] [INFO] [timer.py:199:stop] epoch=1/micro_step=1615/global_step=3680, RunningAvgSamplesPerSec=175.9839736531631, CurrSamplesPerSec=178.736885981062, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:38:39,434] [INFO] [logging.py:96:log_dist] [Rank 0] step=3690, skipped=69, lr=[3.571661179288087e-07, 3.571661179288087e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:38:40,429] [INFO] [timer.py:199:stop] epoch=1/micro_step=1625/global_step=3690, RunningAvgSamplesPerSec=175.8620287074178, CurrSamplesPerSec=47.955138212785144, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:38:43,994] [INFO] [logging.py:96:log_dist] [Rank 0] step=3700, skipped=69, lr=[3.4343724118122583e-07, 3.4343724118122583e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:38:44,012] [INFO] [timer.py:199:stop] epoch=1/micro_step=1635/global_step=3700, RunningAvgSamplesPerSec=175.86990189720575, CurrSamplesPerSec=178.90019966997139, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:38:47,577] [INFO] [logging.py:96:log_dist] [Rank 0] step=3710, skipped=69, lr=[3.2996767907024114e-07, 3.2996767907024114e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:38:47,595] [INFO] [timer.py:199:stop] epoch=1/micro_step=1645/global_step=3710, RunningAvgSamplesPerSec=175.87774062346514, CurrSamplesPerSec=178.67442683969082, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:38:51,158] [INFO] [logging.py:96:log_dist] [Rank 0] step=3720, skipped=69, lr=[3.1675821097902673e-07, 3.1675821097902673e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:38:51,177] [INFO] [timer.py:199:stop] epoch=1/micro_step=1655/global_step=3720, RunningAvgSamplesPerSec=175.88577537112607, CurrSamplesPerSec=178.95816372109147, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:38:54,739] [INFO] [logging.py:96:log_dist] [Rank 0] step=3730, skipped=69, lr=[3.0380960124105446e-07, 3.0380960124105446e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:38:54,757] [INFO] [timer.py:199:stop] epoch=1/micro_step=1665/global_step=3730, RunningAvgSamplesPerSec=175.89380119342655, CurrSamplesPerSec=178.77723994810552, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:38:58,321] [INFO] [logging.py:96:log_dist] [Rank 0] step=3740, skipped=69, lr=[2.9112259909586574e-07, 2.9112259909586574e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:38:58,339] [INFO] [timer.py:199:stop] epoch=1/micro_step=1675/global_step=3740, RunningAvgSamplesPerSec=175.90165288323814, CurrSamplesPerSec=178.69952421877548, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:39:01,903] [INFO] [logging.py:96:log_dist] [Rank 0] step=3750, skipped=69, lr=[2.7869793864572123e-07, 2.7869793864572123e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:39:02,113] [INFO] [timer.py:199:stop] epoch=1/micro_step=1685/global_step=3750, RunningAvgSamplesPerSec=175.88467497894754, CurrSamplesPerSec=116.39538330389145, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:39:05,676] [INFO] [logging.py:96:log_dist] [Rank 0] step=3760, skipped=69, lr=[2.6653633881312157e-07, 2.6653633881312157e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:39:05,695] [INFO] [timer.py:199:stop] epoch=1/micro_step=1695/global_step=3760, RunningAvgSamplesPerSec=175.89256984674827, CurrSamplesPerSec=178.96174297981278, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:39:09,261] [INFO] [logging.py:96:log_dist] [Rank 0] step=3770, skipped=69, lr=[2.5463850329921177e-07, 2.5463850329921177e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:39:09,279] [INFO] [timer.py:199:stop] epoch=1/micro_step=1705/global_step=3770, RunningAvgSamplesPerSec=175.90008723492375, CurrSamplesPerSec=178.68073025768828, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:39:12,845] [INFO] [logging.py:96:log_dist] [Rank 0] step=3780, skipped=69, lr=[2.430051205430605e-07, 2.430051205430605e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:39:12,864] [INFO] [timer.py:199:stop] epoch=1/micro_step=1715/global_step=3780, RunningAvgSamplesPerSec=175.90755744040902, CurrSamplesPerSec=178.87695304300544, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:39:16,428] [INFO] [logging.py:96:log_dist] [Rank 0] step=3790, skipped=69, lr=[2.3163686368182603e-07, 2.3163686368182603e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:39:16,447] [INFO] [timer.py:199:stop] epoch=1/micro_step=1725/global_step=3790, RunningAvgSamplesPerSec=175.91520101852097, CurrSamplesPerSec=178.97689676117275, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:39:20,009] [INFO] [logging.py:96:log_dist] [Rank 0] step=3800, skipped=69, lr=[2.2053439051180898e-07, 2.2053439051180898e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:39:20,027] [INFO] [timer.py:199:stop] epoch=1/micro_step=1735/global_step=3800, RunningAvgSamplesPerSec=175.92301950331907, CurrSamplesPerSec=178.98715983429204, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:39:23,589] [INFO] [logging.py:96:log_dist] [Rank 0] step=3810, skipped=69, lr=[2.0969834345038455e-07, 2.0969834345038455e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:39:23,608] [INFO] [timer.py:199:stop] epoch=1/micro_step=1745/global_step=3810, RunningAvgSamplesPerSec=175.93086232124165, CurrSamplesPerSec=178.86968226172294, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:39:27,171] [INFO] [logging.py:96:log_dist] [Rank 0] step=3820, skipped=69, lr=[1.9912934949883843e-07, 1.9912934949883843e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:39:27,189] [INFO] [timer.py:199:stop] epoch=1/micro_step=1755/global_step=3820, RunningAvgSamplesPerSec=175.93852666264945, CurrSamplesPerSec=178.80808313610868, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:39:30,753] [INFO] [logging.py:96:log_dist] [Rank 0] step=3830, skipped=69, lr=[1.8882802020608136e-07, 1.8882802020608136e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:39:30,771] [INFO] [timer.py:199:stop] epoch=1/micro_step=1765/global_step=3830, RunningAvgSamplesPerSec=175.94609685322612, CurrSamplesPerSec=178.837745502998, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:39:34,333] [INFO] [logging.py:96:log_dist] [Rank 0] step=3840, skipped=69, lr=[1.787949516332672e-07, 1.787949516332672e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:39:34,351] [INFO] [timer.py:199:stop] epoch=1/micro_step=1775/global_step=3840, RunningAvgSamplesPerSec=175.9538590897691, CurrSamplesPerSec=178.930727493301, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:39:37,922] [INFO] [logging.py:96:log_dist] [Rank 0] step=3850, skipped=69, lr=[1.690307243192982e-07, 1.690307243192982e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:39:37,941] [INFO] [timer.py:199:stop] epoch=1/micro_step=1785/global_step=3850, RunningAvgSamplesPerSec=175.9603153123224, CurrSamplesPerSec=179.08125305628772, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:39:39,702] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:39:40,033] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:39:41,465] [INFO] [logging.py:96:log_dist] [Rank 0] step=3860, skipped=71, lr=[1.6141328836662577e-07, 1.6141328836662577e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:39:41,483] [INFO] [timer.py:199:stop] epoch=1/micro_step=1795/global_step=3860, RunningAvgSamplesPerSec=175.97275122417156, CurrSamplesPerSec=178.53538910120236, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:39:45,050] [INFO] [logging.py:96:log_dist] [Rank 0] step=3870, skipped=71, lr=[1.5213438860148144e-07, 1.5213438860148144e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:39:45,069] [INFO] [timer.py:199:stop] epoch=1/micro_step=1805/global_step=3870, RunningAvgSamplesPerSec=175.97975714629357, CurrSamplesPerSec=178.97868674732536, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:39:48,636] [INFO] [logging.py:96:log_dist] [Rank 0] step=3880, skipped=71, lr=[1.4312587274326855e-07, 1.4312587274326855e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:39:48,919] [INFO] [timer.py:199:stop] epoch=1/micro_step=1815/global_step=3880, RunningAvgSamplesPerSec=175.95367489173373, CurrSamplesPerSec=102.76028663423739, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:39:52,484] [INFO] [logging.py:96:log_dist] [Rank 0] step=3890, skipped=71, lr=[1.3438826204766153e-07, 1.3438826204766153e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:39:52,502] [INFO] [timer.py:199:stop] epoch=1/micro_step=1825/global_step=3890, RunningAvgSamplesPerSec=175.96092390227247, CurrSamplesPerSec=178.98823394486237, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:39:56,072] [INFO] [logging.py:96:log_dist] [Rank 0] step=3900, skipped=71, lr=[1.2592206209507195e-07, 1.2592206209507195e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:39:56,090] [INFO] [timer.py:199:stop] epoch=1/micro_step=1835/global_step=3900, RunningAvgSamplesPerSec=175.96758091575865, CurrSamplesPerSec=177.74074996159612, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:39:59,653] [INFO] [logging.py:96:log_dist] [Rank 0] step=3910, skipped=71, lr=[1.1772776276139321e-07, 1.1772776276139321e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:39:59,672] [INFO] [timer.py:199:stop] epoch=1/micro_step=1845/global_step=3910, RunningAvgSamplesPerSec=175.97492032040483, CurrSamplesPerSec=178.84787346385372, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:40:03,236] [INFO] [logging.py:96:log_dist] [Rank 0] step=3920, skipped=71, lr=[1.0980583818965763e-07, 1.0980583818965763e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:40:03,255] [INFO] [timer.py:199:stop] epoch=1/micro_step=1855/global_step=3920, RunningAvgSamplesPerSec=175.9821089981372, CurrSamplesPerSec=178.92500273284506, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:40:06,819] [INFO] [logging.py:96:log_dist] [Rank 0] step=3930, skipped=71, lr=[1.0215674676259928e-07, 1.0215674676259928e-07], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:40:06,837] [INFO] [timer.py:199:stop] epoch=1/micro_step=1865/global_step=3930, RunningAvgSamplesPerSec=175.98928027953724, CurrSamplesPerSec=179.05939263925632, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:40:10,400] [INFO] [logging.py:96:log_dist] [Rank 0] step=3940, skipped=71, lr=[9.478093107613183e-08, 9.478093107613183e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:40:10,419] [INFO] [timer.py:199:stop] epoch=1/micro_step=1875/global_step=3940, RunningAvgSamplesPerSec=175.9965743300116, CurrSamplesPerSec=178.82630824410964, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:40:14,001] [INFO] [logging.py:96:log_dist] [Rank 0] step=3950, skipped=71, lr=[8.767881791373862e-08, 8.767881791373862e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:40:14,019] [INFO] [timer.py:199:stop] epoch=1/micro_step=1885/global_step=3950, RunningAvgSamplesPerSec=176.00159380857446, CurrSamplesPerSec=178.515798623003, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:40:16,506] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:40:16,835] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:40:17,534] [INFO] [logging.py:96:log_dist] [Rank 0] step=3960, skipped=73, lr=[8.219446988984966e-08, 8.219446988984966e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:40:17,553] [INFO] [timer.py:199:stop] epoch=1/micro_step=1895/global_step=3960, RunningAvgSamplesPerSec=176.0146566278664, CurrSamplesPerSec=178.75926292187506, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:40:21,116] [INFO] [logging.py:96:log_dist] [Rank 0] step=3970, skipped=73, lr=[7.558604619786031e-08, 7.558604619786031e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:40:21,134] [INFO] [timer.py:199:stop] epoch=1/micro_step=1905/global_step=3970, RunningAvgSamplesPerSec=176.02178667241708, CurrSamplesPerSec=178.9197553575523, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:40:24,696] [INFO] [logging.py:96:log_dist] [Rank 0] step=3980, skipped=73, lr=[6.925243569487792e-08, 6.925243569487792e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:40:24,715] [INFO] [timer.py:199:stop] epoch=1/micro_step=1915/global_step=3980, RunningAvgSamplesPerSec=176.02903764540463, CurrSamplesPerSec=178.9604305683243, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:40:28,277] [INFO] [logging.py:96:log_dist] [Rank 0] step=3990, skipped=73, lr=[6.31940048597377e-08, 6.31940048597377e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:40:28,296] [INFO] [timer.py:199:stop] epoch=1/micro_step=1925/global_step=3990, RunningAvgSamplesPerSec=176.0361810000028, CurrSamplesPerSec=178.86253125016657, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:40:31,858] [INFO] [logging.py:96:log_dist] [Rank 0] step=4000, skipped=73, lr=[5.741110424868034e-08, 5.741110424868034e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:40:31,876] [INFO] [timer.py:199:stop] epoch=1/micro_step=1935/global_step=4000, RunningAvgSamplesPerSec=176.0433194587976, CurrSamplesPerSec=178.81618272546748, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:40:36,892] [INFO] [logging.py:96:log_dist] [Rank 0] step=4010, skipped=73, lr=[5.190406847506567e-08, 5.190406847506567e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:40:36,912] [INFO] [timer.py:199:stop] epoch=1/micro_step=1945/global_step=4010, RunningAvgSamplesPerSec=176.050077670805, CurrSamplesPerSec=178.3727404840014, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:40:40,487] [INFO] [logging.py:96:log_dist] [Rank 0] step=4020, skipped=73, lr=[4.6673216190009944e-08, 4.6673216190009944e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:40:40,506] [INFO] [timer.py:199:stop] epoch=1/micro_step=1955/global_step=4020, RunningAvgSamplesPerSec=176.05557309854507, CurrSamplesPerSec=178.93454420379314, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:40:44,068] [INFO] [logging.py:96:log_dist] [Rank 0] step=4030, skipped=73, lr=[4.171885006395189e-08, 4.171885006395189e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:40:44,087] [INFO] [timer.py:199:stop] epoch=1/micro_step=1965/global_step=4030, RunningAvgSamplesPerSec=176.06254517209334, CurrSamplesPerSec=178.95899886865385, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:40:47,669] [INFO] [logging.py:96:log_dist] [Rank 0] step=4040, skipped=73, lr=[3.7041256769133745e-08, 3.7041256769133745e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:40:47,687] [INFO] [timer.py:199:stop] epoch=1/micro_step=1975/global_step=4040, RunningAvgSamplesPerSec=176.06726832577104, CurrSamplesPerSec=178.51520503926284, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:40:51,261] [INFO] [logging.py:96:log_dist] [Rank 0] step=4050, skipped=73, lr=[3.26407069630192e-08, 3.26407069630192e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:40:51,279] [INFO] [timer.py:199:stop] epoch=1/micro_step=1985/global_step=4050, RunningAvgSamplesPerSec=176.072912539877, CurrSamplesPerSec=178.7664056782137, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:40:54,474] [INFO] [loss_scaler.py:188:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, but hysteresis is 2. Reducing hysteresis to 1 +[2023-04-14 07:40:54,803] [INFO] [loss_scaler.py:181:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 65536, reducing to 32768 +[2023-04-14 07:40:54,804] [INFO] [logging.py:96:log_dist] [Rank 0] step=4060, skipped=75, lr=[2.9319910078355846e-08, 2.9319910078355846e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:40:54,804] [INFO] [timer.py:199:stop] epoch=1/micro_step=1995/global_step=4060, RunningAvgSamplesPerSec=176.08649326864563, CurrSamplesPerSec=194.44348700356093, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:40:58,370] [INFO] [logging.py:96:log_dist] [Rank 0] step=4070, skipped=75, lr=[2.5418669430112618e-08, 2.5418669430112618e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:40:58,389] [INFO] [timer.py:199:stop] epoch=1/micro_step=2005/global_step=4070, RunningAvgSamplesPerSec=176.09292012266923, CurrSamplesPerSec=178.70333107209834, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:41:01,952] [INFO] [logging.py:96:log_dist] [Rank 0] step=4080, skipped=75, lr=[2.1795144783081444e-08, 2.1795144783081444e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:41:01,971] [INFO] [timer.py:199:stop] epoch=1/micro_step=2015/global_step=4080, RunningAvgSamplesPerSec=176.09961201592117, CurrSamplesPerSec=178.95780580309375, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:41:05,534] [INFO] [logging.py:96:log_dist] [Rank 0] step=4090, skipped=75, lr=[1.8449545803627945e-08, 1.8449545803627945e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:41:05,553] [INFO] [timer.py:199:stop] epoch=1/micro_step=2025/global_step=4090, RunningAvgSamplesPerSec=176.10625276777216, CurrSamplesPerSec=178.75116848801414, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:41:09,123] [INFO] [logging.py:96:log_dist] [Rank 0] step=4100, skipped=75, lr=[1.538206607663065e-08, 1.538206607663065e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:41:09,141] [INFO] [timer.py:199:stop] epoch=1/micro_step=2035/global_step=4100, RunningAvgSamplesPerSec=176.11224617132692, CurrSamplesPerSec=178.71320585388196, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:41:12,710] [INFO] [logging.py:96:log_dist] [Rank 0] step=4110, skipped=75, lr=[1.2592883094279864e-08, 1.2592883094279864e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:41:12,729] [INFO] [timer.py:199:stop] epoch=1/micro_step=2045/global_step=4110, RunningAvgSamplesPerSec=176.11823327492866, CurrSamplesPerSec=177.62654782346888, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:41:16,292] [INFO] [logging.py:96:log_dist] [Rank 0] step=4120, skipped=75, lr=[1.008215824580811e-08, 1.008215824580811e-08], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:41:17,829] [INFO] [timer.py:199:stop] epoch=1/micro_step=2055/global_step=4120, RunningAvgSamplesPerSec=175.946326218234, CurrSamplesPerSec=34.11944142005577, MemAllocated=4.98GB, MaxMemAllocated=23.6GB +[2023-04-14 07:41:21,369] [INFO] [logging.py:96:log_dist] [Rank 0] step=4130, skipped=75, lr=[7.850036808149419e-09, 7.850036808149419e-09], mom=[(0.9, 0.95), (0.9, 0.95)] +[2023-04-14 07:41:21,387] [INFO] [timer.py:199:stop] epoch=1/micro_step=2065/global_step=4130, RunningAvgSamplesPerSec=175.9562787554597, CurrSamplesPerSec=192.90392440084798, MemAllocated=4.94GB, MaxMemAllocated=23.6GB +***** Evaluating perplexity, Epoch 2/2 ***** +ppl: 2.7940585613250732 +saving the final model ... +[2023-04-14 07:41:43,531] [INFO] [launch.py:460:main] Process 290012 exits successfully. +[2023-04-14 07:41:44,618] [INFO] [launch.py:460:main] Process 290013 exits successfully. +[2023-04-14 07:41:44,618] [INFO] [launch.py:460:main] Process 290009 exits successfully. +[2023-04-14 07:41:44,618] [INFO] [launch.py:460:main] Process 290014 exits successfully. +[2023-04-14 07:41:45,619] [INFO] [launch.py:460:main] Process 290017 exits successfully. +[2023-04-14 07:41:45,619] [INFO] [launch.py:460:main] Process 290016 exits successfully. +[2023-04-14 07:41:46,620] [INFO] [launch.py:460:main] Process 290011 exits successfully. +[2023-04-14 07:42:08,643] [INFO] [launch.py:460:main] Process 290008 exits successfully.