zen-E committed on
Commit
b980656
1 Parent(s): 65b6f19

Add model weights and configurations

Files changed (5)
  1. config.json +32 -0
  2. merges.txt +0 -0
  3. pytorch_model.bin +3 -0
  4. training.log +567 -0
  5. vocab.json +0 -0
config.json ADDED
@@ -0,0 +1,32 @@
+ {
+ "_name_or_path": "facebook/opt-350m",
+ "_remove_final_layer_norm": false,
+ "activation_dropout": 0.0,
+ "activation_function": "relu",
+ "architectures": [
+ "OPTForCausalLM"
+ ],
+ "attention_dropout": 0.0,
+ "bos_token_id": 2,
+ "do_layer_norm_before": false,
+ "dropout": 0.0,
+ "enable_bias": true,
+ "end_token_id": 2,
+ "eos_token_id": 2,
+ "ffn_dim": 4096,
+ "hidden_size": 1024,
+ "init_std": 0.02,
+ "layer_norm_elementwise_affine": true,
+ "layerdrop": 0.0,
+ "max_position_embeddings": 2048,
+ "model_type": "opt",
+ "num_attention_heads": 16,
+ "num_hidden_layers": 24,
+ "pad_token_id": 2,
+ "prefix": "</s>",
+ "torch_dtype": "float16",
+ "transformers_version": "4.29.0.dev0",
+ "use_cache": true,
+ "vocab_size": 50272,
+ "word_embed_proj_dim": 512
+ }
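
This is a standard transformers config for the 350M-parameter OPT trunk (24 layers, hidden size 1024, fp16 weights). A minimal loading sketch, assuming the files from this commit sit in a local directory; "./output" is a hypothetical path used for illustration, and since the training log below comes from a reward-model run, the saved state dict may carry a reward head on top of the OPTForCausalLM trunk:

```python
# Minimal sketch, assuming transformers >= 4.29 and this commit's files
# (config.json, pytorch_model.bin, vocab.json, merges.txt) in "./output".
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./output")  # vocab.json + merges.txt
model = AutoModelForCausalLM.from_pretrained(
    "./output",
    torch_dtype=torch.float16,  # matches "torch_dtype": "float16" above
)
```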
merges.txt ADDED
The diff for this file is too large to render.
 
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2213a4c0aa57841d27f932bf15a5894c84b2004d8244db1b59e7d208ac679c6d
+ size 662469311
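
The diff stores only a Git LFS pointer; the actual ~662 MB fp16 weight blob, identified by the sha256 above, lives in LFS storage and is resolved at clone or download time. A sketch of fetching it programmatically, assuming the repo id is known ("user/repo" is a placeholder, not the actual repository name):

```python
# Sketch, assuming huggingface_hub is installed; "user/repo" is a placeholder.
from huggingface_hub import hf_hub_download

weights_path = hf_hub_download(repo_id="user/repo", filename="pytorch_model.bin")
print(weights_path)  # local cache path to the resolved weight file
```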
training.log ADDED
@@ -0,0 +1,567 @@
1
+ 2023-04-24 08:57:03.466438: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
2
+ [2023-04-24 08:57:04,829] [WARNING] [runner.py:190:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
3
+ [2023-04-24 08:57:04,857] [INFO] [runner.py:540:main] cmd = /root/miniconda3/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None main.py --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets --data_split 2,4,4 --model_name_or_path facebook/opt-350m --num_padding_at_beginning 1 --per_device_train_batch_size 4 --per_device_eval_batch_size 4 --max_seq_len 512 --learning_rate 5e-5 --weight_decay 0.1 --num_train_epochs 1 --disable_dropout --gradient_accumulation_steps 16 --lr_scheduler_type cosine --num_warmup_steps 0 --seed 1234 --zero_stage 0 --deepspeed --output_dir ./output
4
+ 2023-04-24 08:57:07.026257: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
5
+ [2023-04-24 08:57:08,460] [INFO] [launch.py:229:main] WORLD INFO DICT: {'localhost': [0, 1]}
6
+ [2023-04-24 08:57:08,460] [INFO] [launch.py:235:main] nnodes=1, num_local_procs=2, node_rank=0
7
+ [2023-04-24 08:57:08,460] [INFO] [launch.py:246:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1]})
8
+ [2023-04-24 08:57:08,460] [INFO] [launch.py:247:main] dist_world_size=2
9
+ [2023-04-24 08:57:08,460] [INFO] [launch.py:249:main] Setting CUDA_VISIBLE_DEVICES=0,1
10
+ 2023-04-24 08:57:10.316031: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
11
+ 2023-04-24 08:57:10.364356: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0
12
+ [2023-04-24 08:57:12,808] [INFO] [comm.py:586:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
13
+ Found cached dataset parquet (/root/.cache/huggingface/datasets/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
14
+
15
  0%| | 0/2 [00:00<?, ?it/s]
16
+ Found cached dataset parquet (/root/.cache/huggingface/datasets/Dahoas___parquet/default-b9d2c4937d617106/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
17
+
18
  0%| | 0/2 [00:00<?, ?it/s]
19
+ Found cached dataset parquet (/root/.cache/huggingface/datasets/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
20
+
21
  0%| | 0/2 [00:00<?, ?it/s]
22
  50%|█████ | 1/2 [00:00<00:00, 1.04it/s]
23
+ Found cached dataset parquet (/root/.cache/huggingface/datasets/Dahoas___parquet/default-b25c081aeeca3652/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
24
+
25
  0%| | 0/2 [00:00<?, ?it/s]
26
+ Found cached dataset parquet (/root/.cache/huggingface/datasets/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
27
+
28
  0%| | 0/1 [00:00<?, ?it/s]
29
+ Found cached dataset parquet (/root/.cache/huggingface/datasets/Dahoas___parquet/Dahoas--synthetic-instruct-gptj-pairwise-0b2fd7bd9ea121cb/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
30
+
31
  0%| | 0/1 [00:00<?, ?it/s]
32
+ Found cached dataset parquet (/root/.cache/huggingface/datasets/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
33
+
34
  0%| | 0/2 [00:00<?, ?it/s]
35
+ Found cached dataset parquet (/root/.cache/huggingface/datasets/yitingxie___parquet/yitingxie--rlhf-reward-datasets-f2627438ff1fb9dd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
36
+
37
  0%| | 0/2 [00:00<?, ?it/s]
38
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
39
+ To disable this warning, you can either:
40
+ - Avoid using `tokenizers` before the fork if possible
41
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
42
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
43
+ To disable this warning, you can either:
44
+ - Avoid using `tokenizers` before the fork if possible
45
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
46
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
47
+ To disable this warning, you can either:
48
+ - Avoid using `tokenizers` before the fork if possible
49
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
50
+ Installed CUDA version 11.0 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
51
+ Using /root/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
52
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
53
+ To disable this warning, you can either:
54
+ - Avoid using `tokenizers` before the fork if possible
55
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
56
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
57
+ To disable this warning, you can either:
58
+ - Avoid using `tokenizers` before the fork if possible
59
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
60
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
61
+ To disable this warning, you can either:
62
+ - Avoid using `tokenizers` before the fork if possible
63
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
64
+ Detected CUDA files, patching ldflags
65
+ Emitting ninja build file /root/.cache/torch_extensions/py38_cu117/fused_adam/build.ninja...
66
+ Building extension module fused_adam...
67
+ Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
68
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
69
+ To disable this warning, you can either:
70
+ - Avoid using `tokenizers` before the fork if possible
71
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
72
+ ninja: no work to do.
73
+ Loading extension module fused_adam...
74
+ Time to load fused_adam op: 3.0242199897766113 seconds
75
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
76
+ To disable this warning, you can either:
77
+ - Avoid using `tokenizers` before the fork if possible
78
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
79
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
80
+ To disable this warning, you can either:
81
+ - Avoid using `tokenizers` before the fork if possible
82
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
83
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
84
+ To disable this warning, you can either:
85
+ - Avoid using `tokenizers` before the fork if possible
86
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
87
+ Installed CUDA version 11.0 does not match the version torch was compiled with 11.7 but since the APIs are compatible, accepting this combination
88
+ Using /root/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
89
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
90
+ To disable this warning, you can either:
91
+ - Avoid using `tokenizers` before the fork if possible
92
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
93
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
94
+ To disable this warning, you can either:
95
+ - Avoid using `tokenizers` before the fork if possible
96
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
97
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
98
+ To disable this warning, you can either:
99
+ - Avoid using `tokenizers` before the fork if possible
100
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
101
+ Detected CUDA files, patching ldflags
102
+ Emitting ninja build file /root/.cache/torch_extensions/py38_cu117/fused_adam/build.ninja...
103
+ Building extension module fused_adam...
104
+ Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
105
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
106
+ To disable this warning, you can either:
107
+ - Avoid using `tokenizers` before the fork if possible
108
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
109
+ ninja: no work to do.
110
+ Loading extension module fused_adam...
111
+ Time to load fused_adam op: 4.067524433135986 seconds
112
+ [2023-04-24 09:03:18,980] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed info: version=0.9.0, git-hash=unknown, git-branch=unknown
113
+ [2023-04-24 09:03:18,985] [INFO] [comm.py:580:init_distributed] Distributed backend already initialized
114
+ [2023-04-24 09:03:22,421] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False
115
+ [2023-04-24 09:03:22,422] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the client Optimizer
116
+ [2023-04-24 09:03:22,422] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer
117
+ [2023-04-24 09:03:22,442] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam
118
+ [2023-04-24 09:03:22,442] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 optimizer with dynamic loss scale
119
+ [2023-04-24 09:03:23,341] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam
120
+ [2023-04-24 09:03:23,342] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler
121
+ [2023-04-24 09:03:23,342] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = <torch.optim.lr_scheduler.LambdaLR object at 0x7efef01548e0>
122
+ [2023-04-24 09:03:23,342] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[5e-05, 5e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
123
+ [2023-04-24 09:03:23,342] [INFO] [config.py:953:print] DeepSpeedEngine configuration:
124
+ Using /root/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
125
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] activation_checkpointing_config {
126
+ "partition_activations": false,
127
+ "contiguous_memory_optimization": false,
128
+ "cpu_checkpointing": false,
129
+ "number_checkpoints": null,
130
+ "synchronize_checkpoint_boundary": false,
131
+ "profile": false
132
+ }
133
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True}
134
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] amp_enabled .................. False
135
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] amp_params ................... False
136
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] autotuning_config ............ {
137
+ "enabled": false,
138
+ "start_step": null,
139
+ "end_step": null,
140
+ "metric_path": null,
141
+ "arg_mappings": null,
142
+ "metric": "throughput",
143
+ "model_info": null,
144
+ "results_dir": "autotuning_results",
145
+ "exps_dir": "autotuning_exps",
146
+ "overwrite": true,
147
+ "fast": true,
148
+ "start_profile_step": 3,
149
+ "end_profile_step": 5,
150
+ "tuner_type": "gridsearch",
151
+ "tuner_early_stopping": 5,
152
+ "tuner_num_trials": 50,
153
+ "model_info_path": null,
154
+ "mp_size": 1,
155
+ "max_train_batch_size": null,
156
+ "min_train_batch_size": 1,
157
+ "max_train_micro_batch_size_per_gpu": 1.024000e+03,
158
+ "min_train_micro_batch_size_per_gpu": 1,
159
+ "num_tuning_micro_batch_sizes": 3
160
+ }
161
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] bfloat16_enabled ............. False
162
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] checkpoint_parallel_write_pipeline False
163
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] checkpoint_tag_validation_enabled True
164
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] checkpoint_tag_validation_fail False
165
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] comms_config ................. <deepspeed.comm.config.DeepSpeedCommsConfig object at 0x7efec839e700>
166
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] communication_data_type ...... None
167
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}}
168
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] curriculum_enabled_legacy .... False
169
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] curriculum_params_legacy ..... False
170
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}}
171
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] data_efficiency_enabled ...... False
172
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] dataloader_drop_last ......... False
173
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] disable_allgather ............ False
174
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] dump_state ................... False
175
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 100, 'delayed_shift': 2, 'min_scale': 1}
176
+ [2023-04-24 09:03:23,344] [INFO] [config.py:957:print] eigenvalue_enabled ........... False
177
+ [2023-04-24 09:03:23,345] [INFO] [config.py:957:print] eigenvalue_gas_boundary_resolution 1
178
+ [2023-04-24 09:03:23,345] [INFO] [config.py:957:print] eigenvalue_layer_name ........ bert.encoder.layer
179
+ [2023-04-24 09:03:23,345] [INFO] [config.py:957:print] eigenvalue_layer_num ......... 0
180
+ [2023-04-24 09:03:23,345] [INFO] [config.py:957:print] eigenvalue_max_iter .......... 100
181
+ [2023-04-24 09:03:23,345] [INFO] [config.py:957:print] eigenvalue_stability ......... 1e-06
182
+ [2023-04-24 09:03:23,345] [INFO] [config.py:957:print] eigenvalue_tol ............... 0.01
183
+ [2023-04-24 09:03:23,345] [INFO] [config.py:957:print] eigenvalue_verbose ........... False
184
+ [2023-04-24 09:03:23,345] [INFO] [config.py:957:print] elasticity_enabled ........... False
185
+ [2023-04-24 09:03:23,345] [INFO] [config.py:957:print] flops_profiler_config ........ {
186
+ "enabled": false,
187
+ "profile_step": 1,
188
+ "module_depth": -1,
189
+ "top_modules": 1,
190
+ "detailed": true,
191
+ "output_file": null
192
+ }
193
+ [2023-04-24 09:03:23,345] [INFO] [config.py:957:print] fp16_auto_cast ............... False
194
+ [2023-04-24 09:03:23,345] [INFO] [config.py:957:print] fp16_enabled ................. True
195
+ [2023-04-24 09:03:23,345] [INFO] [config.py:957:print] fp16_master_weights_and_gradients False
196
+ [2023-04-24 09:03:23,345] [INFO] [config.py:957:print] global_rank .................. 0
197
+ [2023-04-24 09:03:23,345] [INFO] [config.py:957:print] grad_accum_dtype ............. None
198
+ [2023-04-24 09:03:23,345] [INFO] [config.py:957:print] gradient_accumulation_steps .. 16
199
+ [2023-04-24 09:03:23,345] [INFO] [config.py:957:print] gradient_clipping ............ 1.0
200
+ [2023-04-24 09:03:23,345] [INFO] [config.py:957:print] gradient_predivide_factor .... 1.0
201
+ [2023-04-24 09:03:23,394] [INFO] [config.py:957:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8
202
+ [2023-04-24 09:03:23,394] [INFO] [config.py:957:print] initial_dynamic_scale ........ 65536
203
+ [2023-04-24 09:03:23,394] [INFO] [config.py:957:print] load_universal_checkpoint .... False
204
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] loss_scale ................... 0
205
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] memory_breakdown ............. False
206
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False
207
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] nebula_config ................ {
208
+ "enabled": false,
209
+ "persistent_storage_path": null,
210
+ "persistent_time_interval": 100,
211
+ "num_of_version_in_retention": 2,
212
+ "enable_nebula_load": true,
213
+ "load_path": null
214
+ }
215
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] optimizer_legacy_fusion ...... False
216
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] optimizer_name ............... None
217
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] optimizer_params ............. None
218
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0}
219
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] pld_enabled .................. False
220
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] pld_params ................... False
221
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] prescale_gradients ........... False
222
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] scheduler_name ............... None
223
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] scheduler_params ............. None
224
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] sparse_attention ............. None
225
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] sparse_gradients_enabled ..... False
226
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] steps_per_print .............. 10
227
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] train_batch_size ............. 128
228
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] train_micro_batch_size_per_gpu 4
229
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] use_node_local_storage ....... False
230
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] wall_clock_breakdown ......... False
231
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] world_size ................... 2
232
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] zero_allow_untested_optimizer False
233
+ [2023-04-24 09:03:23,395] [INFO] [config.py:957:print] zero_config .................. stage=0 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=False load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=30000000 param_persistence_threshold=10000 model_persistence_threshold=sys.maxsize max_live_parameters=30000000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False memory_efficient_linear=False
234
+ [2023-04-24 09:03:23,396] [INFO] [config.py:957:print] zero_enabled ................. False
235
+ [2023-04-24 09:03:23,396] [INFO] [config.py:957:print] zero_force_ds_cpu_optimizer .. True
236
+ [2023-04-24 09:03:23,396] [INFO] [config.py:957:print] zero_optimization_stage ...... 0
237
+ [2023-04-24 09:03:23,396] [INFO] [config.py:943:print_user_config] json = {
238
+ "train_batch_size": 128,
239
+ "train_micro_batch_size_per_gpu": 4,
240
+ "steps_per_print": 10,
241
+ "zero_optimization": {
242
+ "stage": 0,
243
+ "offload_param": {
244
+ "device": "none"
245
+ },
246
+ "offload_optimizer": {
247
+ "device": "none"
248
+ },
249
+ "stage3_param_persistence_threshold": 1.000000e+04,
250
+ "stage3_max_live_parameters": 3.000000e+07,
251
+ "stage3_prefetch_bucket_size": 3.000000e+07,
252
+ "memory_efficient_linear": false
253
+ },
254
+ "fp16": {
255
+ "enabled": true,
256
+ "loss_scale_window": 100
257
+ },
258
+ "gradient_clipping": 1.0,
259
+ "prescale_gradients": false,
260
+ "wall_clock_breakdown": false,
261
+ "hybrid_engine": {
262
+ "enabled": false,
263
+ "max_out_tokens": 512,
264
+ "inference_tp_size": 1,
265
+ "release_inference_cache": false,
266
+ "pin_parameters": true,
267
+ "tp_gather_partition_size": 8
268
+ }
269
+ }
270
+ Using /root/.cache/torch_extensions/py38_cu117 as PyTorch extensions root...
271
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
272
+ To disable this warning, you can either:
273
+ - Avoid using `tokenizers` before the fork if possible
274
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
275
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
276
+ To disable this warning, you can either:
277
+ - Avoid using `tokenizers` before the fork if possible
278
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
279
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
280
+ To disable this warning, you can either:
281
+ - Avoid using `tokenizers` before the fork if possible
282
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
283
+ Emitting ninja build file /root/.cache/torch_extensions/py38_cu117/utils/build.ninja...
284
+ Building extension module utils...
285
+ Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
286
+ huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
287
+ To disable this warning, you can either:
288
+ - Avoid using `tokenizers` before the fork if possible
289
+ - Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
290
+ ninja: no work to do.
291
+ Loading extension module utils...
292
+ Loading extension module utils...
293
+ Time to load utils op: 3.2755730152130127 seconds
294
+ Time to load utils op: 3.3291189670562744 seconds***** Running training *****
295
+
296
+ ***** Evaluating reward, Epoch 0/1 *****
297
+ chosen_last_scores (higher is better) : 2.8115077018737793, acc (higher is better) : 0.4898989796638489
298
+ Beginning of Epoch 1/1, Total Micro Batches 14720
299
+ [2023-04-24 09:03:39,912] [INFO] [fused_optimizer.py:362:_update_scale]
300
+ Grad overflow on iteration 0
301
+ [2023-04-24 09:03:39,912] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0
302
+ [2023-04-24 09:03:39,913] [INFO] [fused_optimizer.py:362:_update_scale]
303
+ Grad overflow on iteration 0
304
+ [2023-04-24 09:03:39,913] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 65536 to 32768.0
305
+ [2023-04-24 09:03:39,913] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 65536, reducing to 32768.0
306
+ [2023-04-24 09:03:41,990] [INFO] [fused_optimizer.py:362:_update_scale]
307
+ Grad overflow on iteration 1
308
+ [2023-04-24 09:03:41,990] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0
309
+ [2023-04-24 09:03:41,990] [INFO] [fused_optimizer.py:362:_update_scale]
310
+ Grad overflow on iteration 1
311
+ [2023-04-24 09:03:41,990] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 32768.0 to 16384.0
312
+ [2023-04-24 09:03:41,990] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 32768.0, reducing to 16384.0
313
+ [2023-04-24 09:03:44,060] [INFO] [fused_optimizer.py:362:_update_scale]
314
+ Grad overflow on iteration 2
315
+ [2023-04-24 09:03:44,060] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0
316
+ [2023-04-24 09:03:44,060] [INFO] [fused_optimizer.py:362:_update_scale]
317
+ Grad overflow on iteration 2
318
+ [2023-04-24 09:03:44,060] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 16384.0 to 8192.0
319
+ [2023-04-24 09:03:44,061] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 16384.0, reducing to 8192.0
320
+ [2023-04-24 09:03:46,124] [INFO] [fused_optimizer.py:362:_update_scale]
321
+ Grad overflow on iteration 3
322
+ [2023-04-24 09:03:46,125] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0
323
+ [2023-04-24 09:03:46,125] [INFO] [fused_optimizer.py:362:_update_scale]
324
+ Grad overflow on iteration 3
325
+ [2023-04-24 09:03:46,125] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 8192.0 to 4096.0
326
+ [2023-04-24 09:03:46,125] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 8192.0, reducing to 4096.0
327
+ [2023-04-24 09:03:48,204] [INFO] [fused_optimizer.py:362:_update_scale]
328
+ Grad overflow on iteration 4
329
+ [2023-04-24 09:03:48,204] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0
330
+ [2023-04-24 09:03:48,204] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0
331
+ [2023-04-24 09:03:48,204] [INFO] [fused_optimizer.py:362:_update_scale]
332
+ Grad overflow on iteration 4
333
+ [2023-04-24 09:03:48,204] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0
334
+ [2023-04-24 09:03:58,774] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=5, lr=[4.999635612423198e-05, 4.999635612423198e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
335
+ [2023-04-24 09:03:58,783] [INFO] [timer.py:199:stop] epoch=0/micro_step=160/global_step=10, RunningAvgSamplesPerSec=61.51210532361894, CurrSamplesPerSec=60.83514120602428, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
336
+ [2023-04-24 09:04:19,874] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=5, lr=[4.996721149113682e-05, 4.996721149113682e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
337
+ [2023-04-24 09:04:19,882] [INFO] [timer.py:199:stop] epoch=0/micro_step=320/global_step=20, RunningAvgSamplesPerSec=61.32068528160361, CurrSamplesPerSec=61.32577447546716, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
338
+ [2023-04-24 09:04:41,031] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=5, lr=[4.9908956206285e-05, 4.9908956206285e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
339
+ [2023-04-24 09:04:41,040] [INFO] [timer.py:199:stop] epoch=0/micro_step=480/global_step=30, RunningAvgSamplesPerSec=61.21340431017643, CurrSamplesPerSec=61.170166073067044, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
340
+ [2023-04-24 09:05:02,177] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=5, lr=[4.982165819273275e-05, 4.982165819273275e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
341
+ [2023-04-24 09:05:02,185] [INFO] [timer.py:199:stop] epoch=0/micro_step=640/global_step=40, RunningAvgSamplesPerSec=61.16040850334567, CurrSamplesPerSec=61.111364165150086, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
342
+ [2023-04-24 09:05:23,353] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=5, lr=[4.9705419236058825e-05, 4.9705419236058825e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
343
+ [2023-04-24 09:05:23,361] [INFO] [timer.py:199:stop] epoch=0/micro_step=800/global_step=50, RunningAvgSamplesPerSec=61.11057435313799, CurrSamplesPerSec=60.68853305528212, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
344
+ [2023-04-24 09:05:44,586] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=5, lr=[4.956037486568706e-05, 4.956037486568706e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
345
+ [2023-04-24 09:05:44,595] [INFO] [timer.py:199:stop] epoch=0/micro_step=960/global_step=60, RunningAvgSamplesPerSec=61.05265832097097, CurrSamplesPerSec=60.38742640692105, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
346
+ [2023-04-24 09:06:05,826] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=5, lr=[4.938669419686516e-05, 4.938669419686516e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
347
+ [2023-04-24 09:06:05,835] [INFO] [timer.py:199:stop] epoch=0/micro_step=1120/global_step=70, RunningAvgSamplesPerSec=61.01003172992011, CurrSamplesPerSec=60.64283669780565, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
348
+ [2023-04-24 09:06:27,080] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=5, lr=[4.9184579733483796e-05, 4.9184579733483796e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
349
+ [2023-04-24 09:06:27,088] [INFO] [timer.py:199:stop] epoch=0/micro_step=1280/global_step=80, RunningAvgSamplesPerSec=60.96517960528534, CurrSamplesPerSec=60.81354467704264, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
350
+ [2023-04-24 09:06:48,267] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=5, lr=[4.8954267131966225e-05, 4.8954267131966225e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
351
+ [2023-04-24 09:06:48,276] [INFO] [timer.py:199:stop] epoch=0/micro_step=1440/global_step=90, RunningAvgSamplesPerSec=60.94942552706825, CurrSamplesPerSec=60.94152112103058, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
352
+ [2023-04-24 09:07:09,486] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=5, lr=[4.8696024926503396e-05, 4.8696024926503396e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
353
+ [2023-04-24 09:07:09,495] [INFO] [timer.py:199:stop] epoch=0/micro_step=1600/global_step=100, RunningAvgSamplesPerSec=60.9296893471321, CurrSamplesPerSec=60.876502675978514, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
354
+ [2023-04-24 09:07:22,212] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations
355
+ [2023-04-24 09:07:22,212] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0
356
+ [2023-04-24 09:07:22,213] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations
357
+ [2023-04-24 09:07:22,213] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0
358
+ [2023-04-24 09:07:30,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=5, lr=[4.841015421595511e-05, 4.841015421595511e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
359
+ [2023-04-24 09:07:30,756] [INFO] [timer.py:199:stop] epoch=0/micro_step=1760/global_step=110, RunningAvgSamplesPerSec=60.90683530245522, CurrSamplesPerSec=60.700582142094774, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
360
+ [2023-04-24 09:07:51,981] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=5, lr=[4.8096988312782174e-05, 4.8096988312782174e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
361
+ [2023-04-24 09:07:51,989] [INFO] [timer.py:199:stop] epoch=0/micro_step=1920/global_step=120, RunningAvgSamplesPerSec=60.89160527368584, CurrSamplesPerSec=61.30029339871167, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
362
+ [2023-04-24 09:08:13,170] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=5, lr=[4.775689235441906e-05, 4.775689235441906e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
363
+ [2023-04-24 09:08:13,178] [INFO] [timer.py:199:stop] epoch=0/micro_step=2080/global_step=130, RunningAvgSamplesPerSec=60.886694362059956, CurrSamplesPerSec=61.00571500776447, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
364
+ [2023-04-24 09:08:34,429] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=5, lr=[4.739026287753991e-05, 4.739026287753991e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
365
+ [2023-04-24 09:08:34,437] [INFO] [timer.py:199:stop] epoch=0/micro_step=2240/global_step=140, RunningAvgSamplesPerSec=60.87182179154682, CurrSamplesPerSec=60.8425664012069, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
366
+ [2023-04-24 09:08:49,207] [INFO] [fused_optimizer.py:362:_update_scale]
367
+ Grad overflow on iteration 146
368
+ [2023-04-24 09:08:49,208] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0
369
+ [2023-04-24 09:08:49,208] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0
370
+ [2023-04-24 09:08:49,208] [INFO] [fused_optimizer.py:362:_update_scale]
371
+ Grad overflow on iteration 146
372
+ [2023-04-24 09:08:49,211] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0
373
+ [2023-04-24 09:08:55,561] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=6, lr=[4.703796286561679e-05, 4.703796286561679e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
374
+ [2023-04-24 09:08:55,570] [INFO] [timer.py:199:stop] epoch=0/micro_step=2400/global_step=150, RunningAvgSamplesPerSec=60.88030350132201, CurrSamplesPerSec=60.9012526100533, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
375
+ [2023-04-24 09:09:16,724] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=6, lr=[4.662212256151865e-05, 4.662212256151865e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
376
+ [2023-04-24 09:09:16,732] [INFO] [timer.py:199:stop] epoch=0/micro_step=2560/global_step=160, RunningAvgSamplesPerSec=60.881273455290064, CurrSamplesPerSec=60.82070966172722, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
377
+ [2023-04-24 09:09:39,439] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=6, lr=[4.618107182972209e-05, 4.618107182972209e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
378
+ [2023-04-24 09:09:39,447] [INFO] [timer.py:199:stop] epoch=0/micro_step=2720/global_step=170, RunningAvgSamplesPerSec=60.666898387575976, CurrSamplesPerSec=60.71654288031407, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
379
+ [2023-04-24 09:10:00,739] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=6, lr=[4.571532491565115e-05, 4.571532491565115e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
380
+ [2023-04-24 09:10:00,747] [INFO] [timer.py:199:stop] epoch=0/micro_step=2880/global_step=180, RunningAvgSamplesPerSec=60.659587093664726, CurrSamplesPerSec=60.579854984122456, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
381
+ [2023-04-24 09:10:22,004] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=6, lr=[4.522542485937369e-05, 4.522542485937369e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
382
+ [2023-04-24 09:10:22,012] [INFO] [timer.py:199:stop] epoch=0/micro_step=3040/global_step=190, RunningAvgSamplesPerSec=60.660020471050444, CurrSamplesPerSec=60.73080823974732, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
383
+ [2023-04-24 09:10:44,226] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=6, lr=[4.4711942862440933e-05, 4.4711942862440933e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
384
+ [2023-04-24 09:10:44,235] [INFO] [timer.py:199:stop] epoch=0/micro_step=3200/global_step=200, RunningAvgSamplesPerSec=60.537530952111894, CurrSamplesPerSec=60.675405272412185, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
385
+ [2023-04-24 09:11:05,529] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=6, lr=[4.417547762189207e-05, 4.417547762189207e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
386
+ [2023-04-24 09:11:05,537] [INFO] [timer.py:199:stop] epoch=0/micro_step=3360/global_step=210, RunningAvgSamplesPerSec=60.53805827930432, CurrSamplesPerSec=60.32567200360737, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
387
+ [2023-04-24 09:11:26,763] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=6, lr=[4.3616654632200224e-05, 4.3616654632200224e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
388
+ [2023-04-24 09:11:26,771] [INFO] [timer.py:199:stop] epoch=0/micro_step=3520/global_step=220, RunningAvgSamplesPerSec=60.54675766170691, CurrSamplesPerSec=60.738640896731475, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
389
+ [2023-04-24 09:11:48,387] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=6, lr=[4.3036125455973896e-05, 4.3036125455973896e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
390
+ [2023-04-24 09:11:48,394] [INFO] [timer.py:199:stop] epoch=0/micro_step=3680/global_step=230, RunningAvgSamplesPerSec=60.51558004486595, CurrSamplesPerSec=60.626093127130964, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
391
+ [2023-04-24 09:12:09,687] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=6, lr=[4.243456696426415e-05, 4.243456696426415e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
392
+ [2023-04-24 09:12:09,695] [INFO] [timer.py:199:stop] epoch=0/micro_step=3840/global_step=240, RunningAvgSamplesPerSec=60.51644089459085, CurrSamplesPerSec=60.49591898043134, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
393
+ [2023-04-24 09:12:26,745] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations
394
+ [2023-04-24 09:12:26,745] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0
395
+ [2023-04-24 09:12:26,746] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations
396
+ [2023-04-24 09:12:26,746] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0
397
+ [2023-04-24 09:12:31,546] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=6, lr=[4.181268054736318e-05, 4.181268054736318e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
398
+ [2023-04-24 09:12:31,554] [INFO] [timer.py:199:stop] epoch=0/micro_step=4000/global_step=250, RunningAvgSamplesPerSec=60.464991187448234, CurrSamplesPerSec=60.65873281120384, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
399
+ [2023-04-24 09:12:52,760] [INFO] [fused_optimizer.py:362:_update_scale]
400
+ Grad overflow on iteration 259
401
+ [2023-04-24 09:12:52,761] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0
402
+ [2023-04-24 09:12:52,761] [INFO] [fused_optimizer.py:362:_update_scale]
403
+ Grad overflow on iteration 259
404
+ [2023-04-24 09:12:52,761] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0
405
+ [2023-04-24 09:12:52,761] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0
406
+ [2023-04-24 09:12:52,762] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=7, lr=[4.123620120825459e-05, 4.123620120825459e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
407
+ [2023-04-24 09:12:52,763] [INFO] [timer.py:199:stop] epoch=0/micro_step=4160/global_step=260, RunningAvgSamplesPerSec=60.477141994274575, CurrSamplesPerSec=61.980818962332464, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
408
+ [2023-04-24 09:13:13,989] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=7, lr=[4.057770826965143e-05, 4.057770826965143e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
409
+ [2023-04-24 09:13:13,998] [INFO] [timer.py:199:stop] epoch=0/micro_step=4320/global_step=270, RunningAvgSamplesPerSec=60.48543743230756, CurrSamplesPerSec=60.899525538747255, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
410
+ [2023-04-24 09:13:35,911] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=7, lr=[3.990105242003333e-05, 3.990105242003333e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
411
+ [2023-04-24 09:13:35,919] [INFO] [timer.py:199:stop] epoch=0/micro_step=4480/global_step=280, RunningAvgSamplesPerSec=60.43255824293708, CurrSamplesPerSec=60.25923447881468, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
412
+ [2023-04-24 09:13:57,129] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=7, lr=[3.920702260985253e-05, 3.920702260985253e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
413
+ [2023-04-24 09:13:57,137] [INFO] [timer.py:199:stop] epoch=0/micro_step=4640/global_step=290, RunningAvgSamplesPerSec=60.44378189792225, CurrSamplesPerSec=60.88470438698686, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
414
+ [2023-04-24 09:14:18,402] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=7, lr=[3.849642804682212e-05, 3.849642804682212e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
415
+ [2023-04-24 09:14:18,411] [INFO] [timer.py:199:stop] epoch=0/micro_step=4800/global_step=300, RunningAvgSamplesPerSec=60.4487078291063, CurrSamplesPerSec=60.269672507682024, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
416
+ [2023-04-24 09:14:39,685] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=7, lr=[3.777009725241604e-05, 3.777009725241604e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
417
+ [2023-04-24 09:14:39,694] [INFO] [timer.py:199:stop] epoch=0/micro_step=4960/global_step=310, RunningAvgSamplesPerSec=60.45285396523987, CurrSamplesPerSec=60.6083187702923, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
418
+ [2023-04-24 09:15:00,887] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=7, lr=[3.702887709585007e-05, 3.702887709585007e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
419
+ [2023-04-24 09:15:00,896] [INFO] [timer.py:199:stop] epoch=0/micro_step=5120/global_step=320, RunningAvgSamplesPerSec=60.46319932441218, CurrSamplesPerSec=60.87013889778603, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
420
+ [2023-04-24 09:15:22,137] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=7, lr=[3.627363180667025e-05, 3.627363180667025e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
421
+ [2023-04-24 09:15:22,145] [INFO] [timer.py:199:stop] epoch=0/micro_step=5280/global_step=330, RunningAvgSamplesPerSec=60.469219800763234, CurrSamplesPerSec=60.56471073453441, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
422
+ [2023-04-24 09:15:43,479] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=7, lr=[3.550524196709989e-05, 3.550524196709989e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
423
+ [2023-04-24 09:15:43,487] [INFO] [timer.py:199:stop] epoch=0/micro_step=5440/global_step=340, RunningAvgSamplesPerSec=60.46675533195661, CurrSamplesPerSec=60.727881817976154, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
424
+ [2023-04-24 09:16:04,683] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=7, lr=[3.472460348532002e-05, 3.472460348532002e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
425
+ [2023-04-24 09:16:04,691] [INFO] [timer.py:199:stop] epoch=0/micro_step=5600/global_step=350, RunningAvgSamplesPerSec=60.47585607444012, CurrSamplesPerSec=60.889683102441296, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
426
+ [2023-04-24 09:16:25,916] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=7, lr=[3.3932626550880615e-05, 3.3932626550880615e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
427
+ [2023-04-24 09:16:25,924] [INFO] [timer.py:199:stop] epoch=0/micro_step=5760/global_step=360, RunningAvgSamplesPerSec=60.48214856403934, CurrSamplesPerSec=60.8817424030914, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
428
+ [2023-04-24 09:16:28,026] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations
429
+ [2023-04-24 09:16:28,026] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0
430
+ [2023-04-24 09:16:28,027] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations
431
+ [2023-04-24 09:16:28,027] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0
432
+ [2023-04-24 09:16:47,552] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=7, lr=[3.313023457346025e-05, 3.313023457346025e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
433
+ [2023-04-24 09:16:47,560] [INFO] [timer.py:199:stop] epoch=0/micro_step=5920/global_step=370, RunningAvgSamplesPerSec=60.46301516803084, CurrSamplesPerSec=60.85208322848843, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
434
+ [2023-04-24 09:17:08,718] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=7, lr=[3.231836310621171e-05, 3.231836310621171e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
435
+ [2023-04-24 09:17:08,726] [INFO] [timer.py:199:stop] epoch=0/micro_step=6080/global_step=380, RunningAvgSamplesPerSec=60.474004456508, CurrSamplesPerSec=61.13067380947854, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
436
+ [2023-04-24 09:17:30,054] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=7, lr=[3.149795875494889e-05, 3.149795875494889e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
437
+ [2023-04-24 09:17:30,061] [INFO] [timer.py:199:stop] epoch=0/micro_step=6240/global_step=390, RunningAvgSamplesPerSec=60.47432227036013, CurrSamplesPerSec=58.149482485973614, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
438
+ [2023-04-24 09:17:51,261] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=7, lr=[3.066997807444675e-05, 3.066997807444675e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
439
+ [2023-04-24 09:17:51,269] [INFO] [timer.py:199:stop] epoch=0/micro_step=6400/global_step=400, RunningAvgSamplesPerSec=60.48208533846094, CurrSamplesPerSec=60.273394001433886, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
440
+ [2023-04-24 09:18:12,505] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=7, lr=[2.9835386453141245e-05, 2.9835386453141245e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
441
+ [2023-04-24 09:18:12,514] [INFO] [timer.py:199:stop] epoch=0/micro_step=6560/global_step=410, RunningAvgSamplesPerSec=60.486959538097835, CurrSamplesPerSec=60.66644405655291, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
442
+ [2023-04-24 09:18:25,203] [INFO] [fused_optimizer.py:362:_update_scale]
443
+ Grad overflow on iteration 415
444
+ [2023-04-24 09:18:25,203] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0
445
+ [2023-04-24 09:18:25,203] [INFO] [logging.py:96:log_dist] [Rank 0] Overflow detected. Skipping step. Attempted loss scale: 4096.0, reducing to 2048.0
446
+ [2023-04-24 09:18:25,203] [INFO] [fused_optimizer.py:362:_update_scale]
447
+ Grad overflow on iteration 415
448
+ [2023-04-24 09:18:25,204] [INFO] [fused_optimizer.py:363:_update_scale] Reducing dynamic loss scale from 4096.0 to 2048.0
449
+ [2023-04-24 09:18:33,772] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=8, lr=[2.907940576282856e-05, 2.907940576282856e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
450
+ [2023-04-24 09:18:33,780] [INFO] [timer.py:199:stop] epoch=0/micro_step=6720/global_step=420, RunningAvgSamplesPerSec=60.491988861973766, CurrSamplesPerSec=60.757282498306424, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
451
+ [2023-04-24 09:18:54,892] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=8, lr=[2.8234939691393763e-05, 2.8234939691393763e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
452
+ [2023-04-24 09:18:54,900] [INFO] [timer.py:199:stop] epoch=0/micro_step=6880/global_step=430, RunningAvgSamplesPerSec=60.503909930629604, CurrSamplesPerSec=61.07392085328382, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
453
+ [2023-04-24 09:19:16,124] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=8, lr=[2.7386701824985255e-05, 2.7386701824985255e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
454
+ [2023-04-24 09:19:16,132] [INFO] [timer.py:199:stop] epoch=0/micro_step=7040/global_step=440, RunningAvgSamplesPerSec=60.50881644289209, CurrSamplesPerSec=60.6803155230895, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
455
+ [2023-04-24 09:19:37,512] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=8, lr=[2.653568117101159e-05, 2.653568117101159e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
456
+ [2023-04-24 09:19:37,522] [INFO] [timer.py:199:stop] epoch=0/micro_step=7200/global_step=450, RunningAvgSamplesPerSec=60.50429386623982, CurrSamplesPerSec=60.66465487297247, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
457
+ [2023-04-24 09:19:58,781] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=8, lr=[2.5682869981487152e-05, 2.5682869981487152e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
458
+ [2023-04-24 09:19:58,789] [INFO] [timer.py:199:stop] epoch=0/micro_step=7360/global_step=460, RunningAvgSamplesPerSec=60.506566014202704, CurrSamplesPerSec=60.70036252595932, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
459
+ [2023-04-24 09:20:20,094] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=8, lr=[2.482926259611067e-05, 2.482926259611067e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
460
+ [2023-04-24 09:20:20,102] [INFO] [timer.py:199:stop] epoch=0/micro_step=7520/global_step=470, RunningAvgSamplesPerSec=60.506473546730156, CurrSamplesPerSec=60.541926625389024, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
461
+ [2023-04-24 09:20:41,369] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=8, lr=[2.3975854282909644e-05, 2.3975854282909644e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
462
+ [2023-04-24 09:20:41,377] [INFO] [timer.py:199:stop] epoch=0/micro_step=7680/global_step=480, RunningAvgSamplesPerSec=60.50861426244519, CurrSamplesPerSec=60.75233915968026, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
463
+ [2023-04-24 09:21:02,634] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=8, lr=[2.3123640077802307e-05, 2.3123640077802307e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
464
+ [2023-04-24 09:21:02,642] [INFO] [timer.py:199:stop] epoch=0/micro_step=7840/global_step=490, RunningAvgSamplesPerSec=60.5114010784725, CurrSamplesPerSec=60.55857591539884, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
465
+ [2023-04-24 09:21:23,895] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=8, lr=[2.2273613624430255e-05, 2.2273613624430255e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
466
+ [2023-04-24 09:21:23,904] [INFO] [timer.py:199:stop] epoch=0/micro_step=8000/global_step=500, RunningAvgSamplesPerSec=60.51378147937269, CurrSamplesPerSec=60.44800739331908, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
467
+ [2023-04-24 09:21:45,129] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=8, lr=[2.1426766015614466e-05, 2.1426766015614466e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
468
+ [2023-04-24 09:21:45,137] [INFO] [timer.py:199:stop] epoch=0/micro_step=8160/global_step=510, RunningAvgSamplesPerSec=60.518117022263134, CurrSamplesPerSec=60.77689189221315, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
469
+ [2023-04-24 09:21:59,990] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations
470
+ [2023-04-24 09:21:59,991] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations
471
+ [2023-04-24 09:21:59,991] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0
472
+ [2023-04-24 09:21:59,991] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 2048.0 to 4096.0
473
+ [2023-04-24 09:22:06,380] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=8, lr=[2.0584084637785317e-05, 2.0584084637785317e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
474
+ [2023-04-24 09:22:06,388] [INFO] [timer.py:199:stop] epoch=0/micro_step=8320/global_step=520, RunningAvgSamplesPerSec=60.52123347333187, CurrSamplesPerSec=60.490718188456654, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
475
+ [2023-04-24 09:22:27,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=8, lr=[1.9746552019734245e-05, 1.9746552019734245e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
476
+ [2023-04-24 09:22:27,676] [INFO] [timer.py:199:stop] epoch=0/micro_step=8480/global_step=530, RunningAvgSamplesPerSec=60.52210681196064, CurrSamplesPerSec=60.78351833929842, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:22:48,993] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=8, lr=[1.8915144687029106e-05, 1.8915144687029106e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:22:49,002] [INFO] [timer.py:199:stop] epoch=0/micro_step=8640/global_step=540, RunningAvgSamplesPerSec=60.52100741071719, CurrSamplesPerSec=60.611993244597265, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:23:10,219] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=8, lr=[1.8090832023429023e-05, 1.8090832023429023e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:23:10,227] [INFO] [timer.py:199:stop] epoch=0/micro_step=8800/global_step=550, RunningAvgSamplesPerSec=60.52494703545672, CurrSamplesPerSec=60.83799524488308, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:23:31,493] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=8, lr=[1.7274575140626318e-05, 1.7274575140626318e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:23:31,501] [INFO] [timer.py:199:stop] epoch=0/micro_step=8960/global_step=560, RunningAvgSamplesPerSec=60.526290993953985, CurrSamplesPerSec=60.75868520542618, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:23:52,791] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=8, lr=[1.6467325757633244e-05, 1.6467325757633244e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:23:52,799] [INFO] [timer.py:199:stop] epoch=0/micro_step=9120/global_step=570, RunningAvgSamplesPerSec=60.526357511453874, CurrSamplesPerSec=60.875315405985745, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:24:14,095] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=8, lr=[1.567002509112022e-05, 1.567002509112022e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:24:14,109] [INFO] [timer.py:199:stop] epoch=0/micro_step=9280/global_step=580, RunningAvgSamplesPerSec=60.52591377902791, CurrSamplesPerSec=60.04186181235818, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:24:35,414] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=8, lr=[1.4883602757999259e-05, 1.4883602757999259e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:24:35,423] [INFO] [timer.py:199:stop] epoch=0/micro_step=9440/global_step=590, RunningAvgSamplesPerSec=60.525288947752536, CurrSamplesPerSec=60.67137342614858, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:24:56,686] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=8, lr=[1.4108975691532272e-05, 1.4108975691532272e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:24:56,695] [INFO] [timer.py:199:stop] epoch=0/micro_step=9600/global_step=600, RunningAvgSamplesPerSec=60.526813985603944, CurrSamplesPerSec=60.86025766289617, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:25:17,956] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=8, lr=[1.334704707222787e-05, 1.334704707222787e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:25:17,964] [INFO] [timer.py:199:stop] epoch=0/micro_step=9760/global_step=610, RunningAvgSamplesPerSec=60.52805583402273, CurrSamplesPerSec=60.2123857161619, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:25:32,842] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations
+ [2023-04-24 09:25:32,842] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0
+ [2023-04-24 09:25:32,842] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations
+ [2023-04-24 09:25:32,842] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 4096.0 to 8192.0
+ [2023-04-24 09:25:39,246] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=8, lr=[1.2598705274773297e-05, 1.2598705274773297e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:25:39,254] [INFO] [timer.py:199:stop] epoch=0/micro_step=9920/global_step=620, RunningAvgSamplesPerSec=60.52822029251744, CurrSamplesPerSec=60.83554792466907, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:26:00,590] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=8, lr=[1.1864822832229319e-05, 1.1864822832229319e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:26:00,598] [INFO] [timer.py:199:stop] epoch=0/micro_step=10080/global_step=630, RunningAvgSamplesPerSec=60.5263020248062, CurrSamplesPerSec=60.54541552543446, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:26:21,856] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=8, lr=[1.1146255418695634e-05, 1.1146255418695634e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:26:21,864] [INFO] [timer.py:199:stop] epoch=0/micro_step=10240/global_step=640, RunningAvgSamplesPerSec=60.52784305674766, CurrSamplesPerSec=60.71732568721792, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:26:43,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=8, lr=[1.0443840851633227e-05, 1.0443840851633227e-05], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:26:43,087] [INFO] [timer.py:199:stop] epoch=0/micro_step=10400/global_step=650, RunningAvgSamplesPerSec=60.53106464563775, CurrSamplesPerSec=60.53981026749469, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:27:04,361] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=8, lr=[9.758398115006636e-06, 9.758398115006636e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:27:04,370] [INFO] [timer.py:199:stop] epoch=0/micro_step=10560/global_step=660, RunningAvgSamplesPerSec=60.531647957004324, CurrSamplesPerSec=60.6890270012412, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:27:25,625] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=8, lr=[9.090726404385318e-06, 9.090726404385318e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:27:25,634] [INFO] [timer.py:199:stop] epoch=0/micro_step=10720/global_step=670, RunningAvgSamplesPerSec=60.53316559150195, CurrSamplesPerSec=60.11515009614578, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:27:46,860] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=8, lr=[8.441604195117314e-06, 8.441604195117314e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:27:46,868] [INFO] [timer.py:199:stop] epoch=0/micro_step=10880/global_step=680, RunningAvgSamplesPerSec=60.53583251320617, CurrSamplesPerSec=60.856180518573986, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:28:08,155] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=8, lr=[7.811788334661871e-06, 7.811788334661871e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:28:08,164] [INFO] [timer.py:199:stop] epoch=0/micro_step=11040/global_step=690, RunningAvgSamplesPerSec=60.53599184033059, CurrSamplesPerSec=60.01363900326577, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:28:29,424] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=8, lr=[7.202013160139159e-06, 7.202013160139159e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:28:29,432] [INFO] [timer.py:199:stop] epoch=0/micro_step=11200/global_step=700, RunningAvgSamplesPerSec=60.536899328768676, CurrSamplesPerSec=60.566733174475715, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:28:50,680] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=8, lr=[6.612989642125977e-06, 6.612989642125977e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:28:50,688] [INFO] [timer.py:199:stop] epoch=0/micro_step=11360/global_step=710, RunningAvgSamplesPerSec=60.53802065850707, CurrSamplesPerSec=60.684430870369724, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:29:05,535] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations
+ [2023-04-24 09:29:05,535] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8192.0 to 16384.0
+ [2023-04-24 09:29:05,535] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations
+ [2023-04-24 09:29:05,535] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 8192.0 to 16384.0
+ [2023-04-24 09:29:11,912] [INFO] [logging.py:96:log_dist] [Rank 0] step=720, skipped=8, lr=[6.045404555695935e-06, 6.045404555695935e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:29:11,920] [INFO] [timer.py:199:stop] epoch=0/micro_step=11520/global_step=720, RunningAvgSamplesPerSec=60.539960893126654, CurrSamplesPerSec=60.72665912434403, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:29:33,098] [INFO] [logging.py:96:log_dist] [Rank 0] step=730, skipped=8, lr=[5.499919679670385e-06, 5.499919679670385e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:29:33,106] [INFO] [timer.py:199:stop] epoch=0/micro_step=11680/global_step=730, RunningAvgSamplesPerSec=60.543559486117964, CurrSamplesPerSec=60.87660621916176, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:29:54,353] [INFO] [logging.py:96:log_dist] [Rank 0] step=740, skipped=8, lr=[4.977171025013961e-06, 4.977171025013961e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:29:54,361] [INFO] [timer.py:199:stop] epoch=0/micro_step=11840/global_step=740, RunningAvgSamplesPerSec=60.54431851547979, CurrSamplesPerSec=59.92725649116893, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:30:15,620] [INFO] [logging.py:96:log_dist] [Rank 0] step=750, skipped=8, lr=[4.4777680932742124e-06, 4.4777680932742124e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:30:15,628] [INFO] [timer.py:199:stop] epoch=0/micro_step=12000/global_step=750, RunningAvgSamplesPerSec=60.54506897971798, CurrSamplesPerSec=59.97723355801344, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:30:36,879] [INFO] [logging.py:96:log_dist] [Rank 0] step=760, skipped=8, lr=[4.002293165930087e-06, 4.002293165930087e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:30:36,888] [INFO] [timer.py:199:stop] epoch=0/micro_step=12160/global_step=760, RunningAvgSamplesPerSec=60.545861716922474, CurrSamplesPerSec=60.784619446606946, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:30:58,108] [INFO] [logging.py:96:log_dist] [Rank 0] step=770, skipped=8, lr=[3.5513006254777633e-06, 3.5513006254777633e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:30:58,117] [INFO] [timer.py:199:stop] epoch=0/micro_step=12320/global_step=770, RunningAvgSamplesPerSec=60.54774362050558, CurrSamplesPerSec=60.86130635686519, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:31:19,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=780, skipped=8, lr=[3.125316309045434e-06, 3.125316309045434e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:31:19,346] [INFO] [timer.py:199:stop] epoch=0/micro_step=12480/global_step=780, RunningAvgSamplesPerSec=60.54979854118585, CurrSamplesPerSec=60.50332977926978, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:31:40,621] [INFO] [logging.py:96:log_dist] [Rank 0] step=790, skipped=8, lr=[2.7248368952908053e-06, 2.7248368952908053e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:31:40,629] [INFO] [timer.py:199:stop] epoch=0/micro_step=12640/global_step=790, RunningAvgSamplesPerSec=60.54961566969219, CurrSamplesPerSec=60.700801759819385, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:32:01,909] [INFO] [logging.py:96:log_dist] [Rank 0] step=800, skipped=8, lr=[2.3503293252959136e-06, 2.3503293252959136e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:32:01,917] [INFO] [timer.py:199:stop] epoch=0/micro_step=12800/global_step=800, RunningAvgSamplesPerSec=60.54940154367315, CurrSamplesPerSec=60.67537784307631, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:32:23,227] [INFO] [logging.py:96:log_dist] [Rank 0] step=810, skipped=8, lr=[2.002230258134777e-06, 2.002230258134777e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:32:23,236] [INFO] [timer.py:199:stop] epoch=0/micro_step=12960/global_step=810, RunningAvgSamplesPerSec=60.5486748475695, CurrSamplesPerSec=60.55637642773143, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:32:38,117] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations
+ [2023-04-24 09:32:38,117] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16384.0 to 32768.0
+ [2023-04-24 09:32:38,117] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations
+ [2023-04-24 09:32:38,118] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 16384.0 to 32768.0
+ [2023-04-24 09:32:44,520] [INFO] [logging.py:96:log_dist] [Rank 0] step=820, skipped=8, lr=[1.6809455617484121e-06, 1.6809455617484121e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:32:44,528] [INFO] [timer.py:199:stop] epoch=0/micro_step=13120/global_step=820, RunningAvgSamplesPerSec=60.548375802646994, CurrSamplesPerSec=60.877641670367076, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:33:05,712] [INFO] [logging.py:96:log_dist] [Rank 0] step=830, skipped=8, lr=[1.3868498397209884e-06, 1.3868498397209884e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:33:05,720] [INFO] [timer.py:199:stop] epoch=0/micro_step=13280/global_step=830, RunningAvgSamplesPerSec=60.55139022382489, CurrSamplesPerSec=60.4629571400384, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:33:26,947] [INFO] [logging.py:96:log_dist] [Rank 0] step=840, skipped=8, lr=[1.120285994508799e-06, 1.120285994508799e-06], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:33:26,956] [INFO] [timer.py:199:stop] epoch=0/micro_step=13440/global_step=840, RunningAvgSamplesPerSec=60.55289115435484, CurrSamplesPerSec=60.43421471056814, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:33:48,352] [INFO] [logging.py:96:log_dist] [Rank 0] step=850, skipped=8, lr=[8.815648276313343e-07, 8.815648276313343e-07], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:33:48,360] [INFO] [timer.py:199:stop] epoch=0/micro_step=13600/global_step=850, RunningAvgSamplesPerSec=60.54976302656956, CurrSamplesPerSec=60.760466186705784, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:34:09,519] [INFO] [logging.py:96:log_dist] [Rank 0] step=860, skipped=8, lr=[6.70964677290617e-07, 6.70964677290617e-07], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:34:09,527] [INFO] [timer.py:199:stop] epoch=0/micro_step=13760/global_step=860, RunningAvgSamplesPerSec=60.553324194365025, CurrSamplesPerSec=60.862858766256956, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:34:30,746] [INFO] [logging.py:96:log_dist] [Rank 0] step=870, skipped=8, lr=[4.887310938412992e-07, 4.887310938412992e-07], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:34:30,754] [INFO] [timer.py:199:stop] epoch=0/micro_step=13920/global_step=870, RunningAvgSamplesPerSec=60.55515450846327, CurrSamplesPerSec=60.64690585743714, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:34:52,006] [INFO] [logging.py:96:log_dist] [Rank 0] step=880, skipped=8, lr=[3.350765534899519e-07, 3.350765534899519e-07], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:34:52,015] [INFO] [timer.py:199:stop] epoch=0/micro_step=14080/global_step=880, RunningAvgSamplesPerSec=60.55577531441298, CurrSamplesPerSec=60.50703244993614, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:35:13,281] [INFO] [logging.py:96:log_dist] [Rank 0] step=890, skipped=8, lr=[2.1018021055729253e-07, 2.1018021055729253e-07], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:35:13,288] [INFO] [timer.py:199:stop] epoch=0/micro_step=14240/global_step=890, RunningAvgSamplesPerSec=60.556210297272315, CurrSamplesPerSec=60.90302814146876, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:35:34,603] [INFO] [logging.py:96:log_dist] [Rank 0] step=900, skipped=8, lr=[1.1418768859227935e-07, 1.1418768859227935e-07], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:35:34,611] [INFO] [timer.py:199:stop] epoch=0/micro_step=14400/global_step=900, RunningAvgSamplesPerSec=60.555173309646946, CurrSamplesPerSec=60.68965816611215, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:35:55,889] [INFO] [logging.py:96:log_dist] [Rank 0] step=910, skipped=8, lr=[4.721091058154936e-08, 4.721091058154936e-08], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:35:55,897] [INFO] [timer.py:199:stop] epoch=0/micro_step=14560/global_step=910, RunningAvgSamplesPerSec=60.55538593711447, CurrSamplesPerSec=60.681419753149214, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ [2023-04-24 09:36:10,761] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations
+ [2023-04-24 09:36:10,762] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32768.0 to 65536.0
+ [2023-04-24 09:36:10,762] [INFO] [fused_optimizer.py:370:_update_scale] No Grad overflow for 100 iterations
+ [2023-04-24 09:36:10,762] [INFO] [fused_optimizer.py:371:_update_scale] Increasing dynamic loss scale from 32768.0 to 65536.0
+ [2023-04-24 09:36:17,130] [INFO] [logging.py:96:log_dist] [Rank 0] step=920, skipped=8, lr=[9.327968452232938e-09, 9.327968452232938e-09], mom=[(0.9, 0.95), (0.9, 0.95)]
+ [2023-04-24 09:36:17,138] [INFO] [timer.py:199:stop] epoch=0/micro_step=14720/global_step=920, RunningAvgSamplesPerSec=60.55687921469749, CurrSamplesPerSec=61.34238112970018, MemAllocated=4.34GB, MaxMemAllocated=13.4GB
+ Epoch 1/1 with loss 0.5625120004881983
+ ***** Evaluating reward, Epoch 1/1 *****
+ chosen_last_scores (higher is better) : 1.9881560802459717, acc (higher is better) : 0.691919207572937
+ saving model ...
+ [2023-04-24 09:36:43,700] [INFO] [launch.py:460:main] Process 22627 exits successfully.
+ [2023-04-24 09:36:46,703] [INFO] [launch.py:460:main] Process 22628 exits successfully.
vocab.json ADDED
The diff for this file is too large to render. See raw diff