diff --git "a/training.log" "b/training.log" --- "a/training.log" +++ "b/training.log" @@ -1,32 +1,29 @@ -[2023-12-10 06:04:48,143] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2023-12-10 06:04:50,372] [WARNING] [runner.py:203:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. -[2023-12-10 06:04:50,372] [INFO] [runner.py:570:main] cmd = /home/t-sokumar/miniconda3/envs/ft/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgM119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None main.py --data_path local/jsonfile --data_split 1,0,0 --model_name_or_path codellama/CodeLlama-7b-hf --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --max_seq_len 512 --learning_rate 9.65e-6 --weight_decay 0. --num_train_epochs 100 --gradient_accumulation_steps 1 --lr_scheduler_type cosine --num_warmup_steps 0 --seed 1234 --gradient_checkpointing --zero_stage 3 --deepspeed --lora_dim 128 --lora_module_name layers. --output_dir ./output_step1_Codellama_7b_lora_100 --add_eot_token -[2023-12-10 06:04:52,920] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2023-12-10 06:04:54,723] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3]} -[2023-12-10 06:04:54,723] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=4, node_rank=0 -[2023-12-10 06:04:54,723] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(, {'localhost': [0, 1, 2, 3]}) -[2023-12-10 06:04:54,723] [INFO] [launch.py:163:main] dist_world_size=4 -[2023-12-10 06:04:54,723] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3 -[2023-12-10 06:04:58,235] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2023-12-10 06:04:58,237] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2023-12-10 06:04:58,242] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) -[2023-12-10 06:04:58,248] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-12-11 05:39:03,031] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-12-11 05:39:04,827] [WARNING] [runner.py:203:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only. +[2023-12-11 05:39:04,828] [INFO] [runner.py:570:main] cmd = /home/t-sokumar/miniconda3/envs/ft/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMSwgMiwgM119 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None main.py --data_path local/jsonfile --data_split 1,0,0 --model_name_or_path codellama/CodeLlama-7b-hf --per_device_train_batch_size 8 --per_device_eval_batch_size 8 --max_seq_len 512 --learning_rate 9.65e-6 --weight_decay 0. --num_train_epochs 3 --gradient_accumulation_steps 1 --lr_scheduler_type cosine --num_warmup_steps 0 --seed 1234 --gradient_checkpointing --zero_stage 3 --deepspeed --lora_dim 128 --lora_module_name layers. 
--output_dir ./output_step1_Codellama_7b_lora_llamahub-devrev --add_eot_token +[2023-12-11 05:39:07,364] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-12-11 05:39:09,159] [INFO] [launch.py:145:main] WORLD INFO DICT: {'localhost': [0, 1, 2, 3]} +[2023-12-11 05:39:09,159] [INFO] [launch.py:151:main] nnodes=1, num_local_procs=4, node_rank=0 +[2023-12-11 05:39:09,159] [INFO] [launch.py:162:main] global_rank_mapping=defaultdict(, {'localhost': [0, 1, 2, 3]}) +[2023-12-11 05:39:09,159] [INFO] [launch.py:163:main] dist_world_size=4 +[2023-12-11 05:39:09,159] [INFO] [launch.py:165:main] Setting CUDA_VISIBLE_DEVICES=0,1,2,3 +[2023-12-11 05:39:12,594] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-12-11 05:39:12,600] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-12-11 05:39:12,605] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) +[2023-12-11 05:39:12,606] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect) /home/t-sokumar/miniconda3/envs/ft/lib/python3.11/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations warnings.warn( -[2023-12-10 06:04:59,732] [INFO] [comm.py:637:init_distributed] cdb=None -[2023-12-10 06:04:59,732] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl /home/t-sokumar/miniconda3/envs/ft/lib/python3.11/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations warnings.warn( /home/t-sokumar/miniconda3/envs/ft/lib/python3.11/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations warnings.warn( /home/t-sokumar/miniconda3/envs/ft/lib/python3.11/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations warnings.warn( -[2023-12-10 06:05:00,174] [INFO] [comm.py:637:init_distributed] cdb=None -[2023-12-10 06:05:00,206] [INFO] [comm.py:637:init_distributed] cdb=None -[2023-12-10 06:05:00,222] [INFO] [comm.py:637:init_distributed] cdb=None -The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. -The tokenizer class you load from this checkpoint is 'CodeLlamaTokenizer'. -The class this function is called from is 'LlamaTokenizer'. +[2023-12-11 05:39:14,179] [INFO] [comm.py:637:init_distributed] cdb=None +[2023-12-11 05:39:14,179] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl +[2023-12-11 05:39:14,642] [INFO] [comm.py:637:init_distributed] cdb=None +[2023-12-11 05:39:14,646] [INFO] [comm.py:637:init_distributed] cdb=None +[2023-12-11 05:39:14,678] [INFO] [comm.py:637:init_distributed] cdb=None The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. 
It may result in unexpected tokenization. The tokenizer class you load from this checkpoint is 'CodeLlamaTokenizer'. The class this function is called from is 'LlamaTokenizer'. @@ -39,12 +36,15 @@ The class this function is called from is 'LlamaTokenizer'. You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 +The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. +The tokenizer class you load from this checkpoint is 'CodeLlamaTokenizer'. +The class this function is called from is 'LlamaTokenizer'. You are using the default legacy behaviour of the . This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. 
This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 -[2023-12-10 06:05:02,823] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 291, num_elems = 6.74B - Loading checkpoint shards: 0%| | 0/2 [00:00 -[2023-12-10 06:05:13,148] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False -[2023-12-10 06:05:13,148] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.float16 ZeRO stage 3 optimizer -[2023-12-10 06:05:13,281] [INFO] [utils.py:795:see_memory_usage] Stage 3 initialize beginning -[2023-12-10 06:05:13,282] [INFO] [utils.py:796:see_memory_usage] MA 4.37 GB Max_MA 4.75 GB CA 11.16 GB Max_CA 11 GB -[2023-12-10 06:05:13,282] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 46.59 GB, percent = 18.5% -[2023-12-10 06:05:13,284] [INFO] [stage3.py:127:__init__] Reduce bucket size 500,000,000 -[2023-12-10 06:05:13,284] [INFO] [stage3.py:128:__init__] Prefetch bucket size 30000000 -[2023-12-10 06:05:13,418] [INFO] [utils.py:795:see_memory_usage] DeepSpeedZeRoOffload initialize [begin] -[2023-12-10 06:05:13,418] [INFO] [utils.py:796:see_memory_usage] MA 4.37 GB Max_MA 4.37 GB CA 11.16 GB Max_CA 11 GB -[2023-12-10 06:05:13,418] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 46.91 GB, percent = 18.6% +[2023-12-11 05:39:51,163] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Flops Profiler Enabled: False +[2023-12-11 05:39:51,164] [INFO] [logging.py:96:log_dist] [Rank 0] Using client Optimizer as basic optimizer +[2023-12-11 05:39:51,164] [INFO] [logging.py:96:log_dist] [Rank 0] Removing param_group that has no 'params' in the basic Optimizer +[2023-12-11 05:39:51,206] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Basic Optimizer = FusedAdam +[2023-12-11 05:39:51,206] [INFO] [utils.py:56:is_zero_supported_optimizer] Checking ZeRO support for optimizer=FusedAdam type= +[2023-12-11 05:39:51,206] [INFO] [logging.py:96:log_dist] [Rank 0] Creating fp16 ZeRO stage 3 optimizer, MiCS is enabled False, Hierarchical params gather False +[2023-12-11 05:39:51,206] [INFO] [logging.py:96:log_dist] [Rank 0] Creating torch.float16 ZeRO stage 3 optimizer +[2023-12-11 05:39:51,330] [INFO] [utils.py:795:see_memory_usage] Stage 3 initialize beginning +[2023-12-11 05:39:51,331] [INFO] [utils.py:796:see_memory_usage] MA 4.37 GB Max_MA 4.75 GB CA 8.35 GB Max_CA 8 GB +[2023-12-11 05:39:51,331] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 98.11 GB, percent = 39.0% +[2023-12-11 05:39:51,333] [INFO] [stage3.py:127:__init__] Reduce bucket size 500,000,000 +[2023-12-11 05:39:51,333] [INFO] [stage3.py:128:__init__] Prefetch bucket size 30000000 +[2023-12-11 05:39:51,450] [INFO] [utils.py:795:see_memory_usage] DeepSpeedZeRoOffload initialize [begin] +[2023-12-11 05:39:51,450] [INFO] [utils.py:796:see_memory_usage] MA 4.37 GB Max_MA 4.37 GB CA 8.35 GB Max_CA 8 GB +[2023-12-11 05:39:51,450] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 98.13 GB, percent = 39.0% Parameter Offload: Total persistent parameters: 266240 in 65 params -[2023-12-10 06:05:13,731] [INFO] [utils.py:795:see_memory_usage] DeepSpeedZeRoOffload initialize [end] -[2023-12-10 06:05:13,732] [INFO] [utils.py:796:see_memory_usage] MA 3.54 GB Max_MA 4.43 GB CA 11.16 GB Max_CA 11 GB -[2023-12-10 06:05:13,732] [INFO] 
[utils.py:803:see_memory_usage] CPU Virtual Memory: used = 47.37 GB, percent = 18.8% -[2023-12-10 06:05:13,854] [INFO] [utils.py:795:see_memory_usage] Before creating fp16 partitions -[2023-12-10 06:05:13,855] [INFO] [utils.py:796:see_memory_usage] MA 3.54 GB Max_MA 3.54 GB CA 11.16 GB Max_CA 11 GB -[2023-12-10 06:05:13,855] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 47.42 GB, percent = 18.8% -[2023-12-10 06:05:14,592] [INFO] [utils.py:795:see_memory_usage] After creating fp16 partitions: 3 -[2023-12-10 06:05:14,593] [INFO] [utils.py:796:see_memory_usage] MA 3.54 GB Max_MA 3.54 GB CA 6.31 GB Max_CA 11 GB -[2023-12-10 06:05:14,593] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 47.84 GB, percent = 19.0% -[2023-12-10 06:05:14,725] [INFO] [utils.py:795:see_memory_usage] Before creating fp32 partitions -[2023-12-10 06:05:14,725] [INFO] [utils.py:796:see_memory_usage] MA 3.54 GB Max_MA 3.54 GB CA 6.31 GB Max_CA 6 GB -[2023-12-10 06:05:14,726] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 45.79 GB, percent = 18.2% -[2023-12-10 06:05:14,981] [INFO] [utils.py:795:see_memory_usage] After creating fp32 partitions -[2023-12-10 06:05:14,981] [INFO] [utils.py:796:see_memory_usage] MA 4.08 GB Max_MA 4.23 GB CA 7.01 GB Max_CA 7 GB -[2023-12-10 06:05:14,982] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 45.86 GB, percent = 18.2% -[2023-12-10 06:05:15,101] [INFO] [utils.py:795:see_memory_usage] Before initializing optimizer states -[2023-12-10 06:05:15,102] [INFO] [utils.py:796:see_memory_usage] MA 4.08 GB Max_MA 4.08 GB CA 7.01 GB Max_CA 7 GB -[2023-12-10 06:05:15,102] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 45.92 GB, percent = 18.2% -[2023-12-10 06:05:15,229] [INFO] [utils.py:795:see_memory_usage] After initializing optimizer states -[2023-12-10 06:05:15,229] [INFO] [utils.py:796:see_memory_usage] MA 5.17 GB Max_MA 5.47 GB CA 8.39 GB Max_CA 8 GB -[2023-12-10 06:05:15,229] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 45.97 GB, percent = 18.3% -[2023-12-10 06:05:15,230] [INFO] [stage3.py:479:_setup_for_real_optimizer] optimizer state initialized -[2023-12-10 06:05:15,616] [INFO] [utils.py:795:see_memory_usage] After initializing ZeRO optimizer -[2023-12-10 06:05:15,617] [INFO] [utils.py:796:see_memory_usage] MA 6.37 GB Max_MA 6.86 GB CA 10.08 GB Max_CA 10 GB -[2023-12-10 06:05:15,617] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 46.04 GB, percent = 18.3% -[2023-12-10 06:05:15,617] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam -[2023-12-10 06:05:15,617] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler -[2023-12-10 06:05:15,617] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = -[2023-12-10 06:05:15,617] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[9.65e-06, 0.0005, 9.65e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:05:15,618] [INFO] [config.py:979:print] DeepSpeedEngine configuration: -[2023-12-10 06:05:15,619] [INFO] [config.py:983:print] activation_checkpointing_config { +[2023-12-11 05:39:51,757] [INFO] [utils.py:795:see_memory_usage] DeepSpeedZeRoOffload initialize [end] +[2023-12-11 05:39:51,758] [INFO] [utils.py:796:see_memory_usage] MA 3.54 GB Max_MA 4.43 GB CA 8.35 GB Max_CA 8 GB +[2023-12-11 05:39:51,758] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 98.13 GB, percent = 39.0% +[2023-12-11 05:39:51,866] [INFO] 
[utils.py:795:see_memory_usage] Before creating fp16 partitions +[2023-12-11 05:39:51,866] [INFO] [utils.py:796:see_memory_usage] MA 3.54 GB Max_MA 3.54 GB CA 8.35 GB Max_CA 8 GB +[2023-12-11 05:39:51,866] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 98.13 GB, percent = 39.0% +[2023-12-11 05:39:52,568] [INFO] [utils.py:795:see_memory_usage] After creating fp16 partitions: 3 +[2023-12-11 05:39:52,569] [INFO] [utils.py:796:see_memory_usage] MA 3.54 GB Max_MA 3.54 GB CA 5.29 GB Max_CA 8 GB +[2023-12-11 05:39:52,569] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 98.22 GB, percent = 39.0% +[2023-12-11 05:39:52,708] [INFO] [utils.py:795:see_memory_usage] Before creating fp32 partitions +[2023-12-11 05:39:52,709] [INFO] [utils.py:796:see_memory_usage] MA 3.54 GB Max_MA 3.54 GB CA 5.29 GB Max_CA 5 GB +[2023-12-11 05:39:52,709] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 96.08 GB, percent = 38.2% +[2023-12-11 05:39:52,831] [INFO] [utils.py:795:see_memory_usage] After creating fp32 partitions +[2023-12-11 05:39:52,832] [INFO] [utils.py:796:see_memory_usage] MA 4.08 GB Max_MA 4.23 GB CA 5.99 GB Max_CA 6 GB +[2023-12-11 05:39:52,832] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 96.11 GB, percent = 38.2% +[2023-12-11 05:39:52,942] [INFO] [utils.py:795:see_memory_usage] Before initializing optimizer states +[2023-12-11 05:39:52,942] [INFO] [utils.py:796:see_memory_usage] MA 4.08 GB Max_MA 4.08 GB CA 5.99 GB Max_CA 6 GB +[2023-12-11 05:39:52,943] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 96.11 GB, percent = 38.2% +[2023-12-11 05:39:53,083] [INFO] [utils.py:795:see_memory_usage] After initializing optimizer states +[2023-12-11 05:39:53,084] [INFO] [utils.py:796:see_memory_usage] MA 5.17 GB Max_MA 5.47 GB CA 7.38 GB Max_CA 7 GB +[2023-12-11 05:39:53,084] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 96.07 GB, percent = 38.2% +[2023-12-11 05:39:53,084] [INFO] [stage3.py:479:_setup_for_real_optimizer] optimizer state initialized +[2023-12-11 05:39:53,479] [INFO] [utils.py:795:see_memory_usage] After initializing ZeRO optimizer +[2023-12-11 05:39:53,480] [INFO] [utils.py:796:see_memory_usage] MA 6.37 GB Max_MA 6.86 GB CA 9.05 GB Max_CA 9 GB +[2023-12-11 05:39:53,480] [INFO] [utils.py:803:see_memory_usage] CPU Virtual Memory: used = 96.07 GB, percent = 38.2% +[2023-12-11 05:39:53,480] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed Final Optimizer = FusedAdam +[2023-12-11 05:39:53,480] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed using client LR scheduler +[2023-12-11 05:39:53,480] [INFO] [logging.py:96:log_dist] [Rank 0] DeepSpeed LR Scheduler = +[2023-12-11 05:39:53,480] [INFO] [logging.py:96:log_dist] [Rank 0] step=0, skipped=0, lr=[9.65e-06, 0.0005, 9.65e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] +[2023-12-11 05:39:53,482] [INFO] [config.py:979:print] DeepSpeedEngine configuration: +[2023-12-11 05:39:53,482] [INFO] [config.py:983:print] activation_checkpointing_config { "partition_activations": false, "contiguous_memory_optimization": false, "cpu_checkpointing": false, @@ -126,10 +126,10 @@ Parameter Offload: Total persistent parameters: 266240 in 65 params "synchronize_checkpoint_boundary": false, "profile": false } -[2023-12-10 06:05:15,619] [INFO] [config.py:983:print] aio_config ................... 
{'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} -[2023-12-10 06:05:15,619] [INFO] [config.py:983:print] amp_enabled .................. False -[2023-12-10 06:05:15,619] [INFO] [config.py:983:print] amp_params ................... False -[2023-12-10 06:05:15,619] [INFO] [config.py:983:print] autotuning_config ............ { +[2023-12-11 05:39:53,482] [INFO] [config.py:983:print] aio_config ................... {'block_size': 1048576, 'queue_depth': 8, 'thread_count': 1, 'single_submit': False, 'overlap_events': True} +[2023-12-11 05:39:53,482] [INFO] [config.py:983:print] amp_enabled .................. False +[2023-12-11 05:39:53,482] [INFO] [config.py:983:print] amp_params ................... False +[2023-12-11 05:39:53,482] [INFO] [config.py:983:print] autotuning_config ............ { "enabled": false, "start_step": null, "end_step": null, @@ -154,31 +154,31 @@ Parameter Offload: Total persistent parameters: 266240 in 65 params "min_train_micro_batch_size_per_gpu": 1, "num_tuning_micro_batch_sizes": 3 } -[2023-12-10 06:05:15,619] [INFO] [config.py:983:print] bfloat16_enabled ............. False -[2023-12-10 06:05:15,619] [INFO] [config.py:983:print] checkpoint_parallel_write_pipeline False -[2023-12-10 06:05:15,619] [INFO] [config.py:983:print] checkpoint_tag_validation_enabled True -[2023-12-10 06:05:15,619] [INFO] [config.py:983:print] checkpoint_tag_validation_fail False -[2023-12-10 06:05:15,619] [INFO] [config.py:983:print] comms_config ................. -[2023-12-10 06:05:15,619] [INFO] [config.py:983:print] communication_data_type ...... None -[2023-12-10 06:05:15,619] [INFO] [config.py:983:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} -[2023-12-10 06:05:15,619] [INFO] [config.py:983:print] curriculum_enabled_legacy .... False -[2023-12-10 06:05:15,619] [INFO] [config.py:983:print] curriculum_params_legacy ..... False -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] data_efficiency_config ....... {'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] data_efficiency_enabled ...... False -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] dataloader_drop_last ......... 
False -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] disable_allgather ............ False -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] dump_state ................... False -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 100, 'delayed_shift': 2, 'consecutive_hysteresis': False, 'min_scale': 1} -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] eigenvalue_enabled ........... False -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] eigenvalue_gas_boundary_resolution 1 -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] eigenvalue_layer_name ........ bert.encoder.layer -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] eigenvalue_layer_num ......... 0 -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] eigenvalue_max_iter .......... 100 -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] eigenvalue_stability ......... 1e-06 -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] eigenvalue_tol ............... 0.01 -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] eigenvalue_verbose ........... False -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] elasticity_enabled ........... False -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] flops_profiler_config ........ { +[2023-12-11 05:39:53,482] [INFO] [config.py:983:print] bfloat16_enabled ............. False +[2023-12-11 05:39:53,482] [INFO] [config.py:983:print] checkpoint_parallel_write_pipeline False +[2023-12-11 05:39:53,482] [INFO] [config.py:983:print] checkpoint_tag_validation_enabled True +[2023-12-11 05:39:53,482] [INFO] [config.py:983:print] checkpoint_tag_validation_fail False +[2023-12-11 05:39:53,482] [INFO] [config.py:983:print] comms_config ................. +[2023-12-11 05:39:53,482] [INFO] [config.py:983:print] communication_data_type ...... None +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] compression_config ........... {'weight_quantization': {'shared_parameters': {'enabled': False, 'quantizer_kernel': False, 'schedule_offset': 0, 'quantize_groups': 1, 'quantize_verbose': False, 'quantization_type': 'symmetric', 'quantize_weight_in_forward': False, 'rounding': 'nearest', 'fp16_mixed_quantize': False, 'quantize_change_ratio': 0.001}, 'different_groups': {}}, 'activation_quantization': {'shared_parameters': {'enabled': False, 'quantization_type': 'symmetric', 'range_calibration': 'dynamic', 'schedule_offset': 1000}, 'different_groups': {}}, 'sparse_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'row_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'head_pruning': {'shared_parameters': {'enabled': False, 'method': 'topk', 'schedule_offset': 1000}, 'different_groups': {}}, 'channel_pruning': {'shared_parameters': {'enabled': False, 'method': 'l1', 'schedule_offset': 1000}, 'different_groups': {}}, 'layer_reduction': {'enabled': False}} +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] curriculum_enabled_legacy .... False +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] curriculum_params_legacy ..... False +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] data_efficiency_config ....... 
{'enabled': False, 'seed': 1234, 'data_sampling': {'enabled': False, 'num_epochs': 1000, 'num_workers': 0, 'curriculum_learning': {'enabled': False}}, 'data_routing': {'enabled': False, 'random_ltd': {'enabled': False, 'layer_token_lr_schedule': {'enabled': False}}}} +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] data_efficiency_enabled ...... False +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] dataloader_drop_last ......... False +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] disable_allgather ............ False +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] dump_state ................... False +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] dynamic_loss_scale_args ...... {'init_scale': 65536, 'scale_window': 100, 'delayed_shift': 2, 'consecutive_hysteresis': False, 'min_scale': 1} +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] eigenvalue_enabled ........... False +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] eigenvalue_gas_boundary_resolution 1 +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] eigenvalue_layer_name ........ bert.encoder.layer +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] eigenvalue_layer_num ......... 0 +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] eigenvalue_max_iter .......... 100 +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] eigenvalue_stability ......... 1e-06 +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] eigenvalue_tol ............... 0.01 +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] eigenvalue_verbose ........... False +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] elasticity_enabled ........... False +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] flops_profiler_config ........ { "enabled": false, "recompute_fwd_factor": 0.0, "profile_step": 1, @@ -187,23 +187,23 @@ Parameter Offload: Total persistent parameters: 266240 in 65 params "detailed": true, "output_file": null } -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] fp16_auto_cast ............... False -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] fp16_enabled ................. True -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] fp16_master_weights_and_gradients False -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] global_rank .................. 0 -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] grad_accum_dtype ............. None -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] gradient_accumulation_steps .. 1 -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] gradient_clipping ............ 1.0 -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] gradient_predivide_factor .... 1.0 -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] initial_dynamic_scale ........ 65536 -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] load_universal_checkpoint .... False -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] loss_scale ................... 0 -[2023-12-10 06:05:15,620] [INFO] [config.py:983:print] memory_breakdown ............. False -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] mics_hierarchial_params_gather False -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] mics_shard_size .............. -1 -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] monitor_config ............... 
tensorboard=TensorBoardConfig(enabled=False, output_path='step1_tensorboard/ds_tensorboard_logs/', job_name='step1_model_tensorboard') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] nebula_config ................ { +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] fp16_auto_cast ............... False +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] fp16_enabled ................. True +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] fp16_master_weights_and_gradients False +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] global_rank .................. 0 +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] grad_accum_dtype ............. None +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] gradient_accumulation_steps .. 1 +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] gradient_clipping ............ 1.0 +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] gradient_predivide_factor .... 1.0 +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] hybrid_engine ................ enabled=False max_out_tokens=512 inference_tp_size=1 release_inference_cache=False pin_parameters=True tp_gather_partition_size=8 +[2023-12-11 05:39:53,483] [INFO] [config.py:983:print] initial_dynamic_scale ........ 65536 +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] load_universal_checkpoint .... False +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] loss_scale ................... 0 +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] memory_breakdown ............. False +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] mics_hierarchial_params_gather False +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] mics_shard_size .............. -1 +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] monitor_config ............... tensorboard=TensorBoardConfig(enabled=False, output_path='step1_tensorboard/ds_tensorboard_logs/', job_name='step1_model_tensorboard') wandb=WandbConfig(enabled=False, group=None, team=None, project='deepspeed') csv_monitor=CSVConfig(enabled=False, output_path='', job_name='DeepSpeedJobName') enabled=False +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] nebula_config ................ { "enabled": false, "persistent_storage_path": null, "persistent_time_interval": 100, @@ -211,32 +211,32 @@ Parameter Offload: Total persistent parameters: 266240 in 65 params "enable_nebula_load": true, "load_path": null } -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] optimizer_legacy_fusion ...... False -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] optimizer_name ............... None -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] optimizer_params ............. None -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] pipeline ..................... {'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] pld_enabled .................. False -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] pld_params ................... False -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] prescale_gradients ........... False -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] scheduler_name ............... 
None -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] scheduler_params ............. None -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] seq_parallel_communication_data_type torch.float32 -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] sparse_attention ............. None -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] sparse_gradients_enabled ..... False -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] steps_per_print .............. 10 -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] train_batch_size ............. 32 -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] train_micro_batch_size_per_gpu 8 -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] use_data_before_expert_parallel_ False -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] use_node_local_storage ....... False -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] wall_clock_breakdown ......... False -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] weight_quantization_config ... None -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] world_size ................... 4 -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] zero_allow_untested_optimizer False -[2023-12-10 06:05:15,621] [INFO] [config.py:983:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=30000000 param_persistence_threshold=10000 model_persistence_threshold=sys.maxsize max_live_parameters=30000000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=False pipeline_loading_checkpoint=False override_module_apply=True -[2023-12-10 06:05:15,622] [INFO] [config.py:983:print] zero_enabled ................. True -[2023-12-10 06:05:15,622] [INFO] [config.py:983:print] zero_force_ds_cpu_optimizer .. True -[2023-12-10 06:05:15,622] [INFO] [config.py:983:print] zero_optimization_stage ...... 3 -[2023-12-10 06:05:15,622] [INFO] [config.py:969:print_user_config] json = { +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] optimizer_legacy_fusion ...... False +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] optimizer_name ............... None +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] optimizer_params ............. None +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] pipeline ..................... 
{'stages': 'auto', 'partition': 'best', 'seed_layers': False, 'activation_checkpoint_interval': 0, 'pipe_partitioned': True, 'grad_partitioned': True} +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] pld_enabled .................. False +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] pld_params ................... False +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] prescale_gradients ........... False +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] scheduler_name ............... None +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] scheduler_params ............. None +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] seq_parallel_communication_data_type torch.float32 +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] sparse_attention ............. None +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] sparse_gradients_enabled ..... False +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] steps_per_print .............. 10 +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] train_batch_size ............. 32 +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] train_micro_batch_size_per_gpu 8 +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] use_data_before_expert_parallel_ False +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] use_node_local_storage ....... False +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] wall_clock_breakdown ......... False +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] weight_quantization_config ... None +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] world_size ................... 4 +[2023-12-11 05:39:53,484] [INFO] [config.py:983:print] zero_allow_untested_optimizer False +[2023-12-11 05:39:53,485] [INFO] [config.py:983:print] zero_config .................. stage=3 contiguous_gradients=True reduce_scatter=True reduce_bucket_size=500,000,000 use_multi_rank_bucket_allreduce=True allgather_partitions=True allgather_bucket_size=500,000,000 overlap_comm=True load_from_fp32_weights=True elastic_checkpoint=False offload_param=DeepSpeedZeroOffloadParamConfig(device='none', nvme_path=None, buffer_count=5, buffer_size=100,000,000, max_in_cpu=1,000,000,000, pin_memory=False) offload_optimizer=DeepSpeedZeroOffloadOptimizerConfig(device='none', nvme_path=None, buffer_count=4, pin_memory=False, pipeline=False, pipeline_read=False, pipeline_write=False, fast_init=False, ratio=1.0) sub_group_size=1,000,000,000 cpu_offload_param=None cpu_offload_use_pin_memory=None cpu_offload=None prefetch_bucket_size=30000000 param_persistence_threshold=10000 model_persistence_threshold=sys.maxsize max_live_parameters=30000000 max_reuse_distance=1,000,000,000 gather_16bit_weights_on_model_save=False stage3_gather_fp16_weights_on_model_save=False ignore_unused_parameters=True legacy_stage1=False round_robin_gradients=False zero_hpz_partition_size=1 zero_quantized_weights=False zero_quantized_nontrainable_weights=False zero_quantized_gradients=False mics_shard_size=-1 mics_hierarchical_params_gather=False memory_efficient_linear=False pipeline_loading_checkpoint=False override_module_apply=True +[2023-12-11 05:39:53,485] [INFO] [config.py:983:print] zero_enabled ................. True +[2023-12-11 05:39:53,485] [INFO] [config.py:983:print] zero_force_ds_cpu_optimizer .. True +[2023-12-11 05:39:53,485] [INFO] [config.py:983:print] zero_optimization_stage ...... 
3 +[2023-12-11 05:39:53,485] [INFO] [config.py:969:print_user_config] json = { "train_batch_size": 32, "train_micro_batch_size_per_gpu": 8, "steps_per_print": 10, @@ -275,9 +275,9 @@ Parameter Offload: Total persistent parameters: 266240 in 65 params } } ***** Running training ***** -***** Evaluating perplexity, Epoch 0/100 ***** +***** Evaluating perplexity, Epoch 0/3 ***** ppl: 4.460639476776123, loss: 1.4952921867370605 -Beginning of Epoch 1/100, Total Micro Batches 13 +Beginning of Epoch 1/3, Total Micro Batches 13 /home/t-sokumar/miniconda3/envs/ft/lib/python3.11/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. warnings.warn( /home/t-sokumar/miniconda3/envs/ft/lib/python3.11/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. @@ -286,1984 +286,65 @@ Beginning of Epoch 1/100, Total Micro Batches 13 warnings.warn( /home/t-sokumar/miniconda3/envs/ft/lib/python3.11/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants. 
 warnings.warn(
-Model Parameters: 6.927 B, Latency: 4.16s, TFLOPs: 10.07, Samples/sec: 1.92, Time/seq 0.52s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 4.07s, TFLOPs: 10.30, Samples/sec: 1.97, Time/seq 0.51s, Batch Size: 8, Sequence Length: 512
 Invalidate trace cache @ step 0: expected module 6, but got module 0
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.01, Samples/sec: 2.10, Time/seq 0.48s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.76s, TFLOPs: 11.15, Samples/sec: 2.13, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.64s, TFLOPs: 11.49, Samples/sec: 2.20, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.74s, TFLOPs: 11.21, Samples/sec: 2.14, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.75s, TFLOPs: 11.17, Samples/sec: 2.13, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.73s, TFLOPs: 11.24, Samples/sec: 2.15, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.62s, TFLOPs: 11.55, Samples/sec: 2.21, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.53, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.62s, TFLOPs: 11.57, Samples/sec: 2.21, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.55, Samples/sec: 2.21, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.62s, TFLOPs: 11.56, Samples/sec: 2.21, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
+[2023-12-11 05:40:33,349] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[8.167395005683819e-06, 0.00042318108837739987, 8.167395005683819e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-11 05:40:33,349] [INFO] [timer.py:260:stop] epoch=0/micro_step=10/global_step=10, RunningAvgSamplesPerSec=8.762510054419295, CurrSamplesPerSec=8.82404309088149, MemAllocated=6.88GB, MaxMemAllocated=10.68GB
+Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.54, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.62s, TFLOPs: 11.55, Samples/sec: 2.21, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.62s, TFLOPs: 11.56, Samples/sec: 2.21, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.24s, TFLOPs: 12.92, Samples/sec: 2.47, Time/seq 0.40s, Batch Size: 8, Sequence Length: 512
+***** Evaluating perplexity, Epoch 1/3 *****
+Invalidate trace cache @ step 0: expected module 0, but got module 6
+ppl: 1.7355413436889648, loss: 0.551319420337677
+Beginning of Epoch 2/3, Total Micro Batches 13
+Model Parameters: 6.927 B, Latency: 3.75s, TFLOPs: 11.17, Samples/sec: 2.13, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.76s, TFLOPs: 11.14, Samples/sec: 2.13, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.54, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
 Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.52, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.64s, TFLOPs: 11.52, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.52, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.53, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
+[2023-12-11 05:41:11,363] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[4.6307168389720735e-06, 0.0002399335149726463, 4.6307168389720735e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-11 05:41:11,363] [INFO] [timer.py:260:stop] epoch=1/micro_step=7/global_step=20, RunningAvgSamplesPerSec=8.813860487355969, CurrSamplesPerSec=8.815178915300866, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
 Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.53, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
 Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.52, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
 Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.52, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:05:55,635] [INFO] [logging.py:96:log_dist] [Rank 0] step=10, skipped=0, lr=[9.648591167936933e-06, 0.0004999270035200482, 9.648591167936933e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:05:55,635] [INFO] [timer.py:260:stop] epoch=0/micro_step=10/global_step=10, RunningAvgSamplesPerSec=8.770094217380155, CurrSamplesPerSec=8.812768185145067, MemAllocated=6.88GB, MaxMemAllocated=10.68GB
 Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.53, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
 Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.53, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.49, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.24s, TFLOPs: 12.91, Samples/sec: 2.47, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 1/100 *****
+Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.52, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.24s, TFLOPs: 12.90, Samples/sec: 2.47, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
+***** Evaluating perplexity, Epoch 2/3 *****
 Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.613058090209961, loss: 0.47813183069229126
-Beginning of Epoch 2/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.76s, TFLOPs: 11.13, Samples/sec: 2.13, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
+ppl: 1.0645378828048706, loss: 0.0625406950712204
+Beginning of Epoch 3/3, Total Micro Batches 13
+Model Parameters: 6.927 B, Latency: 3.75s, TFLOPs: 11.16, Samples/sec: 2.13, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
 Model Parameters: 6.927 B, Latency: 3.77s, TFLOPs: 11.11, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.64s, TFLOPs: 11.51, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.64s, TFLOPs: 11.49, Samples/sec: 2.20, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.53, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
+[2023-12-11 05:41:49,440] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[1.2134356400744368e-06, 6.28723129572247e-05, 1.2134356400744368e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
+[2023-12-11 05:41:49,441] [INFO] [timer.py:260:stop] epoch=2/micro_step=4/global_step=30, RunningAvgSamplesPerSec=8.8235289391519, CurrSamplesPerSec=8.771353693343526, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
 Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.47, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.64s, TFLOPs: 11.50, Samples/sec: 2.20, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:06:33,776] [INFO] [logging.py:96:log_dist] [Rank 0] step=20, skipped=0, lr=[9.644365494465977e-06, 0.0004997080567080817, 9.644365494465977e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:06:33,776] [INFO] [timer.py:260:stop] epoch=1/micro_step=7/global_step=20, RunningAvgSamplesPerSec=8.800732650555023, CurrSamplesPerSec=8.785517932788014, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.64s, TFLOPs: 11.49, Samples/sec: 2.20, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.64s, TFLOPs: 11.49, Samples/sec: 2.20, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.64s, TFLOPs: 11.49, Samples/sec: 2.20, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.64s, TFLOPs: 11.50, Samples/sec: 2.20, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.53, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.64s, TFLOPs: 11.52, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
 Model Parameters: 6.927 B, Latency: 3.64s, TFLOPs: 11.51, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.64s, TFLOPs: 11.49, Samples/sec: 2.20, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.25s, TFLOPs: 12.88, Samples/sec: 2.46, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 2/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.011894941329956, loss: 0.01182475220412016
-Beginning of Epoch 3/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.77s, TFLOPs: 11.10, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
 Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.48, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:07:11,959] [INFO] [logging.py:96:log_dist] [Rank 0] step=30, skipped=0, lr=[9.637325447261447e-06, 0.0004993432874228728, 9.637325447261447e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:07:11,960] [INFO] [timer.py:260:stop] epoch=2/micro_step=4/global_step=30, RunningAvgSamplesPerSec=8.806181561579107, CurrSamplesPerSec=8.772985388568388, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.47, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.49, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.52, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.52, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
 Model Parameters: 6.927 B, Latency: 3.64s, TFLOPs: 11.50, Samples/sec: 2.20, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.64s, TFLOPs: 11.49, Samples/sec: 2.20, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.48, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.64s, TFLOPs: 11.49, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.48, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.48, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.25s, TFLOPs: 12.87, Samples/sec: 2.46, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 3/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0034215450286865, loss: 0.0034157028421759605
-Beginning of Epoch 4/100, Total Micro Batches 13
-[2023-12-10 06:07:50,043] [INFO] [logging.py:96:log_dist] [Rank 0] step=40, skipped=0, lr=[9.627475137512655e-06, 0.0004988329086794121, 9.627475137512655e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:07:50,043] [INFO] [timer.py:260:stop] epoch=3/micro_step=1/global_step=40, RunningAvgSamplesPerSec=8.81523384114727, CurrSamplesPerSec=8.490803308622581, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.77s, TFLOPs: 11.10, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.08, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.48, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.46, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.46, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.46, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.47, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.46, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:08:26,727] [INFO] [logging.py:96:log_dist] [Rank 0] step=50, skipped=0, lr=[9.61482031752311e-06, 0.0004981772185245135, 9.61482031752311e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:08:26,728] [INFO] [timer.py:260:stop] epoch=3/micro_step=11/global_step=50, RunningAvgSamplesPerSec=8.79769325607726, CurrSamplesPerSec=8.762633908755163, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.46, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.48, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.26s, TFLOPs: 12.85, Samples/sec: 2.46, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 4/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0029047727584839, loss: 0.002900516614317894
-Beginning of Epoch 5/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.77s, TFLOPs: 11.11, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.08, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.64s, TFLOPs: 11.49, Samples/sec: 2.20, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.48, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.48, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.47, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.46, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:09:04,947] [INFO] [logging.py:96:log_dist] [Rank 0] step=60, skipped=0, lr=[9.599368377351322e-06, 0.0004973765998627628, 9.599368377351322e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:09:04,947] [INFO] [timer.py:260:stop] epoch=4/micro_step=8/global_step=60, RunningAvgSamplesPerSec=8.799524695813936, CurrSamplesPerSec=8.765039035842445, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.46, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.70s, TFLOPs: 11.32, Samples/sec: 2.16, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.47, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.26s, TFLOPs: 12.84, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 5/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.002308964729309, loss: 0.0023063267581164837
-Beginning of Epoch 6/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.77s, TFLOPs: 11.09, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.47, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:09:43,276] [INFO] [logging.py:96:log_dist] [Rank 0] step=70, skipped=0, lr=[9.581128340495216e-06, 0.0004964315202329127, 9.581128340495216e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:09:43,277] [INFO] [timer.py:260:stop] epoch=5/micro_step=5/global_step=70, RunningAvgSamplesPerSec=8.79692553787264, CurrSamplesPerSec=8.743091166931249, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.70s, TFLOPs: 11.32, Samples/sec: 2.16, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.69s, TFLOPs: 11.34, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.69s, TFLOPs: 11.36, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.70s, TFLOPs: 11.30, Samples/sec: 2.16, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.31s, TFLOPs: 12.65, Samples/sec: 2.42, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 6/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0021849870681763, loss: 0.0021825172007083893
-Beginning of Epoch 7/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.82s, TFLOPs: 10.97, Samples/sec: 2.10, Time/seq 0.48s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:10:22,736] [INFO] [logging.py:96:log_dist] [Rank 0] step=80, skipped=0, lr=[9.560110858622671e-06, 0.0004953425315348534, 9.560110858622671e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:10:22,820] [INFO] [timer.py:260:stop] epoch=6/micro_step=2/global_step=80, RunningAvgSamplesPerSec=8.781695830911861, CurrSamplesPerSec=8.046538098532851, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.98s, TFLOPs: 10.52, Samples/sec: 2.01, Time/seq 0.50s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.70s, TFLOPs: 11.30, Samples/sec: 2.16, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.36, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:11:00,855] [INFO] [logging.py:96:log_dist] [Rank 0] step=90, skipped=0, lr=[9.53632820535124e-06, 0.0004941102697073181, 9.53632820535124e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:11:01,019] [INFO] [timer.py:260:stop] epoch=6/micro_step=12/global_step=90, RunningAvgSamplesPerSec=8.770842773254097, CurrSamplesPerSec=8.35984490861131, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.83s, TFLOPs: 10.93, Samples/sec: 2.09, Time/seq 0.48s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.30s, TFLOPs: 12.70, Samples/sec: 2.43, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 7/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0021474361419678, loss: 0.00214507058262825
-Beginning of Epoch 8/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.77s, TFLOPs: 11.09, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.70s, TFLOPs: 11.32, Samples/sec: 2.16, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.70s, TFLOPs: 11.32, Samples/sec: 2.16, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:11:43,227] [INFO] [logging.py:96:log_dist] [Rank 0] step=100, skipped=0, lr=[9.509794269080702e-06, 0.000492735454356513, 9.509794269080702e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:11:43,228] [INFO] [timer.py:260:stop] epoch=7/micro_step=9/global_step=100, RunningAvgSamplesPerSec=8.768811649563167, CurrSamplesPerSec=8.694228610909258, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.37, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.76, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 8/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0022187232971191, loss: 0.0022162303794175386
-Beginning of Epoch 9/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.09, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.02, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8,
Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:12:23,815] [INFO] [logging.py:96:log_dist] [Rank 0] step=110, skipped=0, lr=[9.480524544882636e-06, 0.0004912188883358879, 9.480524544882636e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:12:23,815] [INFO] [timer.py:260:stop] epoch=8/micro_step=6/global_step=110, RunningAvgSamplesPerSec=8.768396866710981, CurrSamplesPerSec=8.724755375091526, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.80, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 9/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0022855997085571, loss: 0.00228301715105772 -Beginning of Epoch 10/100, Total Micro Batches 13 -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:13:02,236] [INFO] [logging.py:96:log_dist] [Rank 0] step=120, skipped=0, lr=[9.448536125451729e-06, 0.0004895614572772916, 9.448536125451729e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:13:02,236] [INFO] [timer.py:260:stop] epoch=9/micro_step=3/global_step=120, RunningAvgSamplesPerSec=8.767926816186657, CurrSamplesPerSec=8.732357242865396, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence 
Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:13:38,538] [INFO] [logging.py:96:log_dist] [Rank 0] step=130, skipped=0, lr=[9.413847691124117e-06, 0.0004877641290737884, 9.413847691124117e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:13:38,539] [INFO] [timer.py:260:stop] epoch=9/micro_step=13/global_step=130, RunningAvgSamplesPerSec=8.77231690277168, CurrSamplesPerSec=9.77733836559356, MemAllocated=6.82GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.79, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 10/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0020976066589355, loss: 0.00209536449983716 -Beginning of Epoch 11/100, Total Micro Batches 13 -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.82s, TFLOPs: 10.97, Samples/sec: 2.10, Time/seq 0.48s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.69s, TFLOPs: 11.33, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.37, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:14:17,423] [INFO] [logging.py:96:log_dist] [Rank 0] step=140, skipped=0, lr=[9.37647949896861e-06, 0.00048582795331443573, 9.37647949896861e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:14:17,423] [INFO] [timer.py:260:stop] epoch=10/micro_step=10/global_step=140, RunningAvgSamplesPerSec=8.763724937418859, CurrSamplesPerSec=8.740363947048987, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.79, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 11/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 
1.0021694898605347, loss: 0.0021671149879693985 -Beginning of Epoch 12/100, Total Micro Batches 13 -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:14:55,832] [INFO] [logging.py:96:log_dist] [Rank 0] step=150, skipped=0, lr=[9.336453370957128e-06, 0.00048375406067135377, 9.336453370957128e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:14:55,833] [INFO] [timer.py:260:stop] epoch=11/micro_step=7/global_step=150, RunningAvgSamplesPerSec=8.763776234037667, CurrSamplesPerSec=8.74112443685157, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.80, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 12/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0020347833633423, loss: 0.0020326757803559303 -Beginning of Epoch 13/100, Total Micro Batches 13 -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:15:34,214] [INFO] [logging.py:96:log_dist] [Rank 0] step=160, skipped=0, lr=[9.293792681221273e-06, 0.00048154366223944413, 9.293792681221273e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:15:34,214] [INFO] [timer.py:260:stop] epoch=12/micro_step=4/global_step=160, RunningAvgSamplesPerSec=8.764217993868474, CurrSamplesPerSec=8.729752543554216, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 
3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 13/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0021640062332153, loss: 0.0021617079619318247 -Beginning of Epoch 14/100, Total Micro Batches 13 -[2023-12-10 06:16:12,462] [INFO] [logging.py:96:log_dist] [Rank 0] step=170, skipped=0, lr=[9.248522342402511e-06, 0.00047919804882914566, 9.248522342402511e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:16:12,462] [INFO] [timer.py:260:stop] epoch=13/micro_step=1/global_step=170, RunningAvgSamplesPerSec=8.766551039648265, CurrSamplesPerSec=8.457650443586763, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.94s, TFLOPs: 10.61, Samples/sec: 2.03, Time/seq 0.49s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:16:49,394] [INFO] [logging.py:96:log_dist] [Rank 0] step=180, skipped=0, lr=[9.200668791103882e-06, 0.00047671859021263636, 9.200668791103882e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:16:49,395] [INFO] [timer.py:260:stop] epoch=13/micro_step=11/global_step=180, RunningAvgSamplesPerSec=8.7612386842032, CurrSamplesPerSec=8.731694277273524, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, 
TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 14/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0020993947982788, loss: 0.002097158692777157 -Beginning of Epoch 15/100, Total Micro Batches 13 -Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:17:27,739] [INFO] [logging.py:96:log_dist] [Rank 0] step=190, skipped=0, lr=[9.150259972451771e-06, 0.0004741067343239259, 9.150259972451771e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:17:27,739] [INFO] [timer.py:260:stop] epoch=14/micro_step=8/global_step=190, RunningAvgSamplesPerSec=8.762218133298138, CurrSamplesPerSec=8.750748425937209, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 15/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0020793676376343, loss: 0.0020771552808582783 -Beginning of Epoch 16/100, Total Micro Batches 13 -Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:18:06,128] [INFO] [logging.py:96:log_dist] [Rank 0] step=200, skipped=0, lr=[9.097325323776738e-06, 0.00047136400641330245, 9.097325323776738e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:18:06,129] [INFO] 
[timer.py:260:stop] epoch=15/micro_step=5/global_step=200, RunningAvgSamplesPerSec=8.762503247400744, CurrSamplesPerSec=8.739762934963915, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.80, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 16/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0021138191223145, loss: 0.00211157719604671 -Beginning of Epoch 17/100, Total Micro Batches 13 -Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:18:44,522] [INFO] [logging.py:96:log_dist] [Rank 0] step=210, skipped=0, lr=[9.04189575742295e-06, 0.0004684920081566295, 9.04189575742295e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:18:44,523] [INFO] [timer.py:260:stop] epoch=16/micro_step=2/global_step=210, RunningAvgSamplesPerSec=8.762756655398805, CurrSamplesPerSec=8.436487704123307, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.38, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:19:21,218] [INFO] [logging.py:96:log_dist] [Rank 0] step=220, skipped=0, lr=[8.984003642696238e-06, 
0.0004654924167200123, 8.984003642696238e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:19:21,218] [INFO] [timer.py:260:stop] epoch=16/micro_step=12/global_step=220, RunningAvgSamplesPerSec=8.761208547827984, CurrSamplesPerSec=8.744008783506157, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 17/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0021439790725708, loss: 0.002141612581908703 -Beginning of Epoch 18/100, Total Micro Batches 13 -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:19:59,608] [INFO] [logging.py:96:log_dist] [Rank 0] step=230, skipped=0, lr=[8.92368278696134e-06, 0.00046236698378038026, 8.92368278696134e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:19:59,608] [INFO] [timer.py:260:stop] epoch=17/micro_step=9/global_step=230, RunningAvgSamplesPerSec=8.761535357546784, CurrSamplesPerSec=8.736660159517372, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 18/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.002083420753479, loss: 0.002081236569210887 -Beginning of Epoch 19/100, Total Micro Batches 13 -Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, 
Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:20:38,000] [INFO] [logging.py:96:log_dist] [Rank 0] step=240, skipped=0, lr=[8.860968415899345e-06, 0.0004591175345025567, 8.860968415899345e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:20:38,000] [INFO] [timer.py:260:stop] epoch=18/micro_step=6/global_step=240, RunningAvgSamplesPerSec=8.761861385257518, CurrSamplesPerSec=8.740853468722689, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.79, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 19/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0020439624786377, loss: 0.0020417894702404737 -Beginning of Epoch 20/100, Total Micro Batches 13 -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:21:16,395] [INFO] [logging.py:96:log_dist] [Rank 0] step=250, skipped=0, lr=[8.795897152936893e-06, 0.0004557459664734141, 8.795897152936893e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:21:16,395] [INFO] [timer.py:260:stop] epoch=19/micro_step=3/global_step=250, RunningAvgSamplesPerSec=8.762087699908628, CurrSamplesPerSec=8.736853520805871, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 
3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:21:52,667] [INFO] [logging.py:96:log_dist] [Rank 0] step=260, skipped=0, lr=[8.728506997859123e-06, 0.0004522542485937369, 8.728506997859123e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:21:52,668] [INFO] [timer.py:260:stop] epoch=19/micro_step=13/global_step=260, RunningAvgSamplesPerSec=8.76473200629737, CurrSamplesPerSec=9.788978234944851, MemAllocated=6.82GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.80, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 20/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.001988172531128, loss: 0.001986186020076275 -Beginning of Epoch 21/100, Total Micro Batches 13 -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.93s, TFLOPs: 10.65, Samples/sec: 2.04, Time/seq 0.49s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:22:31,606] [INFO] [logging.py:96:log_dist] [Rank 0] step=270, skipped=0, lr=[8.658837304618866e-06, 0.0004486444199284386, 8.658837304618866e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:22:31,607] [INFO] [timer.py:260:stop] epoch=20/micro_step=10/global_step=270, RunningAvgSamplesPerSec=8.759990713351002, CurrSamplesPerSec=8.728342928365052, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 21/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0019750595092773, loss: 0.0019731405191123486 -Beginning of Epoch 22/100, Total Micro Batches 13 -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, 
Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:23:09,985] [INFO] [logging.py:96:log_dist] [Rank 0] step=280, skipped=0, lr=[8.586928758355048e-06, 0.00044491858851580557, 8.586928758355048e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:23:09,985] [INFO] [timer.py:260:stop] epoch=21/micro_step=7/global_step=280, RunningAvgSamplesPerSec=8.760459465054195, CurrSamplesPerSec=8.728172079651284, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 22/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0019866228103638, loss: 0.0019846009090542793 -Beginning of Epoch 23/100, Total Micro Batches 13 -Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.09, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:23:48,345] [INFO] [logging.py:96:log_dist] [Rank 0] step=290, skipped=0, lr=[8.512823351633718e-06, 0.00044107893013646207, 8.512823351633718e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:23:48,345] [INFO] [timer.py:260:stop] epoch=22/micro_step=4/global_step=290, RunningAvgSamplesPerSec=8.761002932058826, CurrSamplesPerSec=8.734654825288652, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 
0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.80, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 23/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0019848346710205, loss: 0.001982894027605653 -Beginning of Epoch 24/100, Total Micro Batches 13 -[2023-12-10 06:24:26,613] [INFO] [logging.py:96:log_dist] [Rank 0] step=300, skipped=0, lr=[8.436564359925563e-06, 0.00043712768704277526, 8.436564359925563e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:24:26,614] [INFO] [timer.py:260:stop] epoch=23/micro_step=1/global_step=300, RunningAvgSamplesPerSec=8.76221319606686, CurrSamplesPerSec=8.457991014792093, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:25:03,429] [INFO] [logging.py:96:log_dist] [Rank 0] step=310, skipped=0, lr=[8.358196316334257e-06, 0.0004330671666494434, 8.358196316334257e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:25:03,429] [INFO] [timer.py:260:stop] epoch=23/micro_step=11/global_step=310, RunningAvgSamplesPerSec=8.76019414115666, CurrSamplesPerSec=8.721755625162983, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, 
Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 24/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0019934177398682, loss: 0.001991403056308627 -Beginning of Epoch 25/100, Total Micro Batches 13 -Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:25:41,810] [INFO] [logging.py:96:log_dist] [Rank 0] step=320, skipped=0, lr=[8.277764985590385e-06, 0.0004288997401860303, 8.277764985590385e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:25:41,810] [INFO] [timer.py:260:stop] epoch=24/micro_step=8/global_step=320, RunningAvgSamplesPerSec=8.760533422986926, CurrSamplesPerSec=8.721463187648036, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.80, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 25/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0019837617874146, loss: 0.00198176596313715 -Beginning of Epoch 26/100, Total Micro Batches 13 -Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.01, Samples/sec: 2.10, Time/seq 0.48s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:26:20,201] [INFO] [logging.py:96:log_dist] [Rank 0] step=330, skipped=0, lr=[8.195317337326122e-06, 0.0004246278413122343, 8.195317337326122e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:26:20,202] [INFO] [timer.py:260:stop] epoch=25/micro_step=5/global_step=330, RunningAvgSamplesPerSec=8.760781750803856, CurrSamplesPerSec=8.730233495094511, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model 
Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 26/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0019780397415161, loss: 0.0019761149305850267 -Beginning of Epoch 27/100, Total Micro Batches 13 -Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:26:58,599] [INFO] [logging.py:96:log_dist] [Rank 0] step=340, skipped=0, lr=[8.110901518646297e-06, 0.0004202539646966993, 8.110901518646297e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:26:58,600] [INFO] [timer.py:260:stop] epoch=26/micro_step=2/global_step=340, RunningAvgSamplesPerSec=8.7609737847085, CurrSamplesPerSec=8.435801564615645, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.38, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:27:35,280] [INFO] [logging.py:96:log_dist] [Rank 0] step=350, skipped=0, lr=[8.024566826011837e-06, 0.0004157806645601988, 8.024566826011837e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:27:35,280] [INFO] [timer.py:260:stop] epoch=26/micro_step=12/global_step=350, 
RunningAvgSamplesPerSec=8.760150708120399, CurrSamplesPerSec=8.731716431310115, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 27/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019859075546265, loss: 0.0019840081222355366
-Beginning of Epoch 28/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.94s, TFLOPs: 10.62, Samples/sec: 2.03, Time/seq 0.49s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:28:13,824] [INFO] [logging.py:96:log_dist] [Rank 0] step=360, skipped=0, lr=[7.936363676452024e-06, 0.00041121055318404264, 7.936363676452024e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:28:13,825] [INFO] [timer.py:260:stop] epoch=27/micro_step=9/global_step=360, RunningAvgSamplesPerSec=8.759349400910232, CurrSamplesPerSec=8.719349266456572, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 28/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019837617874146, loss: 0.0019818597938865423
-Beginning of Epoch 29/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.08, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.08, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:28:52,203] [INFO] [logging.py:96:log_dist] [Rank 0] step=370, skipped=0, lr=[7.846343578122364e-06, 0.0004065462993845784, 7.846343578122364e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:28:52,204] [INFO] [timer.py:260:stop] epoch=28/micro_step=6/global_step=370, RunningAvgSamplesPerSec=8.759678646979514, CurrSamplesPerSec=8.735564986231912, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 29/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019816160202026, loss: 0.0019796588458120823
-Beginning of Epoch 30/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:29:30,596] [INFO] [logging.py:96:log_dist] [Rank 0] step=380, skipped=0, lr=[7.754559100225283e-06, 0.0004017906269546778, 7.754559100225283e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:29:30,596] [INFO] [timer.py:260:stop] epoch=29/micro_step=3/global_step=380, RunningAvgSamplesPerSec=8.759915664900369, CurrSamplesPerSec=8.729445375556242, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:30:06,902] [INFO] [logging.py:96:log_dist] [Rank 0] step=390, skipped=0, lr=[7.661063842311183e-06, 0.0003969463130731183, 7.661063842311183e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:30:06,902] [INFO] [timer.py:260:stop] epoch=29/micro_step=13/global_step=390, RunningAvgSamplesPerSec=8.761517407363788, CurrSamplesPerSec=9.79549231866994, MemAllocated=6.82GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 30/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019906759262085, loss: 0.001988681498914957
-Beginning of Epoch 31/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.02, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:30:45,695] [INFO] [logging.py:96:log_dist] [Rank 0] step=400, skipped=0, lr=[7.565912402977827e-06, 0.00039201618668278893, 7.565912402977827e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:30:45,696] [INFO] [timer.py:260:stop] epoch=30/micro_step=10/global_step=400, RunningAvgSamplesPerSec=8.759268605058443, CurrSamplesPerSec=8.731193853218743, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 31/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001999020576477, loss: 0.0019970331341028214
-Beginning of Epoch 32/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:31:24,079] [INFO] [logging.py:96:log_dist] [Rank 0] step=410, skipped=0, lr=[7.469160347986284e-06, 0.0003870031268386676, 7.469160347986284e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:31:24,080] [INFO] [timer.py:260:stop] epoch=31/micro_step=7/global_step=410, RunningAvgSamplesPerSec=8.759518508960396, CurrSamplesPerSec=8.734736112400034, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.26s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 32/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019973516464233, loss: 0.001995313446968794
-Beginning of Epoch 33/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.81s, TFLOPs: 10.99, Samples/sec: 2.10, Time/seq 0.48s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.69s, TFLOPs: 11.36, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:32:02,527] [INFO] [logging.py:96:log_dist] [Rank 0] step=420, skipped=0, lr=[7.370864177812091e-06, 0.00038191006102653317, 7.370864177812091e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:32:02,527] [INFO] [timer.py:260:stop] epoch=32/micro_step=4/global_step=420, RunningAvgSamplesPerSec=8.759433177378359, CurrSamplesPerSec=8.719396848152986, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.37, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.37, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.79, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 33/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0020171403884888, loss: 0.002015142235904932
-Beginning of Epoch 34/100, Total Micro Batches 13
-[2023-12-10 06:32:40,832] [INFO] [logging.py:96:log_dist] [Rank 0] step=430, skipped=0, lr=[7.2710812946505745e-06, 0.00037673996345339764, 7.2710812946505745e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:32:40,833] [INFO] [timer.py:260:stop] epoch=33/micro_step=1/global_step=430, RunningAvgSamplesPerSec=8.76014384902512, CurrSamplesPerSec=8.456370482243685, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.69s, TFLOPs: 11.35, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:33:17,698] [INFO] [logging.py:96:log_dist] [Rank 0] step=440, skipped=0, lr=[7.169869968895575e-06, 0.0003714958533106515, 7.169869968895575e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:33:17,698] [INFO] [timer.py:260:stop] epoch=33/micro_step=11/global_step=440, RunningAvgSamplesPerSec=8.758518708557201, CurrSamplesPerSec=8.718908593993401, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.76, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 34/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0020219087600708, loss: 0.002019959269091487
-Beginning of Epoch 35/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.97s, TFLOPs: 10.55, Samples/sec: 2.02, Time/seq 0.50s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:33:56,322] [INFO] [logging.py:96:log_dist] [Rank 0] step=450, skipped=0, lr=[7.067289305111184e-06, 0.00036618079301094214, 7.067289305111184e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:33:56,323] [INFO] [timer.py:260:stop] epoch=34/micro_step=8/global_step=450, RunningAvgSamplesPerSec=8.757549033653733, CurrSamplesPerSec=8.716146060455438, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.72s, TFLOPs: 11.26, Samples/sec: 2.15, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.78, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 35/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0020458698272705, loss: 0.002043678890913725
-Beginning of Epoch 36/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.83s, TFLOPs: 10.92, Samples/sec: 2.09, Time/seq 0.48s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.38, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:34:34,856] [INFO] [logging.py:96:log_dist] [Rank 0] step=460, skipped=0, lr=[6.963399207516341e-06, 0.00036079788639981037, 6.963399207516341e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:34:34,857] [INFO] [timer.py:260:stop] epoch=35/micro_step=5/global_step=460, RunningAvgSamplesPerSec=8.757098448726934, CurrSamplesPerSec=8.732749275023798, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.78, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 36/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0020115375518799, loss: 0.002009537536650896
-Beginning of Epoch 37/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:35:13,262] [INFO] [logging.py:96:log_dist] [Rank 0] step=470, skipped=0, lr=[6.858260345002454e-06, 0.0003553502769431323, 6.858260345002454e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:35:13,262] [INFO] [timer.py:260:stop] epoch=36/micro_step=2/global_step=470, RunningAvgSamplesPerSec=8.757270211763464, CurrSamplesPerSec=8.428941009539459, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.02, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:35:49,937] [INFO] [logging.py:96:log_dist] [Rank 0] step=480, skipped=0, lr=[6.7519341157044806e-06, 0.0003498411458914239, 6.7519341157044806e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:35:49,938] [INFO] [timer.py:260:stop] epoch=36/micro_step=12/global_step=480, RunningAvgSamplesPerSec=8.756782242458922, CurrSamplesPerSec=8.739350356146602, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.76, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 37/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019840002059937, loss: 0.0019820639863610268
-Beginning of Epoch 38/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:36:28,334] [INFO] [logging.py:96:log_dist] [Rank 0] step=490, skipped=0, lr=[6.6444826111461464e-06, 0.00034427371042208013, 6.6444826111461464e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:36:28,334] [INFO] [timer.py:260:stop] epoch=37/micro_step=9/global_step=490, RunningAvgSamplesPerSec=8.756981470477982, CurrSamplesPerSec=8.730753686516143, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.80, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 38/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019906759262085, loss: 0.0019887189846485853
-Beginning of Epoch 39/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:37:06,720] [INFO] [logging.py:96:log_dist] [Rank 0] step=500, skipped=0, lr=[6.535968579980235e-06, 0.0003386512217606339, 6.535968579980235e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:37:06,720] [INFO] [timer.py:260:stop] epoch=38/micro_step=6/global_step=500, RunningAvgSamplesPerSec=8.757241627741788, CurrSamplesPerSec=8.741387451396294, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:37:21,338] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
-Model Parameters: 6.927 B, Latency: 3.62s, TFLOPs: 11.57, Samples/sec: 2.21, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:37:31,899] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
-Model Parameters: 6.927 B, Latency: 3.22s, TFLOPs: 12.99, Samples/sec: 2.48, Time/seq 0.40s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 39/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.00199294090271, loss: 0.001990982796996832
-Beginning of Epoch 40/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:37:45,004] [INFO] [logging.py:96:log_dist] [Rank 0] step=510, skipped=2, lr=[6.448434900045946e-06, 0.0003341157979298418, 6.448434900045946e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:37:45,004] [INFO] [timer.py:260:stop] epoch=39/micro_step=3/global_step=510, RunningAvgSamplesPerSec=8.757979560309195, CurrSamplesPerSec=8.734847529360374, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:38:21,288] [INFO] [logging.py:96:log_dist] [Rank 0] step=520, skipped=2, lr=[6.338168404565823e-06, 0.00032840250800859184, 6.338168404565823e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:38:21,288] [INFO] [timer.py:260:stop] epoch=39/micro_step=13/global_step=520, RunningAvgSamplesPerSec=8.759310897972284, CurrSamplesPerSec=9.807558875395493, MemAllocated=6.82GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.26s, TFLOPs: 12.83, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 40/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019882917404175, loss: 0.0019863243214786053
-Beginning of Epoch 41/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.38, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:39:00,089] [INFO] [logging.py:96:log_dist] [Rank 0] step=530, skipped=2, lr=[6.22701826134884e-06, 0.0003226434332305098, 6.22701826134884e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:39:00,090] [INFO] [timer.py:260:stop] epoch=40/micro_step=10/global_step=530, RunningAvgSamplesPerSec=8.757646010895444, CurrSamplesPerSec=8.723811742389577, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 41/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0020133256912231, loss: 0.002011272357776761
-Beginning of Epoch 42/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.08, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.95s, TFLOPs: 10.60, Samples/sec: 2.03, Time/seq 0.49s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:39:38,628] [INFO] [logging.py:96:log_dist] [Rank 0] step=540, skipped=2, lr=[6.115049378948603e-06, 0.00031684193673308823, 6.115049378948603e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:39:38,628] [INFO] [timer.py:260:stop] epoch=41/micro_step=7/global_step=540, RunningAvgSamplesPerSec=8.757203294152472, CurrSamplesPerSec=8.726267087197247, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.80, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 42/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001994252204895, loss: 0.001992281526327133
-Beginning of Epoch 43/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.81s, TFLOPs: 11.00, Samples/sec: 2.10, Time/seq 0.48s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:40:17,078] [INFO] [logging.py:96:log_dist] [Rank 0] step=550, skipped=2, lr=[6.002327144039352e-06, 0.00031100140642690937, 6.002327144039352e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:40:17,078] [INFO] [timer.py:260:stop] epoch=42/micro_step=4/global_step=550, RunningAvgSamplesPerSec=8.757276849684748, CurrSamplesPerSec=8.7264809814499, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.79, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 43/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019875764846802, loss: 0.0019855573773384094
-Beginning of Epoch 44/100, Total Micro Batches 13
-[2023-12-10 06:40:55,345] [INFO] [logging.py:96:log_dist] [Rank 0] step=560, skipped=2, lr=[5.888917383231966e-06, 0.0003051252530172003, 5.888917383231966e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:40:55,346] [INFO] [timer.py:260:stop] epoch=43/micro_step=1/global_step=560, RunningAvgSamplesPerSec=8.758010154262562, CurrSamplesPerSec=8.452355666436262, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:41:32,147] [INFO] [logging.py:96:log_dist] [Rank 0] step=570, skipped=2, lr=[5.774886324633097e-06, 0.00029921690801207757, 5.774886324633097e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:41:32,147] [INFO] [timer.py:260:stop] epoch=43/micro_step=11/global_step=570, RunningAvgSamplesPerSec=8.757055866331749, CurrSamplesPerSec=8.73437971090679, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 44/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019879341125488, loss: 0.00198596203699708
-Beginning of Epoch 45/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.02, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.71s, TFLOPs: 11.30, Samples/sec: 2.16, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.70s, TFLOPs: 11.32, Samples/sec: 2.16, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:42:10,623] [INFO] [logging.py:96:log_dist] [Rank 0] step=580, skipped=2, lr=[5.660300559169808e-06, 0.00029327982171864286, 5.660300559169808e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:42:10,623] [INFO] [timer.py:260:stop] epoch=44/micro_step=8/global_step=580, RunningAvgSamplesPerSec=8.756930598042342, CurrSamplesPerSec=8.738201600533337, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.37, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.76, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 45/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019927024841309, loss: 0.0019906465895473957
-Beginning of Epoch 46/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.84s, TFLOPs: 10.89, Samples/sec: 2.08, Time/seq 0.48s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.82s, TFLOPs: 10.97, Samples/sec: 2.10, Time/seq 0.48s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:42:49,141] [INFO] [logging.py:96:log_dist] [Rank 0] step=590, skipped=2, lr=[5.54522700170235e-06, 0.00028731746122810105, 5.54522700170235e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:42:49,142] [INFO] [timer.py:260:stop] epoch=45/micro_step=5/global_step=590, RunningAvgSamplesPerSec=8.75664799117546, CurrSamplesPerSec=8.730038155374544, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 46/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019900798797607, loss: 0.0019881378393620253
-Beginning of Epoch 47/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:43:27,520] [INFO] [logging.py:96:log_dist] [Rank 0] step=600, skipped=2, lr=[5.429732851947769e-06, 0.00028133330839107606, 5.429732851947769e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:43:27,521] [INFO] [timer.py:260:stop] epoch=46/micro_step=2/global_step=600, RunningAvgSamplesPerSec=8.756893916122376, CurrSamplesPerSec=8.444532443892959, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:43:56,811] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
-Model Parameters: 6.927 B, Latency: 3.62s, TFLOPs: 11.56, Samples/sec: 2.21, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.38, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:44:04,158] [INFO] [logging.py:96:log_dist] [Rank 0] step=610, skipped=3, lr=[5.325484249888706e-06, 0.0002759318264191039, 5.325484249888706e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:44:04,158] [INFO] [timer.py:260:stop] epoch=46/micro_step=12/global_step=610, RunningAvgSamplesPerSec=8.756652745113481, CurrSamplesPerSec=8.736660159517372, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:44:07,385] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
-Model Parameters: 6.927 B, Latency: 3.22s, TFLOPs: 12.98, Samples/sec: 2.48, Time/seq 0.40s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 47/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001990795135498, loss: 0.0019888815004378557
-Beginning of Epoch 48/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:44:42,517] [INFO] [logging.py:96:log_dist] [Rank 0] step=620, skipped=4, lr=[5.220998907987013e-06, 0.0002705180781340421, 5.220998907987013e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:44:42,518] [INFO] [timer.py:260:stop] epoch=47/micro_step=9/global_step=620, RunningAvgSamplesPerSec=8.757010418347626, CurrSamplesPerSec=8.725467771521235, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 48/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019954442977905, loss: 0.0019934314768761396
-Beginning of Epoch 49/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.91s, TFLOPs: 10.70, Samples/sec: 2.04, Time/seq 0.49s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:45:21,022] [INFO] [logging.py:96:log_dist] [Rank 0] step=630, skipped=4, lr=[5.104686542050893e-06, 0.00026449153067621207, 5.104686542050893e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:45:21,023] [INFO] [timer.py:260:stop] epoch=48/micro_step=6/global_step=630, RunningAvgSamplesPerSec=8.756772702957846, CurrSamplesPerSec=8.743277978279146, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.26s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 49/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001994013786316, loss: 0.001992016565054655
-Beginning of Epoch 50/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:45:59,393] [INFO] [logging.py:96:log_dist] [Rank 0] step=640, skipped=4, lr=[4.988210847050295e-06, 0.00025845652057255415, 4.988210847050295e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:45:59,393] [INFO] [timer.py:260:stop] epoch=49/micro_step=3/global_step=640, RunningAvgSamplesPerSec=8.757038711052212, CurrSamplesPerSec=8.733600503696909, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:46:35,665] [INFO] [logging.py:96:log_dist] [Rank 0] step=650, skipped=4, lr=[4.8716398415111015e-06, 0.0002524165720990208, 4.8716398415111015e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:46:35,665] [INFO] [timer.py:260:stop] epoch=49/micro_step=13/global_step=650, RunningAvgSamplesPerSec=8.758165577180671, CurrSamplesPerSec=9.784767057072504, MemAllocated=6.82GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.80, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 50/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019865036010742, loss: 0.001984527800232172
-Beginning of Epoch 51/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.81s, TFLOPs: 10.98, Samples/sec: 2.10, Time/seq 0.48s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.70s, TFLOPs: 11.31, Samples/sec: 2.16, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.69s, TFLOPs: 11.34, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:47:14,546] [INFO] [logging.py:96:log_dist] [Rank 0] step=660, skipped=4, lr=[4.75504159961786e-06, 0.00024637521241543313, 4.75504159961786e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:47:14,546] [INFO] [timer.py:260:stop] epoch=50/micro_step=10/global_step=660, RunningAvgSamplesPerSec=8.756568923899803, CurrSamplesPerSec=8.740621223697266, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 51/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019893646240234, loss: 0.0019873580895364285
-Beginning of Epoch 52/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:47:52,955] [INFO] [logging.py:96:log_dist] [Rank 0] step=670, skipped=4, lr=[4.6384842114603854e-06, 0.00024033596950571943, 4.6384842114603854e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:47:52,956] [INFO] [timer.py:260:stop] epoch=51/micro_step=7/global_step=670, RunningAvgSamplesPerSec=8.75669245395709, CurrSamplesPerSec=8.717789554116033, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.38, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 52/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001989722251892, loss: 0.001987772760912776
-Beginning of Epoch 53/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.08, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:48:31,341] [INFO] [logging.py:96:log_dist] [Rank 0] step=680, skipped=4, lr=[4.522035743271063e-06, 0.00023430237011767165, 4.522035743271063e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:48:31,341] [INFO] [timer.py:260:stop] epoch=52/micro_step=4/global_step=680, RunningAvgSamplesPerSec=8.75687750932279, CurrSamplesPerSec=8.74954647872707, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.38, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.46, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.78, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 53/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019886493682861, loss: 0.0019866677466779947
-Beginning of Epoch 54/100, Total Micro Batches 13
-[2023-12-10 06:49:09,603] [INFO] [logging.py:96:log_dist] [Rank 0] step=690, skipped=4, lr=[4.40576419767608e-06, 0.00022827793770342384, 4.40576419767608e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:49:09,603] [INFO] [timer.py:260:stop] epoch=53/micro_step=1/global_step=690, RunningAvgSamplesPerSec=8.757490332351503, CurrSamplesPerSec=8.455287453771373, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.38, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:49:46,423] [INFO] [logging.py:96:log_dist] [Rank 0] step=700, skipped=4, lr=[4.289737473983813e-06, 0.00022226619036185558, 4.289737473983813e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:49:46,423] [INFO] [timer.py:260:stop] epoch=53/micro_step=11/global_step=700, RunningAvgSamplesPerSec=8.75665710884377, CurrSamplesPerSec=8.730453262728776, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 54/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019887685775757, loss: 0.001986747607588768
-Beginning of Epoch 55/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:50:24,822] [INFO] [logging.py:96:log_dist] [Rank 0] step=710, skipped=4, lr=[4.174023328533537e-06, 0.00021627063878412107, 4.174023328533537e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:50:24,823] [INFO] [timer.py:260:stop] epoch=54/micro_step=8/global_step=710, RunningAvgSamplesPerSec=8.756808692911292, CurrSamplesPerSec=8.744898673889441, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:50:32,122] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
-Model Parameters: 6.927 B, Latency: 3.62s, TFLOPs: 11.56, Samples/sec: 2.21, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:50:42,694] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
-Model Parameters: 6.927 B, Latency: 3.22s, TFLOPs: 12.98, Samples/sec: 2.48, Time/seq 0.40s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 55/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001991629600525, loss: 0.0019896512385457754
-Beginning of Epoch 56/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.01, Samples/sec: 2.10, Time/seq 0.48s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.94s, TFLOPs: 10.63, Samples/sec: 2.03, Time/seq 0.49s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:51:03,302] [INFO] [logging.py:96:log_dist] [Rank 0] step=720, skipped=6, lr=[4.081722485244305e-06, 0.00021148821167068938, 4.081722485244305e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:51:03,303] [INFO] [timer.py:260:stop] epoch=55/micro_step=5/global_step=720, RunningAvgSamplesPerSec=8.756687792333802, CurrSamplesPerSec=8.728982109446067, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 56/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001996636390686, loss: 0.0019946619868278503
-Beginning of Epoch 57/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:51:41,690] [INFO] [logging.py:96:log_dist] [Rank 0] step=730, skipped=6, lr=[3.966741118751625e-06, 0.00020553062791459192, 3.966741118751625e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:51:41,690] [INFO] [timer.py:260:stop] epoch=56/micro_step=2/global_step=730, RunningAvgSamplesPerSec=8.756847290046304, CurrSamplesPerSec=8.425095508943828, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.02, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length:
512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:52:18,389] [INFO] [logging.py:96:log_dist] [Rank 0] step=740, skipped=6, lr=[3.852260951276693e-06, 0.00019959901301951777, 3.852260951276693e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:52:18,390] [INFO] [timer.py:260:stop] epoch=56/micro_step=12/global_step=740, RunningAvgSamplesPerSec=8.756458619656524, CurrSamplesPerSec=8.731193853218743, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512 -***** Evaluating perplexity, Epoch 57/100 ***** -Invalidate trace cache @ step 0: expected module 0, but got module 6 -ppl: 1.0019850730895996, loss: 0.001983076333999634 -Beginning of Epoch 58/100, Total Micro Batches 13 -Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 -[2023-12-10 06:52:56,775] [INFO] [logging.py:96:log_dist] [Rank 0] step=750, skipped=6, lr=[3.7383488360135085e-06, 0.00019369683088152893, 3.7383488360135085e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)] -[2023-12-10 06:52:56,776] [INFO] [timer.py:260:stop] epoch=57/micro_step=9/global_step=750, RunningAvgSamplesPerSec=8.75662969537434, CurrSamplesPerSec=8.728977000167532, MemAllocated=6.88GB, MaxMemAllocated=11.06GB -Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512 
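(Note: the ppl/loss pairs printed at each evaluation above are related by ppl = exp(loss). A minimal sketch in plain Python, using only values copied from this log, that reproduces the reported perplexity from the reported loss:)

    import math

    # loss value copied from the Epoch 51/100 evaluation above
    loss = 0.0019873580895364285
    ppl = math.exp(loss)  # perplexity is the exponential of the mean cross-entropy loss
    print(ppl)            # ~1.0019893, matching the logged ppl up to float precision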
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.77, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 58/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019882917404175, loss: 0.001986331306397915
-Beginning of Epoch 59/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.37, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:53:35,209] [INFO] [logging.py:96:log_dist] [Rank 0] step=760, skipped=6, lr=[3.6250712944295767e-06, 0.00018782752820878634, 3.6250712944295767e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:53:35,209] [INFO] [timer.py:260:stop] epoch=58/micro_step=6/global_step=760, RunningAvgSamplesPerSec=8.756658328593598, CurrSamplesPerSec=8.713949853232839, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.74s, TFLOPs: 11.19, Samples/sec: 2.14, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.80, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 59/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019879341125488, loss: 0.001985935727134347
-Beginning of Epoch 60/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:54:13,668] [INFO] [logging.py:96:log_dist] [Rank 0] step=770, skipped=6, lr=[3.5124944774192407e-06, 0.00018199453250876892, 3.5124944774192407e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
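(Note: the throughput fields in the per-step lines above are simple functions of the measured step latency; TFLOPs comes from a separate model-FLOPs estimate and is not recomputed here. A minimal sketch, using values from a logged step such as "Latency: 3.66s, ..., Batch Size: 8":)

    # illustrative recomputation of two logged fields from one step
    batch_size = 8
    latency_s = 3.66                           # end-to-end time for one micro batch
    samples_per_sec = batch_size / latency_s   # 8 / 3.66 ~= 2.19, the logged Samples/sec
    time_per_seq = latency_s / batch_size      # 3.66 / 8 ~= 0.46s, the logged Time/seq
    print(round(samples_per_sec, 2), round(time_per_seq, 2))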
-[2023-12-10 06:54:13,669] [INFO] [timer.py:260:stop] epoch=59/micro_step=3/global_step=770, RunningAvgSamplesPerSec=8.756605266277033, CurrSamplesPerSec=8.746151208018237, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.38, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:54:49,971] [INFO] [logging.py:96:log_dist] [Rank 0] step=780, skipped=6, lr=[3.4006841266734316e-06, 0.0001762012500867063, 3.4006841266734316e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:54:49,971] [INFO] [timer.py:260:stop] epoch=59/micro_step=13/global_step=780, RunningAvgSamplesPerSec=8.757453970758894, CurrSamplesPerSec=9.769155788573897, MemAllocated=6.82GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.78, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 60/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001989483833313, loss: 0.0019874281715601683
-Beginning of Epoch 61/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:55:28,766] [INFO] [logging.py:96:log_dist] [Rank 0] step=790, skipped=6, lr=[3.2897055362883607e-06, 0.00017045106405639174, 3.2897055362883607e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:55:28,767] [INFO] [timer.py:260:stop] epoch=60/micro_step=10/global_step=790, RunningAvgSamplesPerSec=8.756376319349751, CurrSamplesPerSec=8.735969815808637, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.75, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 61/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001990556716919, loss: 0.0019886246882379055
-Beginning of Epoch 62/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:56:07,203] [INFO] [logging.py:96:log_dist] [Rank 0] step=800, skipped=6, lr=[3.1796235146356122e-06, 0.00016474733236453948, 3.1796235146356122e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:56:07,203] [INFO] [timer.py:260:stop] epoch=61/micro_step=7/global_step=800, RunningAvgSamplesPerSec=8.756400794124774, CurrSamplesPerSec=8.72152552733405, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 62/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019898414611816, loss: 0.0019879003521054983
-Beginning of Epoch 63/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.91s, TFLOPs: 10.70, Samples/sec: 2.04, Time/seq 0.49s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:56:45,711] [INFO] [logging.py:96:log_dist] [Rank 0] step=810, skipped=6, lr=[3.070502346515878e-06, 0.00015909338582983823, 3.070502346515878e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:56:45,712] [INFO] [timer.py:260:stop] epoch=62/micro_step=4/global_step=810, RunningAvgSamplesPerSec=8.756204598529076, CurrSamplesPerSec=8.731304611829303, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:57:07,676] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
-Model Parameters: 6.927 B, Latency: 3.62s, TFLOPs: 11.56, Samples/sec: 2.21, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:57:18,239] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
-Model Parameters: 6.927 B, Latency: 3.23s, TFLOPs: 12.96, Samples/sec: 2.48, Time/seq 0.40s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 63/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001989483833313, loss: 0.0019875008147209883
-Beginning of Epoch 64/100, Total Micro Batches 13
-[2023-12-10 06:57:23,896] [INFO] [logging.py:96:log_dist] [Rank 0] step=820, skipped=8, lr=[2.983940068688427e-06, 0.00015460829371442624, 2.983940068688427e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:57:23,897] [INFO] [timer.py:260:stop] epoch=63/micro_step=1/global_step=820, RunningAvgSamplesPerSec=8.756981945778838, CurrSamplesPerSec=8.459428751363525, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.38, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:58:00,748] [INFO] [logging.py:96:log_dist] [Rank 0] step=830, skipped=8, lr=[2.8767086197927732e-06, 0.0001490522600928898, 2.8767086197927732e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:58:00,749] [INFO] [timer.py:260:stop] epoch=63/micro_step=11/global_step=830, RunningAvgSamplesPerSec=8.756202086066972, CurrSamplesPerSec=8.722553127109675, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.77, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 64/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019956827163696, loss: 0.001993693644180894
-Beginning of Epoch 65/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:58:39,142] [INFO] [logging.py:96:log_dist] [Rank 0] step=840, skipped=8, lr=[2.7706149181985245e-06, 0.00014355517710873183, 2.7706149181985245e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:58:39,142] [INFO] [timer.py:260:stop] epoch=64/micro_step=8/global_step=840, RunningAvgSamplesPerSec=8.756336412694067, CurrSamplesPerSec=8.751983803534918, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.78, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 65/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019856691360474, loss: 0.0019837485160678625
-Beginning of Epoch 66/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:59:17,518] [INFO] [logging.py:96:log_dist] [Rank 0] step=850, skipped=8, lr=[2.665720919639773e-06, 0.00013812025490361516, 2.665720919639773e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:59:17,518] [INFO] [timer.py:260:stop] epoch=65/micro_step=5/global_step=850, RunningAvgSamplesPerSec=8.756513508822625, CurrSamplesPerSec=8.73228338567457, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 66/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019901990890503, loss: 0.0019882130436599255
-Beginning of Epoch 67/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 06:59:55,915] [INFO] [logging.py:96:log_dist] [Rank 0] step=860, skipped=8, lr=[2.562087879257817e-06, 0.0001327506673190579, 2.562087879257817e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 06:59:55,915] [INFO] [timer.py:260:stop] epoch=66/micro_step=2/global_step=860, RunningAvgSamplesPerSec=8.75664056300971, CurrSamplesPerSec=8.422012332790702, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.01, Samples/sec: 2.10, Time/seq 0.48s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:00:32,620] [INFO] [logging.py:96:log_dist] [Rank 0] step=870, skipped=8, lr=[2.459776315829897e-06, 0.00012744955004299983, 2.459776315829897e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:00:32,621] [INFO] [timer.py:260:stop] epoch=66/micro_step=12/global_step=870, RunningAvgSamplesPerSec=8.756298878424012, CurrSamplesPerSec=8.744567649972655, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.26s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 67/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001989722251892, loss: 0.001987697323784232
-Beginning of Epoch 68/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:01:10,990] [INFO] [logging.py:96:log_dist] [Rank 0] step=880, skipped=8, lr=[2.35884597642792e-06, 0.0001222199987786487, 2.35884597642792e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:01:10,991] [INFO] [timer.py:260:stop] epoch=67/micro_step=9/global_step=880, RunningAvgSamplesPerSec=8.756499448900463, CurrSamplesPerSec=8.729208059077616, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.80, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 68/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019900798797607, loss: 0.001988109201192856
-Beginning of Epoch 69/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:01:49,382] [INFO] [logging.py:96:log_dist] [Rank 0] step=890, skipped=8, lr=[2.25935580152786e-06, 0.00011706506743667666, 2.25935580152786e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:01:49,382] [INFO] [timer.py:260:stop] epoch=68/micro_step=6/global_step=890, RunningAvgSamplesPerSec=8.756642381475096, CurrSamplesPerSec=8.744226396818043, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.38, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 69/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001988410949707, loss: 0.0019863341003656387
-Beginning of Epoch 70/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.95s, TFLOPs: 10.59, Samples/sec: 2.02, Time/seq 0.49s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:02:27,963] [INFO] [logging.py:96:log_dist] [Rank 0] step=900, skipped=8, lr=[2.161363890590179e-06, 0.00011198776635182272, 2.161363890590179e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:02:27,963] [INFO] [timer.py:260:stop] epoch=69/micro_step=3/global_step=900, RunningAvgSamplesPerSec=8.756286711986952, CurrSamplesPerSec=8.717809372667588, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.70s, TFLOPs: 11.31, Samples/sec: 2.16, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.46, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.37, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.69s, TFLOPs: 11.35, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:03:04,351] [INFO] [logging.py:96:log_dist] [Rank 0] step=910, skipped=8, lr=[2.0649274681313685e-06, 0.00010699106052494137, 2.0649274681313685e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:03:04,352] [INFO] [timer.py:260:stop] epoch=69/micro_step=13/global_step=910, RunningAvgSamplesPerSec=8.756806097447003, CurrSamplesPerSec=9.774563521792176, MemAllocated=6.82GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.78, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 70/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019901990890503, loss: 0.0019881760235875845
-Beginning of Epoch 71/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:03:43,146] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
-[2023-12-10 07:03:43,147] [INFO] [logging.py:96:log_dist] [Rank 0] step=920, skipped=9, lr=[1.9795111951706703e-06, 0.00010256534690003472, 1.9795111951706703e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:03:43,147] [INFO] [timer.py:260:stop] epoch=70/micro_step=10/global_step=920, RunningAvgSamplesPerSec=8.755943526469096, CurrSamplesPerSec=8.849367462589164, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.62s, TFLOPs: 11.57, Samples/sec: 2.21, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.70s, TFLOPs: 11.33, Samples/sec: 2.16, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:03:53,750] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
-Model Parameters: 6.927 B, Latency: 3.23s, TFLOPs: 12.98, Samples/sec: 2.48, Time/seq 0.40s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 71/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001989483833313, loss: 0.001987537369132042
-Beginning of Epoch 72/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.02, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:04:21,600] [INFO] [logging.py:96:log_dist] [Rank 0] step=930, skipped=10, lr=[1.8954408997747187e-06, 9.820937304532221e-05, 1.8954408997747187e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:04:21,601] [INFO] [timer.py:260:stop] epoch=71/micro_step=7/global_step=930, RunningAvgSamplesPerSec=8.755995279168372, CurrSamplesPerSec=8.723122293719557, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.77, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 72/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001994013786316, loss: 0.001991999102756381
-Beginning of Epoch 73/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:04:59,997] [INFO] [logging.py:96:log_dist] [Rank 0] step=940, skipped=10, lr=[1.8036564218776367e-06, 9.345370061542158e-05, 1.8036564218776367e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:04:59,997] [INFO] [timer.py:260:stop] epoch=72/micro_step=4/global_step=940, RunningAvgSamplesPerSec=8.75611402814881, CurrSamplesPerSec=8.725566472558208, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 73/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019896030426025, loss: 0.0019876346923410892
-Beginning of Epoch 74/100, Total Micro Batches 13
-[2023-12-10 07:05:38,237] [INFO] [logging.py:96:log_dist] [Rank 0] step=950, skipped=10, lr=[1.7136363235479782e-06, 8.878944681595741e-05, 1.7136363235479782e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:05:38,237] [INFO] [timer.py:260:stop] epoch=73/micro_step=1/global_step=950, RunningAvgSamplesPerSec=8.756619866389785, CurrSamplesPerSec=8.46787845644278, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:06:15,033] [INFO] [logging.py:96:log_dist] [Rank 0] step=960, skipped=10, lr=[1.6254331739881644e-06, 8.421933543980126e-05, 1.6254331739881644e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:06:15,034] [INFO] [timer.py:260:stop] epoch=73/micro_step=11/global_step=960, RunningAvgSamplesPerSec=8.756076195954643, CurrSamplesPerSec=8.735572377446898, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 74/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019892454147339, loss: 0.001987285679206252
-Beginning of Epoch 75/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:06:53,402] [INFO] [logging.py:96:log_dist] [Rank 0] step=970, skipped=10, lr=[1.5390984813537033e-06, 7.974603530330068e-05, 1.5390984813537033e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:06:53,403] [INFO] [timer.py:260:stop] epoch=74/micro_step=8/global_step=970, RunningAvgSamplesPerSec=8.756260297649952, CurrSamplesPerSec=8.730096642804085, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
+Model Parameters: 6.927 B, Latency: 3.63s, TFLOPs: 11.52, Samples/sec: 2.20, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
Model Parameters: 6.927 B, Latency: 3.26s, TFLOPs: 12.83, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 75/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019913911819458, loss: 0.0019894628785550594
-Beginning of Epoch 76/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:07:31,751] [INFO] [logging.py:96:log_dist] [Rank 0] step=980, skipped=10, lr=[1.4546826626738797e-06, 7.537215868776579e-05, 1.4546826626738797e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:07:31,752] [INFO] [timer.py:260:stop] epoch=75/micro_step=5/global_step=980, RunningAvgSamplesPerSec=8.756470841404647, CurrSamplesPerSec=8.740949671870188, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 76/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019885301589966, loss: 0.0019865629728883505
-Beginning of Epoch 77/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:08:10,258] [INFO] [logging.py:96:log_dist] [Rank 0] step=990, skipped=10, lr=[1.3722350144096164e-06, 7.110025981396976e-05, 1.3722350144096164e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:08:10,258] [INFO] [timer.py:260:stop] epoch=76/micro_step=2/global_step=990, RunningAvgSamplesPerSec=8.756344040689795, CurrSamplesPerSec=8.187218792150677, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.91s, TFLOPs: 10.71, Samples/sec: 2.05, Time/seq 0.49s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:08:46,927] [INFO] [logging.py:96:log_dist] [Rank 0] step=1000, skipped=10, lr=[1.2918036836657442e-06, 6.69328333505567e-05, 1.2918036836657442e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:08:46,928] [INFO] [timer.py:260:stop] epoch=76/micro_step=12/global_step=1000, RunningAvgSamplesPerSec=8.756129613928351, CurrSamplesPerSec=8.727611334200084, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 77/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001990795135498, loss: 0.0019888202659785748
-Beginning of Epoch 78/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:09:25,336] [INFO] [logging.py:96:log_dist] [Rank 0] step=1010, skipped=10, lr=[1.2134356400744368e-06, 6.28723129572247e-05, 1.2134356400744368e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:09:25,336] [INFO] [timer.py:260:stop] epoch=77/micro_step=9/global_step=1010, RunningAvgSamplesPerSec=8.756216438645868, CurrSamplesPerSec=8.714569387346051, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.70s, TFLOPs: 11.31, Samples/sec: 2.16, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.80, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 78/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019917488098145, loss: 0.0019897310994565487
-Beginning of Epoch 79/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:10:03,783] [INFO] [logging.py:96:log_dist] [Rank 0] step=1020, skipped=10, lr=[1.1371766483662815e-06, 5.89210698635379e-05, 1.1371766483662815e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:10:03,783] [INFO] [timer.py:260:stop] epoch=78/micro_step=6/global_step=1020, RunningAvgSamplesPerSec=8.756219272074173, CurrSamplesPerSec=8.7231228606556, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:10:18,429] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
-Model Parameters: 6.927 B, Latency: 3.62s, TFLOPs: 11.56, Samples/sec: 2.21, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:10:28,988] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
-Model Parameters: 6.927 B, Latency: 3.22s, TFLOPs: 12.99, Samples/sec: 2.48, Time/seq 0.40s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 79/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019909143447876, loss: 0.001988966017961502
-Beginning of Epoch 80/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:10:42,096] [INFO] [logging.py:96:log_dist] [Rank 0] step=1030, skipped=12, lr=[1.077717940651778e-06, 5.5840307805791605e-05, 1.077717940651778e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:10:42,096] [INFO] [timer.py:260:stop] epoch=79/micro_step=3/global_step=1030, RunningAvgSamplesPerSec=8.756544002479464, CurrSamplesPerSec=8.74106238594404, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:11:18,355] [INFO] [logging.py:96:log_dist] [Rank 0] step=1040, skipped=12, lr=[1.0053666214803133e-06, 5.209153479172607e-05, 1.0053666214803133e-06], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:11:18,355] [INFO] [timer.py:260:stop] epoch=79/micro_step=13/global_step=1040, RunningAvgSamplesPerSec=8.757281001737269, CurrSamplesPerSec=9.795721805549514, MemAllocated=6.82GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 80/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.00199294090271, loss: 0.0019909366965293884
-Beginning of Epoch 81/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:11:57,131] [INFO] [logging.py:96:log_dist] [Rank 0] step=1050, skipped=12, lr=[9.352458606395539e-07, 4.8458334748163416e-05, 9.352458606395539e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:11:57,132] [INFO] [timer.py:260:stop] epoch=80/micro_step=10/global_step=1050, RunningAvgSamplesPerSec=8.756501458995137, CurrSamplesPerSec=8.736748877163606, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 81/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019934177398682, loss: 0.001991452882066369
-Beginning of Epoch 82/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.08, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:12:35,519] [INFO] [logging.py:96:log_dist] [Rank 0] step=1060, skipped=12, lr=[8.673966066792045e-07, 4.494282936161681e-05, 8.673966066792045e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:12:35,519] [INFO] [timer.py:260:stop] epoch=81/micro_step=7/global_step=1060, RunningAvgSamplesPerSec=8.756615415835967, CurrSamplesPerSec=8.737323310651409, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.46, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.46, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.80, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 82/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019924640655518, loss: 0.0019904817454516888
-Beginning of Epoch 83/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:13:13,884] [INFO] [logging.py:96:log_dist] [Rank 0] step=1070, skipped=12, lr=[8.018584816529132e-07, 4.154707158823384e-05, 8.018584816529132e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:13:13,884] [INFO] [timer.py:260:stop] epoch=82/micro_step=4/global_step=1070, RunningAvgSamplesPerSec=8.756783604985005, CurrSamplesPerSec=8.729509532705126, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 83/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019915103912354, loss: 0.0019894870929419994
-Beginning of Epoch 84/100, Total Micro Batches 13
-[2023-12-10 07:13:52,139] [INFO] [logging.py:96:log_dist] [Rank 0] step=1080, skipped=12, lr=[7.38669757980121e-07, 3.8273044454928546e-05, 7.38669757980121e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:13:52,139] [INFO] [timer.py:260:stop] epoch=83/micro_step=1/global_step=1080, RunningAvgSamplesPerSec=8.75719689887582, CurrSamplesPerSec=8.457020006212757, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.95s, TFLOPs: 10.60, Samples/sec: 2.03, Time/seq 0.49s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:14:29,133] [INFO] [logging.py:96:log_dist] [Rank 0] step=1090, skipped=12, lr=[6.778673360960303e-07, 3.512265990134872e-05, 6.778673360960303e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:14:29,133] [INFO] [timer.py:260:stop] epoch=83/micro_step=11/global_step=1090, RunningAvgSamplesPerSec=8.756286116456815, CurrSamplesPerSec=8.726870783883534, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.77, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 84/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019923448562622, loss: 0.001990363933146
-Beginning of Epoch 85/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.02, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:15:07,543] [INFO] [logging.py:96:log_dist] [Rank 0] step=1100, skipped=12, lr=[6.194867229028038e-07, 3.209775766335771e-05, 6.194867229028038e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:15:07,544] [INFO] [timer.py:260:stop] epoch=84/micro_step=8/global_step=1100, RunningAvgSamplesPerSec=8.756382314992669, CurrSamplesPerSec=8.733562427841314, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.78, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 85/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019913911819458, loss: 0.001989389769732952
-Beginning of Epoch 86/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:15:45,978] [INFO] [logging.py:96:log_dist] [Rank 0] step=1110, skipped=12, lr=[5.63562011034525e-07, 2.920010419868005e-05, 5.63562011034525e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:15:45,978] [INFO] [timer.py:260:stop] epoch=85/micro_step=5/global_step=1110, RunningAvgSamplesPerSec=8.756410409100466, CurrSamplesPerSec=8.67285709697849, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.69s, TFLOPs: 11.34, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.65s, TFLOPs: 11.46, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.79, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 86/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019928216934204, loss: 0.0019907946698367596
-Beginning of Epoch 87/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:16:24,378] [INFO] [logging.py:96:log_dist] [Rank 0] step=1120, skipped=12, lr=[5.101258589480637e-07, 2.6431391655340085e-05, 5.101258589480637e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:16:24,378] [INFO] [timer.py:260:stop] epoch=86/micro_step=2/global_step=1120, RunningAvgSamplesPerSec=8.756499415833007, CurrSamplesPerSec=8.388570251433869, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.82s, TFLOPs: 10.97, Samples/sec: 2.10, Time/seq 0.48s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:16:53,676] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
-Model Parameters: 6.927 B, Latency: 3.62s, TFLOPs: 11.56, Samples/sec: 2.21, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:17:01,012] [INFO] [logging.py:96:log_dist] [Rank 0] step=1130, skipped=13, lr=[4.641868621121236e-07, 2.40511327519235e-05, 4.641868621121236e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:17:01,012] [INFO] [timer.py:260:stop] epoch=86/micro_step=12/global_step=1130, RunningAvgSamplesPerSec=8.756386303215358, CurrSamplesPerSec=8.744179683099883, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:17:04,241] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
-Model Parameters: 6.927 B, Latency: 3.23s, TFLOPs: 12.97, Samples/sec: 2.48, Time/seq 0.40s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 87/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019917488098145, loss: 0.0019897359889000654
-Beginning of Epoch 88/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.71s, TFLOPs: 11.28, Samples/sec: 2.16, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:17:39,409] [INFO] [logging.py:96:log_dist] [Rank 0] step=1140, skipped=14, lr=[4.203106239268923e-07, 2.1777752535072138e-05, 4.203106239268923e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:17:39,409] [INFO] [timer.py:260:stop] epoch=87/micro_step=9/global_step=1140, RunningAvgSamplesPerSec=8.756500867976731, CurrSamplesPerSec=8.72198516834522, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.26s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 88/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001994252204895, loss: 0.0019922100473195314
-Beginning of Epoch 89/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.69s, TFLOPs: 11.35, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:18:17,827] [INFO] [logging.py:96:log_dist] [Rank 0] step=1150, skipped=14, lr=[3.7400374289118037e-07, 1.9378432274154422e-05, 3.7400374289118037e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:18:17,828] [INFO] [timer.py:260:stop] epoch=88/micro_step=6/global_step=1150, RunningAvgSamplesPerSec=8.756553998326424, CurrSamplesPerSec=8.752224072205832, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 89/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019944906234741, loss: 0.0019924496300518513
-Beginning of Epoch 90/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:18:56,199] [INFO] [logging.py:96:log_dist] [Rank 0] step=1160, skipped=14, lr=[3.302961183278485e-07, 1.7113788514396295e-05, 3.302961183278485e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:18:56,200] [INFO] [timer.py:260:stop] epoch=89/micro_step=3/global_step=1160, RunningAvgSamplesPerSec=8.756692419631815, CurrSamplesPerSec=8.73828409148986, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:19:33,769] [INFO] [logging.py:96:log_dist] [Rank 0] step=1170, skipped=14, lr=[2.892132742588155e-07, 1.4985143743980079e-05, 2.892132742588155e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:19:33,877] [INFO] [timer.py:260:stop] epoch=89/micro_step=13/global_step=1170, RunningAvgSamplesPerSec=8.757091347986577, CurrSamplesPerSec=9.484395319284761, MemAllocated=6.82GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.38s, TFLOPs: 12.40, Samples/sec: 2.37, Time/seq 0.42s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 90/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001993179321289, loss: 0.0019911343697458506
-Beginning of Epoch 91/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.07, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.98s, TFLOPs: 10.53, Samples/sec: 2.01, Time/seq 0.50s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.69s, TFLOPs: 11.35, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:20:12,884] [INFO] [logging.py:96:log_dist] [Rank 0] step=1180, skipped=14, lr=[2.507792019081123e-07, 1.299374103150841e-05, 2.507792019081123e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:20:13,418] [INFO] [timer.py:260:stop] epoch=90/micro_step=10/global_step=1180, RunningAvgSamplesPerSec=8.754870497437546, CurrSamplesPerSec=7.608543503867855, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 4.21s, TFLOPs: 9.95, Samples/sec: 1.90, Time/seq 0.53s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.37, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.80, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 91/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019943714141846, loss: 0.0019924272783100605
-Beginning of Epoch 92/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.09, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:20:53,518] [INFO] [logging.py:96:log_dist] [Rank 0] step=1190, skipped=14, lr=[2.1501634569167147e-07, 1.1140743300086603e-05, 2.1501634569167147e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:20:53,518] [INFO] [timer.py:260:stop] epoch=91/micro_step=7/global_step=1190, RunningAvgSamplesPerSec=8.754980631984049, CurrSamplesPerSec=8.704417316882472, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.38, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.78, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 92/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.00199294090271, loss: 0.0019910093396902084
-Beginning of Epoch 93/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.83s, TFLOPs: 10.94, Samples/sec: 2.09, Time/seq 0.48s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.38, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:21:35,397] [INFO] [logging.py:96:log_dist] [Rank 0] step=1200, skipped=14, lr=[1.8194559011042718e-07, 9.42723264820866e-06, 1.8194559011042718e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:21:35,398] [INFO] [timer.py:260:stop] epoch=92/micro_step=4/global_step=1200, RunningAvgSamplesPerSec=8.754961520687639, CurrSamplesPerSec=8.71396229962644, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.38, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.38, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 93/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019943714141846, loss: 0.001992370467633009
-Beginning of Epoch 94/100, Total Micro Batches 13
-[2023-12-10 07:22:13,691] [INFO] [logging.py:96:log_dist] [Rank 0] step=1210, skipped=14, lr=[1.5158624755435507e-07, 7.854209717842232e-06, 1.5158624755435507e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:22:13,691] [INFO] [timer.py:260:stop] epoch=93/micro_step=1/global_step=1210, RunningAvgSamplesPerSec=8.75527129673384, CurrSamplesPerSec=8.44843985102496, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:22:50,502] [INFO] [logging.py:96:log_dist] [Rank 0] step=1220, skipped=14, lr=[1.2395604702457572e-07, 6.422593110081643e-06, 1.2395604702457572e-07], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:22:50,503] [INFO] [timer.py:260:stop] epoch=93/micro_step=11/global_step=1220, RunningAvgSamplesPerSec=8.754831503995051, CurrSamplesPerSec=8.712668630674683, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 94/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.001991629600525, loss: 0.001989718759432435
-Beginning of Epoch 95/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:23:28,889] [INFO] [logging.py:96:log_dist] [Rank 0] step=1230, skipped=14, lr=[9.907112378012256e-08, 5.133218848711013e-06, 9.907112378012256e-08], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:23:28,890] [INFO] [timer.py:260:stop] epoch=94/micro_step=8/global_step=1230, RunningAvgSamplesPerSec=8.754950450079868, CurrSamplesPerSec=8.740926332450499, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:23:36,179] [INFO] [loss_scaler.py:190:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, but hysteresis is 2. Reducing hysteresis to 1
-Model Parameters: 6.927 B, Latency: 3.62s, TFLOPs: 11.56, Samples/sec: 2.21, Time/seq 0.45s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.39, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:23:46,759] [INFO] [loss_scaler.py:183:update_scale] [deepspeed] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 2097152, reducing to 1048576
-Model Parameters: 6.927 B, Latency: 3.23s, TFLOPs: 12.96, Samples/sec: 2.48, Time/seq 0.40s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 95/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019959211349487, loss: 0.0019938969053328037
-Beginning of Epoch 96/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:24:07,238] [INFO] [logging.py:96:log_dist] [Rank 0] step=1240, skipped=16, lr=[8.114960448527267e-08, 4.204642719444179e-06, 8.114960448527267e-08], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:24:07,239] [INFO] [timer.py:260:stop] epoch=95/micro_step=5/global_step=1240, RunningAvgSamplesPerSec=8.755180925255077, CurrSamplesPerSec=8.732734502159342, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 96/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019934177398682, loss: 0.001991386990994215
-Beginning of Epoch 97/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:24:45,616] [INFO] [logging.py:96:log_dist] [Rank 0] step=1250, skipped=16, lr=[6.124171846399337e-08, 3.173146034403801e-06, 6.124171846399337e-08], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:24:45,617] [INFO] [timer.py:260:stop] epoch=96/micro_step=2/global_step=1250, RunningAvgSamplesPerSec=8.755322918509043, CurrSamplesPerSec=8.431795127656661, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.70s, TFLOPs: 11.33, Samples/sec: 2.16, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.68s, TFLOPs: 11.37, Samples/sec: 2.17, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:25:22,339] [INFO] [logging.py:96:log_dist] [Rank 0] step=1260, skipped=16, lr=[4.4115733129859654e-08, 2.2857892813398785e-06, 4.4115733129859654e-08], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:25:22,340] [INFO] [timer.py:260:stop] epoch=96/micro_step=12/global_step=1260, RunningAvgSamplesPerSec=8.755074062821468, CurrSamplesPerSec=8.743711434136921, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.28s, TFLOPs: 12.76, Samples/sec: 2.44, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 97/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019935369491577, loss: 0.001991543686017394
-Beginning of Epoch 98/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.06, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.94s, TFLOPs: 10.63, Samples/sec: 2.03, Time/seq 0.49s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.40, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:26:00,893] [INFO] [logging.py:96:log_dist] [Rank 0] step=1270, skipped=16, lr=[2.9781649576030932e-08, 1.5430906516078202e-06, 2.9781649576030932e-08], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:26:00,894] [INFO] [timer.py:260:stop] epoch=97/micro_step=9/global_step=1270, RunningAvgSamplesPerSec=8.75487287153051, CurrSamplesPerSec=8.724669736538726, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.41, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.81, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 98/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019922256469727, loss: 0.0019902645144611597
-Beginning of Epoch 99/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.05, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.79s, TFLOPs: 11.04, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:26:39,264] [INFO] [logging.py:96:log_dist] [Rank 0] step=1280, skipped=16, lr=[1.8247838503651697e-08, 9.45483860292834e-07, 1.8247838503651697e-08], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:26:39,265] [INFO] [timer.py:260:stop] epoch=98/micro_step=6/global_step=1280, RunningAvgSamplesPerSec=8.755019119398893, CurrSamplesPerSec=8.737869376862575, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.80, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 99/100 *****
-Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019930601119995, loss: 0.001991098280996084
-Beginning of Epoch 100/100, Total Micro Batches 13
-Model Parameters: 6.927 B, Latency: 3.78s, TFLOPs: 11.09, Samples/sec: 2.12, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.80s, TFLOPs: 11.03, Samples/sec: 2.11, Time/seq 0.47s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:27:17,635] [INFO] [logging.py:96:log_dist] [Rank 0] step=1290, skipped=16, lr=[9.52103533358973e-09, 4.933178929321103e-07, 9.52103533358973e-09], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:27:17,636] [INFO] [timer.py:260:stop] epoch=99/micro_step=3/global_step=1290, RunningAvgSamplesPerSec=8.755154382846982, CurrSamplesPerSec=8.744964197945766, MemAllocated=6.88GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.43, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.45, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.67s, TFLOPs: 11.42, Samples/sec: 2.18, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-Model Parameters: 6.927 B, Latency: 3.66s, TFLOPs: 11.44, Samples/sec: 2.19, Time/seq 0.46s, Batch Size: 8, Sequence Length: 512
-[2023-12-10 07:27:53,883] [INFO] [logging.py:96:log_dist] [Rank 0] step=1300, skipped=16, lr=[3.6063362731468903e-09, 1.8685680171745544e-07, 3.6063362731468903e-09], mom=[(0.9, 0.95), (0.9, 0.95), (0.9, 0.95)]
-[2023-12-10 07:27:53,883] [INFO] [timer.py:260:stop] epoch=99/micro_step=13/global_step=1300, RunningAvgSamplesPerSec=8.755775861050216, CurrSamplesPerSec=9.804857102490763, MemAllocated=6.82GB, MaxMemAllocated=11.06GB
-Model Parameters: 6.927 B, Latency: 3.27s, TFLOPs: 12.82, Samples/sec: 2.45, Time/seq 0.41s, Batch Size: 8, Sequence Length: 512
-***** Evaluating perplexity, Epoch 100/100 *****
+***** Evaluating perplexity, Epoch 3/3 *****
Invalidate trace cache @ step 0: expected module 0, but got module 6
-ppl: 1.0019930601119995, loss: 0.001991098513826728
+ppl: 1.031400442123413, loss: 0.03091755136847496
saving the final model ...
-[2023-12-10 07:28:07,345] [INFO] [launch.py:347:main] Process 3367326 exits successfully.
-[2023-12-10 07:28:07,346] [INFO] [launch.py:347:main] Process 3367329 exits successfully.
-[2023-12-10 07:28:07,346] [INFO] [launch.py:347:main] Process 3367328 exits successfully.
-[2023-12-10 07:29:54,359] [INFO] [launch.py:347:main] Process 3367325 exits successfully.
+[2023-12-11 05:42:35,188] [INFO] [launch.py:347:main] Process 1247715 exits successfully.
+[2023-12-11 05:42:35,188] [INFO] [launch.py:347:main] Process 1247716 exits successfully.
+[2023-12-11 05:42:35,189] [INFO] [launch.py:347:main] Process 1247717 exits successfully.
+[2023-12-11 05:44:14,200] [INFO] [launch.py:347:main] Process 1247714 exits successfully.