diff --git "a/exp/log/log-train-2024-01-15-11-07-41-1" "b/exp/log/log-train-2024-01-15-11-07-41-1" new file mode 100644--- /dev/null +++ "b/exp/log/log-train-2024-01-15-11-07-41-1" @@ -0,0 +1,4772 @@ +2024-01-15 11:07:41,463 INFO [train.py:1062] (1/2) Training started +2024-01-15 11:07:41,463 INFO [train.py:1072] (1/2) Device: cuda:1 +2024-01-15 11:07:41,466 INFO [train.py:1081] (1/2) {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 50, 'reset_interval': 200, 'valid_interval': 3000, 'feature_dim': 80, 'subsampling_factor': 4, 'warm_step': 2000, 'env_info': {'k2-version': '1.24.4', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '2989b0b1186fa6022932804f5b39fbb2781ebf42', 'k2-git-date': 'Fri Nov 24 11:34:10 2023', 'lhotse-version': '1.19.0.dev+git.d1ae9c05.dirty', 'torch-version': '1.11.0+cu102', 'torch-cuda-available': True, 'torch-cuda-version': '10.2', 'python-version': '3.9', 'icefall-git-branch': 'dev/aishell-zipformer-bbpe', 'icefall-git-sha1': 'bce81394-clean', 'icefall-git-date': 'Thu Jan 11 09:56:01 2024', 'icefall-path': '/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/icefall-1.0-py3.9.egg', 'k2-path': '/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/k2-1.24.4.dev20231207+cuda10.2.torch1.11.0-py3.9-linux-x86_64.egg/k2/__init__.py', 'lhotse-path': '/star-home/jinzengrui/lib/miniconda3/envs/dev39/lib/python3.9/site-packages/lhotse-1.19.0.dev0+git.d1ae9c05.dirty-py3.9.egg/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-1-1207150822-75498b8c5f-55j4z', 'IP address': '10.177.74.211'}, 'world_size': 2, 'master_port': 12354, 'tensorboard': True, 'num_epochs': 40, 'start_epoch': 1, 'start_batch': 0, 'exp_dir': PosixPath('zipformer_bbpe/exp-context-size-2-lr-epochs-10-spec-aug-20-disable-musan'), 'bpe_model': 'data/lang_bbpe_500/bbpe.model', 'base_lr': 0.045, 'lr_batches': 7500, 'lr_epochs': 10.0, 'ref_duration': 600, 'context_size': 2, 'prune_range': 5, 'lm_scale': 0.25, 'am_scale': 0.0, 'simple_loss_scale': 0.5, 'seed': 42, 'print_diagnostics': False, 'inf_check': False, 'save_every_n': 4000, 'keep_last_k': 30, 'average_period': 200, 'use_fp16': True, 'num_encoder_layers': '2,2,3,4,3,2', 'downsampling_factor': '1,2,4,8,4,2', 'feedforward_dim': '512,768,1024,1536,1024,768', 'num_heads': '4,4,4,8,4,4', 'encoder_dim': '192,256,384,512,384,256', 'query_head_dim': '32', 'value_head_dim': '12', 'pos_head_dim': '4', 'pos_dim': 48, 'encoder_unmasked_dim': '192,192,256,256,256,192', 'cnn_module_kernel': '31,31,15,15,15,31', 'decoder_dim': 512, 'joiner_dim': 512, 'causal': False, 'chunk_size': '16,32,64,-1', 'left_context_frames': '64,128,256,-1', 'manifest_dir': PosixPath('data/fbank'), 'max_duration': 1000, 'bucketing_sampler': True, 'num_buckets': 30, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': True, 'drop_last': True, 'return_cuts': True, 'num_workers': 2, 'enable_spec_aug': True, 'spec_aug_time_warp_factor': 20, 'enable_musan': True, 'blank_id': 0, 'vocab_size': 500} +2024-01-15 11:07:41,466 INFO [train.py:1083] (1/2) About to create model +2024-01-15 11:07:42,008 INFO [train.py:1087] (1/2) Number of model parameters: 65549011 +2024-01-15 11:07:46,250 INFO [train.py:1102] (1/2) Using DDP +2024-01-15 11:07:46,831 INFO [asr_datamodule.py:363] (1/2) About to get train cuts +2024-01-15 11:07:46,848 INFO [asr_datamodule.py:371] (1/2) About to get dev cuts +2024-01-15 11:07:46,849 
INFO [asr_datamodule.py:194] (1/2) About to get Musan cuts +2024-01-15 11:07:49,248 INFO [asr_datamodule.py:199] (1/2) Enable MUSAN +2024-01-15 11:07:49,249 INFO [asr_datamodule.py:222] (1/2) Enable SpecAugment +2024-01-15 11:07:49,249 INFO [asr_datamodule.py:223] (1/2) Time warp factor: 20 +2024-01-15 11:07:49,249 INFO [asr_datamodule.py:233] (1/2) Num frame mask: 10 +2024-01-15 11:07:49,249 INFO [asr_datamodule.py:246] (1/2) About to create train dataset +2024-01-15 11:07:49,249 INFO [asr_datamodule.py:272] (1/2) Using DynamicBucketingSampler. +2024-01-15 11:07:53,203 INFO [asr_datamodule.py:287] (1/2) About to create train dataloader +2024-01-15 11:07:53,204 INFO [asr_datamodule.py:312] (1/2) About to create dev dataset +2024-01-15 11:07:53,967 INFO [asr_datamodule.py:329] (1/2) About to create dev dataloader +2024-01-15 11:08:17,812 INFO [train.py:994] (1/2) Epoch 1, batch 0, loss[loss=7.43, simple_loss=6.763, pruned_loss=6.656, over 24456.00 frames. ], tot_loss[loss=7.43, simple_loss=6.763, pruned_loss=6.656, over 24456.00 frames. ], batch size: 222, lr: 2.25e-02, grad_scale: 1.0 +2024-01-15 11:08:17,812 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 11:08:38,149 INFO [train.py:1026] (1/2) Epoch 1, validation: loss=7.38, simple_loss=6.719, pruned_loss=6.6, over 1622729.00 frames. +2024-01-15 11:08:38,150 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 13586MB +2024-01-15 11:08:40,750 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer2.min_abs, batch_count=0.0, ans=0.2 +2024-01-15 11:08:43,596 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.whiten.whitening_limit, batch_count=0.0, ans=7.5 +2024-01-15 11:08:54,739 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 4.492e+03 4.857e+03 5.193e+03 6.740e+03 7.683e+03, threshold=2.077e+04, percent-clipped=0.0 +2024-01-15 11:08:56,406 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer1.max_abs, batch_count=33.333333333333336, ans=5.020833333333333 +2024-01-15 11:09:03,744 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=512, metric=505.04 vs. limit=7.525 +2024-01-15 11:09:09,683 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 5.412e+02 1.511e+03 4.492e+03 5.717e+03 7.711e+03, threshold=1.797e+04, percent-clipped=0.0 +2024-01-15 11:09:24,424 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=100.25 vs. limit=7.5375 +2024-01-15 11:09:32,097 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=252.05 vs. limit=7.575 +2024-01-15 11:09:38,478 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 5.412e+02 9.988e+02 1.994e+03 4.935e+03 9.600e+03, threshold=7.974e+03, percent-clipped=0.0 +2024-01-15 11:09:40,458 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.whiten, num_groups=1, num_channels=384, metric=25.24 vs. limit=4.053333333333334 +2024-01-15 11:09:40,650 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=414.91 vs. 
limit=7.55 +2024-01-15 11:09:47,531 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.conv_module2.whiten, num_groups=1, num_channels=512, metric=180.22 vs. limit=7.5625 +2024-01-15 11:09:48,213 INFO [train.py:994] (1/2) Epoch 1, batch 50, loss[loss=1.387, simple_loss=1.251, pruned_loss=1.235, over 23904.00 frames. ], tot_loss[loss=3.058, simple_loss=2.816, pruned_loss=2.369, over 1090248.16 frames. ], batch size: 328, lr: 2.48e-02, grad_scale: 0.25 +2024-01-15 11:09:59,173 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer2.prob, batch_count=166.66666666666666, ans=0.4921875 +2024-01-15 11:10:06,647 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.whiten.whitening_limit, batch_count=200.0, ans=4.08 +2024-01-15 11:10:11,009 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=44.09 vs. limit=7.65 +2024-01-15 11:10:24,996 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=233.33333333333334, ans=0.4890625 +2024-01-15 11:10:28,902 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=47.09 vs. limit=7.5875 +2024-01-15 11:10:36,602 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=266.6666666666667, ans=0.29733333333333334 +2024-01-15 11:10:45,328 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=40.09 vs. limit=5.15 +2024-01-15 11:11:01,198 INFO [train.py:994] (1/2) Epoch 1, batch 100, loss[loss=1.033, simple_loss=0.9023, pruned_loss=1.055, over 24450.00 frames. ], tot_loss[loss=1.997, simple_loss=1.816, pruned_loss=1.683, over 1922338.53 frames. ], batch size: 170, lr: 2.70e-02, grad_scale: 0.5 +2024-01-15 11:11:05,536 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 7.447e+01 1.465e+02 3.596e+02 1.658e+03 9.600e+03, threshold=7.192e+02, percent-clipped=2.0 +2024-01-15 11:11:10,626 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=512, metric=23.46 vs. limit=7.625 +2024-01-15 11:11:11,784 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=15.47 vs. limit=7.625 +2024-01-15 11:11:16,340 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.whiten, num_groups=1, num_channels=192, metric=15.81 vs. limit=4.1466666666666665 +2024-01-15 11:11:19,774 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer2.prob, batch_count=366.6666666666667, ans=0.4828125 +2024-01-15 11:11:22,649 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer2.prob, batch_count=366.6666666666667, ans=0.4828125 +2024-01-15 11:11:30,215 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=86.58 vs. 
limit=7.65 +2024-01-15 11:11:31,590 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=234.90 vs. limit=5.2 +2024-01-15 11:11:35,700 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=139.20 vs. limit=7.65 +2024-01-15 11:11:45,570 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=512, metric=37.02 vs. limit=7.6625 +2024-01-15 11:11:46,921 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=92.15 vs. limit=7.6625 +2024-01-15 11:11:48,553 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=253.51 vs. limit=7.6625 +2024-01-15 11:11:54,350 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.whiten, num_groups=1, num_channels=384, metric=6.24 vs. limit=4.173333333333334 +2024-01-15 11:11:58,329 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=103.95 vs. limit=7.6625 +2024-01-15 11:12:00,751 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.balancer.min_positive, batch_count=466.6666666666667, ans=0.24533333333333332 +2024-01-15 11:12:03,734 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=28.38 vs. limit=7.675 +2024-01-15 11:12:04,935 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module2.balancer2.prob, batch_count=466.6666666666667, ans=0.478125 +2024-01-15 11:12:05,700 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=192, metric=48.21 vs. limit=7.675 +2024-01-15 11:12:07,696 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer2.min_positive, batch_count=466.6666666666667, ans=0.09708333333333334 +2024-01-15 11:12:09,598 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=9.41 vs. limit=5.116666666666666 +2024-01-15 11:12:12,283 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=512, metric=46.20 vs. limit=7.85 +2024-01-15 11:12:14,353 INFO [train.py:994] (1/2) Epoch 1, batch 150, loss[loss=0.9605, simple_loss=0.8215, pruned_loss=1.012, over 24538.00 frames. ], tot_loss[loss=1.561, simple_loss=1.401, pruned_loss=1.394, over 2564692.29 frames. ], batch size: 193, lr: 2.93e-02, grad_scale: 0.5 +2024-01-15 11:12:23,041 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.2.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 11:12:36,656 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=384, metric=24.26 vs. limit=7.9 +2024-01-15 11:12:37,917 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=39.57 vs. 
limit=7.7 +2024-01-15 11:12:39,552 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=384, metric=16.26 vs. limit=7.9 +2024-01-15 11:13:00,567 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=66.08 vs. limit=7.725 +2024-01-15 11:13:01,342 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.min_positive, batch_count=600.0, ans=0.048125 +2024-01-15 11:13:13,893 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=146.02 vs. limit=7.7375 +2024-01-15 11:13:18,596 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=633.3333333333334, ans=0.4703125 +2024-01-15 11:13:21,719 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=28.69 vs. limit=7.7375 +2024-01-15 11:13:22,874 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer2.prob, batch_count=633.3333333333334, ans=0.4703125 +2024-01-15 11:13:27,136 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer2.min_positive, batch_count=633.3333333333334, ans=0.09604166666666668 +2024-01-15 11:13:28,921 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=48.49 vs. limit=7.75 +2024-01-15 11:13:30,348 INFO [train.py:994] (1/2) Epoch 1, batch 200, loss[loss=0.8126, simple_loss=0.6926, pruned_loss=0.8128, over 24513.00 frames. ], tot_loss[loss=1.319, simple_loss=1.171, pruned_loss=1.218, over 3064226.17 frames. ], batch size: 229, lr: 3.15e-02, grad_scale: 1.0 +2024-01-15 11:13:31,108 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=56.94 vs. limit=7.75 +2024-01-15 11:13:34,533 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 7.780e+01 1.010e+02 1.260e+02 1.654e+02 3.214e+02, threshold=2.519e+02, percent-clipped=0.0 +2024-01-15 11:13:36,893 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=224.61 vs. limit=5.333333333333333 +2024-01-15 11:13:38,025 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=384, metric=13.34 vs. limit=7.75 +2024-01-15 11:13:42,088 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=29.79 vs. limit=7.75 +2024-01-15 11:13:42,171 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=6.67 vs. limit=5.166666666666667 +2024-01-15 11:13:52,800 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.min_positive, batch_count=700.0, ans=0.095625 +2024-01-15 11:13:53,410 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=106.94 vs. 
limit=7.7625 +2024-01-15 11:13:59,820 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=733.3333333333334, ans=0.465625 +2024-01-15 11:14:04,019 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=733.3333333333334, ans=0.17250000000000001 +2024-01-15 11:14:05,571 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=16.02 vs. limit=7.775 +2024-01-15 11:14:08,226 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.nonlin_attention.balancer.max_positive, batch_count=733.3333333333334, ans=0.7573333333333333 +2024-01-15 11:14:10,528 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=41.77 vs. limit=5.183333333333334 +2024-01-15 11:14:11,684 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=99.93 vs. limit=7.775 +2024-01-15 11:14:12,883 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=51.67 vs. limit=7.7875 +2024-01-15 11:14:22,792 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=17.81 vs. limit=7.7875 +2024-01-15 11:14:43,002 INFO [train.py:994] (1/2) Epoch 1, batch 250, loss[loss=0.8235, simple_loss=0.7024, pruned_loss=0.7756, over 24347.00 frames. ], tot_loss[loss=1.164, simple_loss=1.024, pruned_loss=1.087, over 3440459.13 frames. ], batch size: 298, lr: 3.38e-02, grad_scale: 1.0 +2024-01-15 11:14:47,463 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer2.prob, batch_count=833.3333333333334, ans=0.4609375 +2024-01-15 11:14:55,302 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=15.34 vs. limit=7.8125 +2024-01-15 11:14:59,292 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 11:15:08,362 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=866.6666666666666, ans=0.39166666666666666 +2024-01-15 11:15:09,640 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer1.prob, batch_count=866.6666666666666, ans=0.459375 +2024-01-15 11:15:15,505 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=384, metric=12.44 vs. limit=7.8375 +2024-01-15 11:15:18,510 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=6.50 vs. limit=5.225 +2024-01-15 11:15:22,240 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer_na.min_abs, batch_count=900.0, ans=0.0076 +2024-01-15 11:15:24,217 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.whiten, num_groups=1, num_channels=384, metric=5.10 vs. 
limit=4.36 +2024-01-15 11:15:32,632 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=10.97 vs. limit=8.2 +2024-01-15 11:15:33,338 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer1.prob, batch_count=933.3333333333334, ans=0.45625 +2024-01-15 11:15:33,548 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=3.21 vs. limit=4.373333333333333 +2024-01-15 11:15:33,865 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=71.25 vs. limit=7.85 +2024-01-15 11:15:36,660 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=17.99 vs. limit=8.2 +2024-01-15 11:15:38,300 INFO [scaling.py:1022] (1/2) Whitening: name=encoder_embed.convnext.out_whiten, num_groups=1, num_channels=128, metric=98.60 vs. limit=5.0 +2024-01-15 11:15:40,651 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=384, metric=13.69 vs. limit=7.8625 +2024-01-15 11:15:45,001 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=7.29 vs. limit=5.241666666666666 +2024-01-15 11:15:55,981 INFO [train.py:994] (1/2) Epoch 1, batch 300, loss[loss=0.8129, simple_loss=0.6843, pruned_loss=0.765, over 24504.00 frames. ], tot_loss[loss=1.065, simple_loss=0.9292, pruned_loss=0.9963, over 3743164.50 frames. ], batch size: 204, lr: 3.60e-02, grad_scale: 2.0 +2024-01-15 11:16:00,787 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 9.498e+01 1.343e+02 1.625e+02 2.207e+02 3.846e+02, threshold=3.251e+02, percent-clipped=10.0 +2024-01-15 11:16:04,019 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.attention_skip_rate, batch_count=1000.0, ans=0.1625 +2024-01-15 11:16:08,227 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.self_attn1.whiten.whitening_limit, batch_count=1000.0, ans=8.25 +2024-01-15 11:16:10,789 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.min_positive, batch_count=1033.3333333333333, ans=0.23966666666666667 +2024-01-15 11:16:12,841 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=52.89 vs. limit=7.8875 +2024-01-15 11:16:19,256 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=1033.3333333333333, ans=0.4515625 +2024-01-15 11:16:41,665 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=1100.0, ans=0.4484375 +2024-01-15 11:16:47,676 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=27.28 vs. limit=7.9125 +2024-01-15 11:16:55,924 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=111.13 vs. 
limit=7.925 +2024-01-15 11:17:07,343 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=512, metric=15.56 vs. limit=8.35 +2024-01-15 11:17:09,398 INFO [train.py:994] (1/2) Epoch 1, batch 350, loss[loss=0.8511, simple_loss=0.7139, pruned_loss=0.7747, over 24486.00 frames. ], tot_loss[loss=0.9957, simple_loss=0.862, pruned_loss=0.9289, over 3975645.24 frames. ], batch size: 267, lr: 3.83e-02, grad_scale: 2.0 +2024-01-15 11:17:11,339 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=41.37 vs. limit=7.9375 +2024-01-15 11:17:13,770 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.prob, batch_count=1166.6666666666667, ans=0.4453125 +2024-01-15 11:17:15,824 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=6.99 vs. limit=5.291666666666667 +2024-01-15 11:17:19,527 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.balancer.max_positive, batch_count=1166.6666666666667, ans=0.7616666666666667 +2024-01-15 11:17:23,015 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_whiten.whitening_limit, batch_count=1200.0, ans=7.95 +2024-01-15 11:17:29,787 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=8.68 vs. limit=7.95 +2024-01-15 11:17:30,590 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=1200.0, ans=0.288 +2024-01-15 11:17:35,469 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=3.65 vs. limit=4.48 +2024-01-15 11:17:41,170 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn1.whiten, num_groups=1, num_channels=192, metric=13.54 vs. limit=8.425 +2024-01-15 11:17:44,020 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=198.19 vs. limit=5.616666666666666 +2024-01-15 11:17:44,910 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=1233.3333333333333, ans=0.2876666666666667 +2024-01-15 11:17:53,937 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=1.89 vs. limit=3.19 +2024-01-15 11:17:57,037 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=129.50 vs. limit=7.975 +2024-01-15 11:18:07,723 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer2.min_abs, batch_count=1300.0, ans=0.2195 +2024-01-15 11:18:07,724 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=1300.0, ans=0.4390625 +2024-01-15 11:18:22,120 INFO [train.py:994] (1/2) Epoch 1, batch 400, loss[loss=0.8291, simple_loss=0.6869, pruned_loss=0.7524, over 24492.00 frames. ], tot_loss[loss=0.9441, simple_loss=0.8106, pruned_loss=0.8764, over 4161371.46 frames. 
], batch size: 210, lr: 4.05e-02, grad_scale: 4.0 +2024-01-15 11:18:26,103 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.116e+02 1.629e+02 2.109e+02 2.635e+02 4.747e+02, threshold=4.218e+02, percent-clipped=14.0 +2024-01-15 11:18:40,208 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=6.33 vs. limit=5.683333333333334 +2024-01-15 11:18:57,501 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.out_whiten.whitening_limit, batch_count=1400.0, ans=4.28 +2024-01-15 11:19:03,805 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=384, metric=13.05 vs. limit=8.55 +2024-01-15 11:19:06,476 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.whiten, num_groups=1, num_channels=512, metric=5.91 vs. limit=4.573333333333333 +2024-01-15 11:19:08,890 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=1433.3333333333333, ans=0.3208333333333333 +2024-01-15 11:19:11,995 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=24.65 vs. limit=8.0375 +2024-01-15 11:19:13,585 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=384, metric=10.44 vs. limit=8.0375 +2024-01-15 11:19:17,784 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=25.03 vs. limit=8.0375 +2024-01-15 11:19:19,067 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=6.37 vs. limit=5.733333333333333 +2024-01-15 11:19:21,379 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff3_skip_rate, batch_count=1466.6666666666667, ans=0.067 +2024-01-15 11:19:31,313 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=1.65 vs. limit=3.22 +2024-01-15 11:19:33,746 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_skip_rate, batch_count=1500.0, ans=0.14375 +2024-01-15 11:19:34,825 INFO [train.py:994] (1/2) Epoch 1, batch 450, loss[loss=0.8328, simple_loss=0.6903, pruned_loss=0.7275, over 22494.00 frames. ], tot_loss[loss=0.9082, simple_loss=0.7735, pruned_loss=0.8365, over 4292121.32 frames. ], batch size: 357, lr: 4.28e-02, grad_scale: 4.0 +2024-01-15 11:19:42,371 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=13.42 vs. limit=8.625 +2024-01-15 11:19:43,501 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.conv_module2.whiten, num_groups=1, num_channels=512, metric=38.04 vs. limit=8.0625 +2024-01-15 11:20:01,653 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=1.95 vs. limit=3.23 +2024-01-15 11:20:02,672 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=384, metric=23.40 vs. 
limit=8.0875 +2024-01-15 11:20:03,731 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer2.min_abs, batch_count=1566.6666666666667, ans=0.2235 +2024-01-15 11:20:08,228 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.whiten, num_groups=1, num_channels=384, metric=4.41 vs. limit=4.626666666666667 +2024-01-15 11:20:12,211 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=512, metric=30.41 vs. limit=8.0875 +2024-01-15 11:20:14,542 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass_mid.scale_min, batch_count=1566.6666666666667, ans=0.8451666666666667 +2024-01-15 11:20:17,957 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=1600.0, ans=0.425 +2024-01-15 11:20:21,002 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=16.05 vs. limit=8.1 +2024-01-15 11:20:34,444 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=256, metric=13.47 vs. limit=8.725 +2024-01-15 11:20:36,406 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.min_positive, batch_count=1633.3333333333333, ans=0.044895833333333336 +2024-01-15 11:20:45,784 INFO [train.py:994] (1/2) Epoch 1, batch 500, loss[loss=0.7892, simple_loss=0.6474, pruned_loss=0.6849, over 24415.00 frames. ], tot_loss[loss=0.8828, simple_loss=0.7458, pruned_loss=0.8052, over 4418197.94 frames. ], batch size: 258, lr: 4.49e-02, grad_scale: 8.0 +2024-01-15 11:20:47,934 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.hidden_balancer.prob, batch_count=1666.6666666666667, ans=0.421875 +2024-01-15 11:20:50,434 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.250e+02 1.671e+02 2.007e+02 2.442e+02 4.981e+02, threshold=4.014e+02, percent-clipped=2.0 +2024-01-15 11:20:51,369 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=71.65 vs. limit=5.833333333333333 +2024-01-15 11:20:52,667 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=512, metric=13.74 vs. limit=8.75 +2024-01-15 11:20:59,810 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=1700.0, ans=0.4203125 +2024-01-15 11:21:15,365 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=512, metric=23.44 vs. limit=8.15 +2024-01-15 11:21:18,297 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer2.prob, batch_count=1733.3333333333333, ans=0.41875 +2024-01-15 11:21:21,364 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=384, metric=18.22 vs. limit=8.8 +2024-01-15 11:21:30,623 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=192, metric=19.68 vs. 
limit=5.883333333333334 +2024-01-15 11:21:34,485 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=512, metric=27.62 vs. limit=8.1625 +2024-01-15 11:21:35,405 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=1766.6666666666667, ans=0.4171875 +2024-01-15 11:21:35,944 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=384, metric=25.35 vs. limit=8.1625 +2024-01-15 11:21:53,903 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.balancer.prob, batch_count=1800.0, ans=0.415625 +2024-01-15 11:21:58,268 INFO [train.py:994] (1/2) Epoch 1, batch 550, loss[loss=0.6174, simple_loss=0.5009, pruned_loss=0.533, over 17648.00 frames. ], tot_loss[loss=0.8604, simple_loss=0.7216, pruned_loss=0.7758, over 4499040.36 frames. ], batch size: 77, lr: 4.49e-02, grad_scale: 8.0 +2024-01-15 11:22:07,553 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.conv_module2.whiten, num_groups=1, num_channels=192, metric=12.99 vs. limit=8.1875 +2024-01-15 11:22:10,098 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=9.95 vs. limit=8.1875 +2024-01-15 11:22:12,309 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=1866.6666666666667, ans=0.2813333333333333 +2024-01-15 11:22:17,320 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=19.84 vs. limit=8.9 +2024-01-15 11:22:18,512 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_skip_rate, batch_count=1866.6666666666667, ans=0.13 +2024-01-15 11:22:24,239 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=13.22 vs. limit=8.2 +2024-01-15 11:22:29,001 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=11.14 vs. limit=8.2125 +2024-01-15 11:22:33,070 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer1.min_positive, batch_count=1900.0, ans=0.044062500000000004 +2024-01-15 11:22:49,005 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=29.26 vs. limit=5.966666666666667 +2024-01-15 11:22:51,792 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward2.out_whiten.whitening_limit, batch_count=1933.3333333333333, ans=8.225 +2024-01-15 11:22:51,847 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.whiten, num_groups=1, num_channels=512, metric=6.07 vs. 
limit=4.773333333333333 +2024-01-15 11:22:56,320 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=1966.6666666666667, ans=0.12625 +2024-01-15 11:23:00,359 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.attention_skip_rate, batch_count=1966.6666666666667, ans=0.12625 +2024-01-15 11:23:10,031 INFO [train.py:994] (1/2) Epoch 1, batch 600, loss[loss=0.7404, simple_loss=0.6023, pruned_loss=0.6172, over 24279.00 frames. ], tot_loss[loss=0.8428, simple_loss=0.7023, pruned_loss=0.7493, over 4574067.36 frames. ], batch size: 147, lr: 4.49e-02, grad_scale: 8.0 +2024-01-15 11:23:12,817 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.skip_rate, batch_count=2000.0, ans=0.5 +2024-01-15 11:23:13,326 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=2.98 vs. limit=3.3 +2024-01-15 11:23:13,353 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=11.96 vs. limit=9.0 +2024-01-15 11:23:15,121 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.379e+02 2.269e+02 2.746e+02 3.463e+02 7.914e+02, threshold=5.491e+02, percent-clipped=16.0 +2024-01-15 11:23:18,566 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=27.74 vs. limit=8.25 +2024-01-15 11:23:23,747 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=5.89 vs. limit=4.8133333333333335 +2024-01-15 11:23:30,990 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer2.prob, batch_count=2033.3333333333333, ans=0.4046875 +2024-01-15 11:23:41,836 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.0.self_attn_weights, loss-sum=2.623e+01 +2024-01-15 11:23:43,816 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff3_skip_rate, batch_count=2066.6666666666665, ans=0.053500000000000006 +2024-01-15 11:23:43,818 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.attention_skip_rate, batch_count=2066.6666666666665, ans=0.12250000000000001 +2024-01-15 11:23:44,044 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward2.out_whiten.whitening_limit, batch_count=2066.6666666666665, ans=8.275 +2024-01-15 11:23:51,067 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.conv_module2.whiten, num_groups=1, num_channels=512, metric=19.42 vs. limit=8.275 +2024-01-15 11:23:53,648 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=384, metric=13.20 vs. limit=8.2875 +2024-01-15 11:23:55,438 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.prob, batch_count=2100.0, ans=0.4015625 +2024-01-15 11:24:15,816 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.self_attn2.whiten, num_groups=1, num_channels=512, metric=13.98 vs. 
limit=9.1 +2024-01-15 11:24:20,922 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=2166.6666666666665, ans=0.3984375 +2024-01-15 11:24:20,949 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=2166.6666666666665, ans=0.2783333333333333 +2024-01-15 11:24:22,064 INFO [train.py:994] (1/2) Epoch 1, batch 650, loss[loss=0.6833, simple_loss=0.5602, pruned_loss=0.5453, over 23995.00 frames. ], tot_loss[loss=0.8245, simple_loss=0.684, pruned_loss=0.7201, over 4634439.24 frames. ], batch size: 131, lr: 4.49e-02, grad_scale: 8.0 +2024-01-15 11:24:47,797 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.balancer1.prob, batch_count=2200.0, ans=0.396875 +2024-01-15 11:24:58,572 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=30.98 vs. limit=8.3375 +2024-01-15 11:24:59,331 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff2_skip_rate, batch_count=2233.3333333333335, ans=0.049749999999999996 +2024-01-15 11:25:01,284 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=32.19 vs. limit=8.3375 +2024-01-15 11:25:05,195 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=512, metric=12.88 vs. limit=9.2 +2024-01-15 11:25:09,602 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=2266.6666666666665, ans=0.39375 +2024-01-15 11:25:11,461 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.whiten, num_groups=1, num_channels=512, metric=6.45 vs. limit=4.906666666666666 +2024-01-15 11:25:19,667 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=512, metric=10.69 vs. limit=9.225 +2024-01-15 11:25:33,016 INFO [train.py:994] (1/2) Epoch 1, batch 700, loss[loss=0.7034, simple_loss=0.5857, pruned_loss=0.5304, over 24489.00 frames. ], tot_loss[loss=0.8021, simple_loss=0.6646, pruned_loss=0.685, over 4658701.71 frames. ], batch size: 165, lr: 4.49e-02, grad_scale: 8.0 +2024-01-15 11:25:37,151 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.567e+02 2.797e+02 3.404e+02 4.140e+02 8.487e+02, threshold=6.807e+02, percent-clipped=8.0 +2024-01-15 11:25:46,144 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=2366.6666666666665, ans=0.11125 +2024-01-15 11:25:48,185 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.self_attn1.whiten.whitening_limit, batch_count=2366.6666666666665, ans=9.275 +2024-01-15 11:25:55,507 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=384, metric=11.88 vs. limit=9.275 +2024-01-15 11:25:59,576 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=192, metric=12.90 vs. limit=8.3875 +2024-01-15 11:26:03,337 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=512, metric=11.15 vs. 
limit=9.3 +2024-01-15 11:26:12,354 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=22.50 vs. limit=8.4 +2024-01-15 11:26:18,606 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.max_abs, batch_count=2433.3333333333335, ans=6.520833333333334 +2024-01-15 11:26:25,164 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=3.23 vs. limit=3.365 +2024-01-15 11:26:25,250 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=20.40 vs. limit=8.4125 +2024-01-15 11:26:29,505 INFO [scaling.py:1022] (1/2) Whitening: name=encoder_embed.convnext.out_whiten, num_groups=1, num_channels=128, metric=5.20 vs. limit=5.0 +2024-01-15 11:26:38,254 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=7.29 vs. limit=5.616666666666666 +2024-01-15 11:26:41,009 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=7.93 vs. limit=5.616666666666666 +2024-01-15 11:26:43,468 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.attention_skip_rate, batch_count=2500.0, ans=0.10625 +2024-01-15 11:26:44,091 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.conv_module1.whiten, num_groups=1, num_channels=192, metric=8.22 vs. limit=8.4375 +2024-01-15 11:26:44,505 INFO [train.py:994] (1/2) Epoch 1, batch 750, loss[loss=0.6931, simple_loss=0.5872, pruned_loss=0.4935, over 24464.00 frames. ], tot_loss[loss=0.777, simple_loss=0.6449, pruned_loss=0.6459, over 4683763.27 frames. ], batch size: 258, lr: 4.49e-02, grad_scale: 8.0 +2024-01-15 11:26:46,673 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=384, metric=11.63 vs. limit=9.375 +2024-01-15 11:27:08,232 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.balancer.min_positive, batch_count=2533.3333333333335, ans=0.22466666666666665 +2024-01-15 11:27:08,596 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=14.75 vs. limit=8.45 +2024-01-15 11:27:08,652 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=9.34 vs. limit=8.45 +2024-01-15 11:27:14,222 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=6.23 vs. limit=5.641666666666667 +2024-01-15 11:27:19,930 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.whiten.whitening_limit, batch_count=2566.6666666666665, ans=5.026666666666666 +2024-01-15 11:27:33,709 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=6.11 vs. limit=5.65 +2024-01-15 11:27:40,094 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.whiten, num_groups=1, num_channels=512, metric=5.84 vs. 
limit=5.053333333333334 +2024-01-15 11:27:43,999 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=13.34 vs. limit=8.4875 +2024-01-15 11:27:47,165 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.whiten, num_groups=1, num_channels=384, metric=4.62 vs. limit=5.053333333333334 +2024-01-15 11:27:53,392 INFO [train.py:994] (1/2) Epoch 1, batch 800, loss[loss=0.5914, simple_loss=0.5074, pruned_loss=0.4032, over 24228.00 frames. ], tot_loss[loss=0.7461, simple_loss=0.6221, pruned_loss=0.6014, over 4715999.27 frames. ], batch size: 140, lr: 4.49e-02, grad_scale: 16.0 +2024-01-15 11:27:53,941 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.self_attn2.whiten.whitening_limit, batch_count=2666.6666666666665, ans=9.5 +2024-01-15 11:27:54,077 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=3.15 vs. limit=3.4 +2024-01-15 11:27:57,232 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.772e+02 2.597e+02 3.607e+02 4.454e+02 7.869e+02, threshold=7.214e+02, percent-clipped=4.0 +2024-01-15 11:28:03,217 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_skip_rate, batch_count=2666.6666666666665, ans=0.09999999999999999 +2024-01-15 11:28:04,401 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff3_skip_rate, batch_count=2666.6666666666665, ans=0.04 +2024-01-15 11:28:06,303 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=9.91 vs. limit=8.5125 +2024-01-15 11:28:11,306 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.whiten, num_groups=1, num_channels=512, metric=5.80 vs. limit=5.08 +2024-01-15 11:28:14,556 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass_mid.scale_min, batch_count=2700.0, ans=0.8055 +2024-01-15 11:28:17,718 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.conv_module2.whiten, num_groups=1, num_channels=192, metric=7.20 vs. limit=8.5125 +2024-01-15 11:28:20,839 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=2733.3333333333335, ans=0.27266666666666667 +2024-01-15 11:28:46,689 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=2800.0, ans=0.27199999999999996 +2024-01-15 11:29:12,821 INFO [train.py:994] (1/2) Epoch 2, batch 0, loss[loss=0.5991, simple_loss=0.5199, pruned_loss=0.3936, over 24502.00 frames. ], tot_loss[loss=0.5991, simple_loss=0.5199, pruned_loss=0.3936, over 24502.00 frames. ], batch size: 181, lr: 4.47e-02, grad_scale: 32.0 +2024-01-15 11:29:12,822 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 11:29:23,661 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.8283, 2.6250, 2.6490, 2.4580], device='cuda:1') +2024-01-15 11:29:32,745 INFO [train.py:1026] (1/2) Epoch 2, validation: loss=0.5319, simple_loss=0.4791, pruned_loss=0.3207, over 1622729.00 frames. 
+2024-01-15 11:29:32,746 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 11:29:36,316 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=256, metric=11.56 vs. limit=9.6075 +2024-01-15 11:29:57,509 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn2.whiten, num_groups=1, num_channels=192, metric=11.17 vs. limit=9.6325 +2024-01-15 11:30:00,851 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.balancer.max_positive, batch_count=2876.6666666666665, ans=0.7787666666666666 +2024-01-15 11:30:13,306 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass_mid.scale_min, batch_count=2910.0, ans=0.79815 +2024-01-15 11:30:23,572 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=9.08 vs. limit=8.59125 +2024-01-15 11:30:35,038 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=9.92 vs. limit=5.735833333333334 +2024-01-15 11:30:37,808 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=12.11 vs. limit=8.60375 +2024-01-15 11:30:42,324 INFO [train.py:994] (1/2) Epoch 2, batch 50, loss[loss=0.5733, simple_loss=0.5054, pruned_loss=0.3601, over 24491.00 frames. ], tot_loss[loss=0.576, simple_loss=0.5015, pruned_loss=0.3735, over 1081163.65 frames. ], batch size: 187, lr: 4.47e-02, grad_scale: 32.0 +2024-01-15 11:30:42,627 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=2976.6666666666665, ans=0.36046875 +2024-01-15 11:30:56,086 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.919e+02 2.574e+02 2.992e+02 3.662e+02 7.050e+02, threshold=5.985e+02, percent-clipped=0.0 +2024-01-15 11:30:58,202 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=3.34 vs. limit=3.4515000000000002 +2024-01-15 11:30:58,226 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=384, metric=6.06 vs. limit=8.62875 +2024-01-15 11:30:58,249 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=384, metric=10.58 vs. limit=9.7575 +2024-01-15 11:31:00,287 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer1.prob, batch_count=3010.0, ans=0.35890625 +2024-01-15 11:31:11,431 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=3043.3333333333335, ans=0.2695666666666667 +2024-01-15 11:31:19,076 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=384, metric=10.92 vs. limit=9.7825 +2024-01-15 11:31:25,648 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=256, metric=9.99 vs. 
limit=9.807500000000001 +2024-01-15 11:31:30,041 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass.skip_rate, batch_count=3076.6666666666665, ans=0.04949747468305833 +2024-01-15 11:31:30,497 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=512, metric=10.42 vs. limit=9.807500000000001 +2024-01-15 11:31:44,873 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=3110.0, ans=0.2689 +2024-01-15 11:31:46,333 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.out_combiner.scale_min, batch_count=3110.0, ans=0.79115 +2024-01-15 11:31:51,173 INFO [train.py:994] (1/2) Epoch 2, batch 100, loss[loss=0.5454, simple_loss=0.4869, pruned_loss=0.3304, over 24536.00 frames. ], tot_loss[loss=0.5648, simple_loss=0.4949, pruned_loss=0.3594, over 1908173.84 frames. ], batch size: 193, lr: 4.47e-02, grad_scale: 16.0 +2024-01-15 11:31:51,533 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=3143.3333333333335, ans=0.10708333333333331 +2024-01-15 11:31:54,126 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer2.min_abs, batch_count=3143.3333333333335, ans=0.24715 +2024-01-15 11:31:54,142 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.balancer2.prob, batch_count=3143.3333333333335, ans=0.35265625 +2024-01-15 11:31:57,456 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer2.min_positive, batch_count=3143.3333333333335, ans=0.08035416666666667 +2024-01-15 11:32:01,972 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=5.97 vs. limit=5.785833333333334 +2024-01-15 11:32:23,259 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=3210.0, ans=0.34953124999999996 +2024-01-15 11:32:28,865 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=3210.0, ans=0.26789999999999997 +2024-01-15 11:32:31,839 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=6.94 vs. limit=6.621666666666667 +2024-01-15 11:32:55,269 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff2_skip_rate, batch_count=3276.6666666666665, ans=0.026274999999999993 +2024-01-15 11:33:00,193 INFO [train.py:994] (1/2) Epoch 2, batch 150, loss[loss=0.4592, simple_loss=0.4152, pruned_loss=0.269, over 23511.00 frames. ], tot_loss[loss=0.5504, simple_loss=0.4859, pruned_loss=0.3429, over 2546682.43 frames. ], batch size: 119, lr: 4.47e-02, grad_scale: 16.0 +2024-01-15 11:33:12,076 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=256, metric=10.75 vs. limit=9.9825 +2024-01-15 11:33:13,723 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=5.82 vs. 
limit=6.671666666666667 +2024-01-15 11:33:15,466 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.094e+02 2.592e+02 3.146e+02 4.123e+02 9.007e+02, threshold=6.292e+02, percent-clipped=3.0 +2024-01-15 11:33:28,632 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=3.45 vs. limit=3.5065 +2024-01-15 11:33:35,842 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.ff2_skip_rate, batch_count=3376.6666666666665, ans=0.02402499999999999 +2024-01-15 11:33:37,687 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=3376.6666666666665, ans=0.2662333333333333 +2024-01-15 11:33:55,319 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer1.max_abs, batch_count=3443.3333333333335, ans=7.152083333333334 +2024-01-15 11:34:01,111 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=5.84 vs. limit=5.860833333333334 +2024-01-15 11:34:08,857 INFO [train.py:994] (1/2) Epoch 2, batch 200, loss[loss=0.4765, simple_loss=0.4344, pruned_loss=0.2728, over 24358.00 frames. ], tot_loss[loss=0.5388, simple_loss=0.4793, pruned_loss=0.329, over 3050117.51 frames. ], batch size: 153, lr: 4.47e-02, grad_scale: 16.0 +2024-01-15 11:34:16,106 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.dropout.p, batch_count=3476.6666666666665, ans=0.2652333333333333 +2024-01-15 11:34:16,884 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=256, metric=9.87 vs. limit=10.1075 +2024-01-15 11:34:26,960 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=9.69 vs. limit=8.81625 +2024-01-15 11:34:29,454 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=5.78 vs. limit=5.8774999999999995 +2024-01-15 11:34:58,939 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer2.prob, batch_count=3576.6666666666665, ans=0.33234375 +2024-01-15 11:35:03,742 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=7.17 vs. limit=6.805 +2024-01-15 11:35:08,111 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module1.balancer1.prob, batch_count=3610.0, ans=0.33078125 +2024-01-15 11:35:16,676 INFO [train.py:994] (1/2) Epoch 2, batch 250, loss[loss=0.4972, simple_loss=0.4541, pruned_loss=0.2824, over 24458.00 frames. ], tot_loss[loss=0.523, simple_loss=0.4691, pruned_loss=0.3128, over 3436070.42 frames. 
], batch size: 216, lr: 4.47e-02, grad_scale: 16.0 +2024-01-15 11:35:30,730 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=3676.6666666666665, ans=0.32765625 +2024-01-15 11:35:31,784 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.028e+02 2.695e+02 3.175e+02 3.911e+02 8.566e+02, threshold=6.350e+02, percent-clipped=6.0 +2024-01-15 11:35:36,027 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_skip_rate, batch_count=3676.6666666666665, ans=0.062124999999999986 +2024-01-15 11:35:57,943 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer1.prob, batch_count=3743.3333333333335, ans=0.32453125 +2024-01-15 11:36:03,999 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=192, metric=9.30 vs. limit=8.90375 +2024-01-15 11:36:08,390 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=6.15 vs. limit=6.871666666666667 +2024-01-15 11:36:11,003 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=9.07 vs. limit=8.91625 +2024-01-15 11:36:11,973 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.prob, batch_count=3776.6666666666665, ans=0.32296875 +2024-01-15 11:36:20,869 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass_mid.scale_min, batch_count=3776.6666666666665, ans=0.7678166666666667 +2024-01-15 11:36:25,180 INFO [train.py:994] (1/2) Epoch 2, batch 300, loss[loss=0.4665, simple_loss=0.4343, pruned_loss=0.2544, over 24365.00 frames. ], tot_loss[loss=0.5062, simple_loss=0.4579, pruned_loss=0.2967, over 3730815.97 frames. ], batch size: 153, lr: 4.46e-02, grad_scale: 16.0 +2024-01-15 11:36:25,486 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer1.prob, batch_count=3810.0, ans=0.32140625 +2024-01-15 11:36:29,751 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.whiten, num_groups=1, num_channels=384, metric=4.61 vs. limit=5.524 +2024-01-15 11:36:42,001 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=384, metric=10.71 vs. limit=10.3825 +2024-01-15 11:36:44,146 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.2.self_attn_weights, loss-sum=2.480e+01 +2024-01-15 11:37:04,080 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=3910.0, ans=0.05337499999999998 +2024-01-15 11:37:16,527 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=3910.0, ans=0.2609 +2024-01-15 11:37:24,823 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=3943.3333333333335, ans=0.26056666666666667 +2024-01-15 11:37:26,835 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=192, metric=6.73 vs. 
limit=6.971666666666667 +2024-01-15 11:37:32,388 INFO [train.py:994] (1/2) Epoch 2, batch 350, loss[loss=0.4474, simple_loss=0.4228, pruned_loss=0.2367, over 24310.00 frames. ], tot_loss[loss=0.4913, simple_loss=0.4483, pruned_loss=0.2821, over 3969538.03 frames. ], batch size: 285, lr: 4.46e-02, grad_scale: 16.0 +2024-01-15 11:37:34,739 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer2.min_positive, batch_count=3976.6666666666665, ans=0.07514583333333334 +2024-01-15 11:37:47,293 INFO [scaling.py:1022] (1/2) Whitening: name=encoder_embed.out_whiten, num_groups=1, num_channels=192, metric=4.80 vs. limit=4.802 +2024-01-15 11:37:47,494 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.985e+02 2.596e+02 3.097e+02 3.679e+02 8.153e+02, threshold=6.193e+02, percent-clipped=3.0 +2024-01-15 11:37:53,675 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer2.prob, batch_count=4010.0, ans=0.31203125 +2024-01-15 11:38:00,460 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=8.26 vs. limit=9.01625 +2024-01-15 11:38:05,547 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=6.04 vs. limit=6.010833333333333 +2024-01-15 11:38:31,177 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=4110.0, ans=0.2589 +2024-01-15 11:38:41,081 INFO [train.py:994] (1/2) Epoch 2, batch 400, loss[loss=0.4174, simple_loss=0.3999, pruned_loss=0.2149, over 24322.00 frames. ], tot_loss[loss=0.4771, simple_loss=0.4393, pruned_loss=0.2686, over 4145299.35 frames. ], batch size: 153, lr: 4.46e-02, grad_scale: 32.0 +2024-01-15 11:38:41,477 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=4143.333333333333, ans=0.0 +2024-01-15 11:38:52,465 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass_mid.scale_min, batch_count=4143.333333333333, ans=0.7549833333333333 +2024-01-15 11:39:18,575 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=4210.0, ans=0.30265624999999996 +2024-01-15 11:39:30,666 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=4243.333333333333, ans=0.25756666666666667 +2024-01-15 11:39:47,273 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=512, metric=12.73 vs. limit=10.7075 +2024-01-15 11:39:48,305 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.2.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 11:39:49,297 INFO [train.py:994] (1/2) Epoch 2, batch 450, loss[loss=0.4075, simple_loss=0.3905, pruned_loss=0.2101, over 24371.00 frames. ], tot_loss[loss=0.4648, simple_loss=0.4317, pruned_loss=0.2568, over 4281912.62 frames. 
], batch size: 153, lr: 4.46e-02, grad_scale: 16.0 +2024-01-15 11:39:59,363 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward2.hidden_balancer.prob, batch_count=4310.0, ans=0.29796875 +2024-01-15 11:39:59,365 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer2.prob, batch_count=4310.0, ans=0.29796875 +2024-01-15 11:40:05,378 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.007e+02 2.923e+02 3.521e+02 4.109e+02 6.335e+02, threshold=7.042e+02, percent-clipped=2.0 +2024-01-15 11:40:08,517 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.self_attn2.whiten, num_groups=1, num_channels=512, metric=11.19 vs. limit=10.7575 +2024-01-15 11:40:20,980 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=4376.666666666667, ans=0.29484374999999996 +2024-01-15 11:40:29,970 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=4410.0, ans=0.04829166666666667 +2024-01-15 11:40:39,935 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=4410.0, ans=0.2559 +2024-01-15 11:40:41,483 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=384, metric=11.24 vs. limit=10.807500000000001 +2024-01-15 11:40:47,325 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer2.prob, batch_count=4443.333333333333, ans=0.29171875 +2024-01-15 11:40:56,886 INFO [train.py:994] (1/2) Epoch 2, batch 500, loss[loss=0.4062, simple_loss=0.3967, pruned_loss=0.2026, over 24501.00 frames. ], tot_loss[loss=0.4537, simple_loss=0.4252, pruned_loss=0.2461, over 4402783.62 frames. ], batch size: 187, lr: 4.45e-02, grad_scale: 16.0 +2024-01-15 11:41:29,971 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward3.out_whiten.whitening_limit, batch_count=4543.333333333333, ans=9.20375 +2024-01-15 11:41:38,620 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=4576.666666666667, ans=0.2542333333333333 +2024-01-15 11:41:44,285 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward2.hidden_balancer.prob, batch_count=4576.666666666667, ans=0.28546875 +2024-01-15 11:41:57,203 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=8.60 vs. limit=9.22875 +2024-01-15 11:42:03,700 INFO [train.py:994] (1/2) Epoch 2, batch 550, loss[loss=0.3863, simple_loss=0.383, pruned_loss=0.1881, over 24456.00 frames. ], tot_loss[loss=0.4415, simple_loss=0.4175, pruned_loss=0.2354, over 4486230.95 frames. 
], batch size: 181, lr: 4.45e-02, grad_scale: 16.0 +2024-01-15 11:42:19,794 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.137e+02 2.755e+02 3.155e+02 3.951e+02 7.770e+02, threshold=6.310e+02, percent-clipped=2.0 +2024-01-15 11:42:27,653 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.balancer1.prob, batch_count=4676.666666666667, ans=0.28078125 +2024-01-15 11:42:36,957 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=512, metric=3.73 vs. limit=9.26625 +2024-01-15 11:42:41,496 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=4710.0, ans=0.0 +2024-01-15 11:42:45,204 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=4743.333333333333, ans=0.27765625 +2024-01-15 11:42:50,533 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=11.52 vs. limit=11.057500000000001 +2024-01-15 11:43:02,249 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=6.27 vs. limit=6.194166666666667 +2024-01-15 11:43:09,453 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=4810.0, ans=0.27453125 +2024-01-15 11:43:09,929 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=512, metric=13.26 vs. limit=11.1075 +2024-01-15 11:43:10,443 INFO [train.py:994] (1/2) Epoch 2, batch 600, loss[loss=0.3899, simple_loss=0.392, pruned_loss=0.186, over 24459.00 frames. ], tot_loss[loss=0.4314, simple_loss=0.4119, pruned_loss=0.2262, over 4563676.25 frames. ], batch size: 170, lr: 4.45e-02, grad_scale: 16.0 +2024-01-15 11:43:29,824 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=4843.333333333333, ans=0.27296875 +2024-01-15 11:43:37,321 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.skip_rate, batch_count=4876.666666666667, ans=0.07 +2024-01-15 11:43:38,711 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 11:44:07,949 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff3_skip_rate, batch_count=4943.333333333333, ans=0.009794927536231884 +2024-01-15 11:44:14,470 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.balancer1.prob, batch_count=4943.333333333333, ans=0.26828125 +2024-01-15 11:44:16,693 INFO [train.py:994] (1/2) Epoch 2, batch 650, loss[loss=0.3864, simple_loss=0.3863, pruned_loss=0.187, over 24297.00 frames. ], tot_loss[loss=0.4203, simple_loss=0.405, pruned_loss=0.2169, over 4608686.36 frames. 
], batch size: 285, lr: 4.45e-02, grad_scale: 16.0 +2024-01-15 11:44:33,078 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.957e+02 2.608e+02 3.015e+02 3.605e+02 6.995e+02, threshold=6.031e+02, percent-clipped=1.0 +2024-01-15 11:44:43,413 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=5043.333333333333, ans=0.24956666666666666 +2024-01-15 11:44:43,851 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=384, metric=11.08 vs. limit=11.2825 +2024-01-15 11:44:53,485 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.min_positive, batch_count=5043.333333333333, ans=0.06847916666666667 +2024-01-15 11:44:53,773 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=384, metric=4.28 vs. limit=9.39125 +2024-01-15 11:44:56,240 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=9.26 vs. limit=9.40375 +2024-01-15 11:44:57,637 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=6.50 vs. limit=6.269166666666667 +2024-01-15 11:45:03,442 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.balancer1.prob, batch_count=5076.666666666667, ans=0.26203125 +2024-01-15 11:45:03,451 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff3_skip_rate, batch_count=5076.666666666667, ans=0.009765942028985508 +2024-01-15 11:45:12,336 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.balancer1.prob, batch_count=5110.0, ans=0.26046875 +2024-01-15 11:45:23,476 INFO [train.py:994] (1/2) Epoch 2, batch 700, loss[loss=0.3271, simple_loss=0.3409, pruned_loss=0.1483, over 24246.00 frames. ], tot_loss[loss=0.4102, simple_loss=0.399, pruned_loss=0.2086, over 4651658.37 frames. ], batch size: 140, lr: 4.44e-02, grad_scale: 16.0 +2024-01-15 11:45:25,381 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=384, metric=12.24 vs. limit=11.3575 +2024-01-15 11:45:31,586 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.skip_rate, batch_count=5143.333333333333, ans=0.09899494936611666 +2024-01-15 11:45:40,656 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=12.01 vs. limit=11.3825 +2024-01-15 11:45:53,315 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=11.17 vs. limit=9.45375 +2024-01-15 11:45:54,276 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer1.prob, batch_count=5210.0, ans=0.25578124999999996 +2024-01-15 11:45:55,895 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=5.98 vs. limit=7.605 +2024-01-15 11:46:12,848 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=11.04 vs. 
limit=11.432500000000001 +2024-01-15 11:46:28,231 INFO [train.py:994] (1/2) Epoch 2, batch 750, loss[loss=0.3932, simple_loss=0.4006, pruned_loss=0.1862, over 24335.00 frames. ], tot_loss[loss=0.4019, simple_loss=0.3944, pruned_loss=0.2015, over 4692294.10 frames. ], batch size: 285, lr: 4.44e-02, grad_scale: 16.0 +2024-01-15 11:46:44,352 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.125e+02 2.817e+02 3.394e+02 3.988e+02 1.067e+03, threshold=6.787e+02, percent-clipped=3.0 +2024-01-15 11:46:45,840 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=5343.333333333333, ans=0.24953124999999998 +2024-01-15 11:46:51,155 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=9.00 vs. limit=9.50375 +2024-01-15 11:47:08,012 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=9.50 vs. limit=9.52875 +2024-01-15 11:47:11,479 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=7.13 vs. limit=9.52875 +2024-01-15 11:47:26,845 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=5443.333333333333, ans=0.24556666666666666 +2024-01-15 11:47:31,254 INFO [train.py:994] (1/2) Epoch 2, batch 800, loss[loss=0.3721, simple_loss=0.3829, pruned_loss=0.1745, over 24536.00 frames. ], tot_loss[loss=0.3936, simple_loss=0.3898, pruned_loss=0.1949, over 4717493.46 frames. ], batch size: 193, lr: 4.44e-02, grad_scale: 32.0 +2024-01-15 11:47:38,656 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=5476.666666666667, ans=0.24523333333333333 +2024-01-15 11:47:55,119 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=5543.333333333333, ans=0.24015625000000002 +2024-01-15 11:47:59,110 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=6.25 vs. limit=7.7716666666666665 +2024-01-15 11:48:18,026 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=5576.666666666667, ans=0.23859375 +2024-01-15 11:48:42,458 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer2.prob, batch_count=5620.0, ans=0.2365625 +2024-01-15 11:48:42,922 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=512, metric=11.94 vs. limit=11.715 +2024-01-15 11:48:47,766 INFO [train.py:994] (1/2) Epoch 3, batch 0, loss[loss=0.3265, simple_loss=0.3341, pruned_loss=0.1551, over 23555.00 frames. ], tot_loss[loss=0.3265, simple_loss=0.3341, pruned_loss=0.1551, over 23555.00 frames. 
], batch size: 119, lr: 4.40e-02, grad_scale: 32.0 +2024-01-15 11:48:47,766 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 11:49:07,310 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.0.layers.1.self_attn_weights, attn_weights_entropy = tensor([4.9940, 4.6928, 4.7321, 4.4635], device='cuda:1') +2024-01-15 11:49:08,291 INFO [train.py:1026] (1/2) Epoch 3, validation: loss=0.2688, simple_loss=0.3253, pruned_loss=0.09388, over 1622729.00 frames. +2024-01-15 11:49:08,291 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 11:49:09,841 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer1.prob, batch_count=5620.0, ans=0.2365625 +2024-01-15 11:49:13,414 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_skip_rate, batch_count=5620.0, ans=0.043250000000000004 +2024-01-15 11:49:26,899 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer1.prob, batch_count=5653.333333333333, ans=0.235 +2024-01-15 11:49:31,493 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=5653.333333333333, ans=0.24346666666666666 +2024-01-15 11:49:32,344 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.086e+02 2.950e+02 3.385e+02 4.084e+02 7.457e+02, threshold=6.770e+02, percent-clipped=2.0 +2024-01-15 11:49:40,990 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=512, metric=15.72 vs. limit=11.765 +2024-01-15 11:50:09,223 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer2.prob, batch_count=5753.333333333333, ans=0.23031249999999998 +2024-01-15 11:50:12,656 INFO [train.py:994] (1/2) Epoch 3, batch 50, loss[loss=0.3676, simple_loss=0.3835, pruned_loss=0.1709, over 23858.00 frames. ], tot_loss[loss=0.3488, simple_loss=0.3639, pruned_loss=0.1616, over 1094397.07 frames. ], batch size: 328, lr: 4.40e-02, grad_scale: 32.0 +2024-01-15 11:50:30,540 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer2.prob, batch_count=5820.0, ans=0.2271875 +2024-01-15 11:50:30,615 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.attention_skip_rate, batch_count=5820.0, ans=0.04241666666666667 +2024-01-15 11:50:30,947 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=13.76 vs. limit=9.682500000000001 +2024-01-15 11:50:32,983 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer1.prob, batch_count=5820.0, ans=0.2271875 +2024-01-15 11:50:33,057 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.skip_rate, batch_count=5820.0, ans=0.04949747468305833 +2024-01-15 11:51:00,352 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.skip_rate, batch_count=5886.666666666667, ans=0.07 +2024-01-15 11:51:17,781 INFO [train.py:994] (1/2) Epoch 3, batch 100, loss[loss=0.3426, simple_loss=0.3344, pruned_loss=0.1744, over 19257.00 frames. ], tot_loss[loss=0.3503, simple_loss=0.3658, pruned_loss=0.1627, over 1910982.12 frames. 
], batch size: 82, lr: 4.40e-02, grad_scale: 32.0 +2024-01-15 11:51:29,944 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer2.min_abs, batch_count=5986.666666666667, ans=0.2898 +2024-01-15 11:51:32,307 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer2.prob, batch_count=5986.666666666667, ans=0.219375 +2024-01-15 11:51:42,958 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.145e+02 2.875e+02 3.360e+02 3.960e+02 7.959e+02, threshold=6.719e+02, percent-clipped=3.0 +2024-01-15 11:51:44,582 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer2.prob, batch_count=6020.0, ans=0.21781250000000002 +2024-01-15 11:51:48,319 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer1.prob, batch_count=6020.0, ans=0.21781250000000002 +2024-01-15 11:51:53,064 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=384, metric=11.67 vs. limit=12.015 +2024-01-15 11:51:56,906 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.83 vs. limit=3.908 +2024-01-15 11:52:07,149 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=6.21 vs. limit=8.026666666666667 +2024-01-15 11:52:10,526 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=6086.666666666667, ans=0.041305555555555554 +2024-01-15 11:52:23,038 INFO [train.py:994] (1/2) Epoch 3, batch 150, loss[loss=0.3259, simple_loss=0.347, pruned_loss=0.1492, over 24125.00 frames. ], tot_loss[loss=0.348, simple_loss=0.3651, pruned_loss=0.1612, over 2555804.17 frames. ], batch size: 140, lr: 4.39e-02, grad_scale: 32.0 +2024-01-15 11:52:24,914 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=13.09 vs. limit=12.09 +2024-01-15 11:52:31,437 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=6120.0, ans=0.23879999999999998 +2024-01-15 11:53:05,627 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=12.55 vs. limit=12.165 +2024-01-15 11:53:06,555 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=6220.0, ans=0.2084375 +2024-01-15 11:53:07,192 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=6.40 vs. limit=6.555 +2024-01-15 11:53:18,510 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.skip_rate, batch_count=6253.333333333333, ans=0.09899494936611666 +2024-01-15 11:53:21,891 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=512, metric=14.17 vs. limit=12.190000000000001 +2024-01-15 11:53:27,234 INFO [train.py:994] (1/2) Epoch 3, batch 200, loss[loss=0.3479, simple_loss=0.3695, pruned_loss=0.1609, over 24453.00 frames. ], tot_loss[loss=0.3443, simple_loss=0.3635, pruned_loss=0.1588, over 3053783.75 frames. 
], batch size: 170, lr: 4.39e-02, grad_scale: 32.0 +2024-01-15 11:53:29,989 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer2.prob, batch_count=6286.666666666667, ans=0.2053125 +2024-01-15 11:53:51,511 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.145e+02 2.732e+02 3.081e+02 3.484e+02 6.309e+02, threshold=6.162e+02, percent-clipped=0.0 +2024-01-15 11:53:54,750 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=9.69 vs. limit=9.8825 +2024-01-15 11:53:54,831 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=10.68 vs. limit=8.176666666666666 +2024-01-15 11:54:02,932 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=6353.333333333333, ans=0.23646666666666666 +2024-01-15 11:54:05,356 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass_mid.scale_min, batch_count=6386.666666666667, ans=0.6764666666666667 +2024-01-15 11:54:26,757 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=13.16 vs. limit=12.315000000000001 +2024-01-15 11:54:31,588 INFO [train.py:994] (1/2) Epoch 3, batch 250, loss[loss=0.3187, simple_loss=0.3506, pruned_loss=0.1419, over 24533.00 frames. ], tot_loss[loss=0.3405, simple_loss=0.3614, pruned_loss=0.1567, over 3438457.91 frames. ], batch size: 236, lr: 4.38e-02, grad_scale: 32.0 +2024-01-15 11:54:56,123 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.ff3_skip_rate, batch_count=6520.0, ans=0.009452173913043479 +2024-01-15 11:54:56,184 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.out_combiner.scale_min, batch_count=6520.0, ans=0.6718000000000001 +2024-01-15 11:55:20,830 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=4.81 vs. limit=8.276666666666667 +2024-01-15 11:55:34,528 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=6620.0, ans=0.2338 +2024-01-15 11:55:35,394 INFO [train.py:994] (1/2) Epoch 3, batch 300, loss[loss=0.3283, simple_loss=0.3537, pruned_loss=0.1511, over 24540.00 frames. ], tot_loss[loss=0.3366, simple_loss=0.3597, pruned_loss=0.1543, over 3747597.63 frames. ], batch size: 165, lr: 4.38e-02, grad_scale: 32.0 +2024-01-15 11:55:45,191 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=6620.0, ans=0.03908333333333334 +2024-01-15 11:55:50,550 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=384, metric=6.71 vs. limit=9.995000000000001 +2024-01-15 11:55:57,188 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.nonlin_attention.balancer.min_positive, batch_count=6653.333333333333, ans=0.18346666666666667 +2024-01-15 11:56:00,119 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=384, metric=7.29 vs. 
limit=9.995000000000001 +2024-01-15 11:56:00,475 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.241e+02 2.796e+02 3.225e+02 3.857e+02 7.668e+02, threshold=6.450e+02, percent-clipped=1.0 +2024-01-15 11:56:09,495 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.86 vs. limit=4.003 +2024-01-15 11:56:23,996 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.layerdrop_rate, batch_count=6720.0, ans=0.13784000000000002 +2024-01-15 11:56:24,121 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer1.max_abs, batch_count=6720.0, ans=9.2 +2024-01-15 11:56:30,267 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=7.55 vs. limit=8.376666666666667 +2024-01-15 11:56:34,882 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=8.08 vs. limit=10.0325 +2024-01-15 11:56:40,158 INFO [train.py:994] (1/2) Epoch 3, batch 350, loss[loss=0.3026, simple_loss=0.3359, pruned_loss=0.1346, over 24558.00 frames. ], tot_loss[loss=0.3342, simple_loss=0.3592, pruned_loss=0.1528, over 3977445.69 frames. ], batch size: 176, lr: 4.38e-02, grad_scale: 32.0 +2024-01-15 11:56:49,563 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=6786.666666666667, ans=0.181875 +2024-01-15 11:57:31,512 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.prob, batch_count=6920.0, ans=0.17562499999999998 +2024-01-15 11:57:42,580 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.out_combiner.scale_min, batch_count=6953.333333333333, ans=0.6566333333333334 +2024-01-15 11:57:44,166 INFO [train.py:994] (1/2) Epoch 3, batch 400, loss[loss=0.297, simple_loss=0.3331, pruned_loss=0.1304, over 24601.00 frames. ], tot_loss[loss=0.3322, simple_loss=0.3585, pruned_loss=0.1516, over 4164412.25 frames. ], batch size: 199, lr: 4.37e-02, grad_scale: 32.0 +2024-01-15 11:58:07,837 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.317e+02 2.802e+02 3.170e+02 3.769e+02 6.505e+02, threshold=6.340e+02, percent-clipped=1.0 +2024-01-15 11:58:12,862 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.min_positive, batch_count=7020.0, ans=0.05612500000000001 +2024-01-15 11:58:22,680 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=7053.333333333333, ans=0.037277777777777785 +2024-01-15 11:58:47,420 INFO [train.py:994] (1/2) Epoch 3, batch 450, loss[loss=0.3277, simple_loss=0.3608, pruned_loss=0.1473, over 24383.00 frames. ], tot_loss[loss=0.3284, simple_loss=0.3567, pruned_loss=0.149, over 4307116.88 frames. 
], batch size: 275, lr: 4.37e-02, grad_scale: 32.0 +2024-01-15 11:58:47,584 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.hidden_balancer.prob, batch_count=7120.0, ans=0.16625 +2024-01-15 11:59:06,007 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.attention_skip_rate, batch_count=7153.333333333333, ans=0.036861111111111115 +2024-01-15 11:59:24,970 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=7220.0, ans=0.2278 +2024-01-15 11:59:34,743 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module2.balancer1.prob, batch_count=7220.0, ans=0.1615625 +2024-01-15 11:59:50,142 INFO [train.py:994] (1/2) Epoch 3, batch 500, loss[loss=0.3148, simple_loss=0.348, pruned_loss=0.1408, over 24455.00 frames. ], tot_loss[loss=0.3256, simple_loss=0.3555, pruned_loss=0.1471, over 4426050.99 frames. ], batch size: 222, lr: 4.37e-02, grad_scale: 32.0 +2024-01-15 12:00:14,612 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.102e+02 2.884e+02 3.254e+02 3.878e+02 6.671e+02, threshold=6.507e+02, percent-clipped=1.0 +2024-01-15 12:00:55,034 INFO [train.py:994] (1/2) Epoch 3, batch 550, loss[loss=0.3109, simple_loss=0.3517, pruned_loss=0.1351, over 24598.00 frames. ], tot_loss[loss=0.322, simple_loss=0.3531, pruned_loss=0.1448, over 4500454.74 frames. ], batch size: 199, lr: 4.36e-02, grad_scale: 32.0 +2024-01-15 12:01:13,814 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.8.prob, batch_count=7486.666666666667, ans=0.1490625 +2024-01-15 12:01:16,315 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer1.min_positive, batch_count=7486.666666666667, ans=0.02660416666666667 +2024-01-15 12:01:22,445 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.hidden_balancer.prob, batch_count=7520.0, ans=0.14750000000000002 +2024-01-15 12:01:32,249 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer1.prob, batch_count=7553.333333333333, ans=0.1459375 +2024-01-15 12:01:50,309 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer2.prob, batch_count=7586.666666666667, ans=0.14437499999999998 +2024-01-15 12:01:57,159 INFO [train.py:994] (1/2) Epoch 3, batch 600, loss[loss=0.3221, simple_loss=0.3601, pruned_loss=0.142, over 24525.00 frames. ], tot_loss[loss=0.3204, simple_loss=0.3523, pruned_loss=0.1438, over 4559441.48 frames. ], batch size: 165, lr: 4.36e-02, grad_scale: 32.0 +2024-01-15 12:02:01,015 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer1.prob, batch_count=7620.0, ans=0.1428125 +2024-01-15 12:02:07,020 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=384, metric=12.87 vs. 
limit=13.215 +2024-01-15 12:02:12,707 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=7653.333333333333, ans=0.14125 +2024-01-15 12:02:20,714 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.115e+02 2.650e+02 3.022e+02 3.630e+02 6.228e+02, threshold=6.043e+02, percent-clipped=0.0 +2024-01-15 12:02:25,981 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=384, metric=3.11 vs. limit=10.3825 +2024-01-15 12:02:45,111 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=6.59 vs. limit=6.93 +2024-01-15 12:02:50,245 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.ff2_skip_rate, batch_count=7753.333333333333, ans=0.009184057971014493 +2024-01-15 12:02:59,060 INFO [train.py:994] (1/2) Epoch 3, batch 650, loss[loss=0.2903, simple_loss=0.3272, pruned_loss=0.1266, over 24155.00 frames. ], tot_loss[loss=0.3175, simple_loss=0.3507, pruned_loss=0.1417, over 4611575.80 frames. ], batch size: 140, lr: 4.35e-02, grad_scale: 32.0 +2024-01-15 12:03:05,364 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff3_skip_rate, batch_count=7786.666666666667, ans=0.009176811594202899 +2024-01-15 12:03:16,317 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer2.prob, batch_count=7820.0, ans=0.1334375 +2024-01-15 12:03:20,008 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=7820.0, ans=0.1334375 +2024-01-15 12:03:21,099 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass_mid.scale_min, batch_count=7820.0, ans=0.6263000000000001 +2024-01-15 12:03:31,011 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module2.balancer1.min_positive, batch_count=7853.333333333333, ans=0.025458333333333336 +2024-01-15 12:03:36,092 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=10.69 vs. limit=10.4575 +2024-01-15 12:03:48,209 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer2.prob, batch_count=7920.0, ans=0.12874999999999998 +2024-01-15 12:04:00,161 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.balancer.max_positive, batch_count=7953.333333333333, ans=0.8295333333333333 +2024-01-15 12:04:01,227 INFO [train.py:994] (1/2) Epoch 3, batch 700, loss[loss=0.3145, simple_loss=0.3475, pruned_loss=0.1407, over 24392.00 frames. ], tot_loss[loss=0.3153, simple_loss=0.3497, pruned_loss=0.1402, over 4661047.02 frames. 
], batch size: 159, lr: 4.35e-02, grad_scale: 32.0 +2024-01-15 12:04:23,932 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=7986.666666666667, ans=0.125625 +2024-01-15 12:04:25,976 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.212e+02 2.889e+02 3.295e+02 3.888e+02 5.755e+02, threshold=6.589e+02, percent-clipped=0.0 +2024-01-15 12:04:26,306 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=8020.0, ans=0.2198 +2024-01-15 12:04:31,028 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=8020.0, ans=0.125 +2024-01-15 12:04:35,737 INFO [scaling.py:1022] (1/2) Whitening: name=encoder_embed.convnext.out_whiten, num_groups=1, num_channels=128, metric=4.81 vs. limit=5.0 +2024-01-15 12:04:51,497 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=8086.666666666667, ans=0.21913333333333335 +2024-01-15 12:05:01,909 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=384, metric=4.45 vs. limit=10.5325 +2024-01-15 12:05:05,432 INFO [train.py:994] (1/2) Epoch 3, batch 750, loss[loss=0.2896, simple_loss=0.3018, pruned_loss=0.1387, over 18773.00 frames. ], tot_loss[loss=0.3126, simple_loss=0.3483, pruned_loss=0.1382, over 4697581.87 frames. ], batch size: 81, lr: 4.35e-02, grad_scale: 32.0 +2024-01-15 12:05:06,133 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=512, metric=14.38 vs. limit=13.59 +2024-01-15 12:05:17,769 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.attention_skip_rate, batch_count=8153.333333333333, ans=0.03269444444444445 +2024-01-15 12:05:38,585 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=9.57 vs. limit=10.57 +2024-01-15 12:05:41,923 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=384, metric=13.23 vs. limit=13.665 +2024-01-15 12:05:43,056 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=10.55 vs. limit=10.5825 +2024-01-15 12:05:57,550 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_skip_rate, batch_count=8253.333333333334, ans=0.03227777777777778 +2024-01-15 12:06:05,757 INFO [train.py:994] (1/2) Epoch 3, batch 800, loss[loss=0.3042, simple_loss=0.3499, pruned_loss=0.1292, over 24529.00 frames. ], tot_loss[loss=0.3096, simple_loss=0.3458, pruned_loss=0.1366, over 4695407.08 frames. ], batch size: 236, lr: 4.34e-02, grad_scale: 32.0 +2024-01-15 12:06:05,977 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer1.prob, batch_count=8286.666666666666, ans=0.125 +2024-01-15 12:06:22,671 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=384, metric=13.74 vs. 
limit=13.74 +2024-01-15 12:06:27,764 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.950e+02 2.927e+02 3.165e+02 3.748e+02 6.702e+02, threshold=6.330e+02, percent-clipped=0.0 +2024-01-15 12:06:30,343 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=8353.333333333334, ans=0.125 +2024-01-15 12:06:50,954 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=3.80 vs. limit=10.6575 +2024-01-15 12:07:17,343 INFO [train.py:994] (1/2) Epoch 4, batch 0, loss[loss=0.3246, simple_loss=0.3663, pruned_loss=0.1414, over 24469.00 frames. ], tot_loss[loss=0.3246, simple_loss=0.3663, pruned_loss=0.1414, over 24469.00 frames. ], batch size: 181, lr: 4.29e-02, grad_scale: 32.0 +2024-01-15 12:07:17,343 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 12:07:37,234 INFO [train.py:1026] (1/2) Epoch 4, validation: loss=0.222, simple_loss=0.3027, pruned_loss=0.07067, over 1622729.00 frames. +2024-01-15 12:07:37,236 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 12:08:06,643 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=8496.666666666666, ans=0.125 +2024-01-15 12:08:07,764 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff3_skip_rate, batch_count=8496.666666666666, ans=0.009022463768115942 +2024-01-15 12:08:14,237 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer1.prob, batch_count=8530.0, ans=0.125 +2024-01-15 12:08:20,540 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.whiten1.whitening_limit, batch_count=8530.0, ans=7.1325 +2024-01-15 12:08:30,172 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.out_combiner.scale_min, batch_count=8563.333333333334, ans=0.6002833333333334 +2024-01-15 12:08:39,853 INFO [train.py:994] (1/2) Epoch 4, batch 50, loss[loss=0.2717, simple_loss=0.3156, pruned_loss=0.1139, over 24170.00 frames. ], tot_loss[loss=0.296, simple_loss=0.3361, pruned_loss=0.128, over 1086759.85 frames. ], batch size: 140, lr: 4.28e-02, grad_scale: 32.0 +2024-01-15 12:08:53,811 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.skip_rate, batch_count=8630.0, ans=0.09899494936611666 +2024-01-15 12:09:05,247 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=8663.333333333334, ans=0.21336666666666665 +2024-01-15 12:09:12,571 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.074e+02 2.645e+02 2.860e+02 3.274e+02 6.285e+02, threshold=5.719e+02, percent-clipped=1.0 +2024-01-15 12:09:12,878 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff3_skip_rate, batch_count=8663.333333333334, ans=0.008986231884057972 +2024-01-15 12:09:43,141 INFO [train.py:994] (1/2) Epoch 4, batch 100, loss[loss=0.2861, simple_loss=0.3304, pruned_loss=0.1209, over 24475.00 frames. ], tot_loss[loss=0.2947, simple_loss=0.3361, pruned_loss=0.1266, over 1918858.31 frames. 
], batch size: 250, lr: 4.28e-02, grad_scale: 32.0 +2024-01-15 12:09:44,542 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer1.prob, batch_count=8763.333333333334, ans=0.125 +2024-01-15 12:09:45,770 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass_mid.scale_min, batch_count=8763.333333333334, ans=0.5932833333333334 +2024-01-15 12:09:49,530 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.scale_min, batch_count=8763.333333333334, ans=0.5932833333333334 +2024-01-15 12:09:53,089 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=8763.333333333334, ans=0.0 +2024-01-15 12:09:57,397 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer1.prob, batch_count=8796.666666666666, ans=0.125 +2024-01-15 12:10:02,200 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff3_skip_rate, batch_count=8796.666666666666, ans=0.008957246376811594 +2024-01-15 12:10:41,885 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=8896.666666666666, ans=0.125 +2024-01-15 12:10:46,966 INFO [train.py:994] (1/2) Epoch 4, batch 150, loss[loss=0.28, simple_loss=0.3249, pruned_loss=0.1176, over 24411.00 frames. ], tot_loss[loss=0.295, simple_loss=0.3365, pruned_loss=0.1267, over 2549052.41 frames. ], batch size: 258, lr: 4.27e-02, grad_scale: 32.0 +2024-01-15 12:10:52,098 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=8930.0, ans=0.0 +2024-01-15 12:11:02,705 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=384, metric=14.59 vs. limit=14.2225 +2024-01-15 12:11:03,470 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.out_combiner.scale_min, batch_count=8963.333333333334, ans=0.5862833333333334 +2024-01-15 12:11:13,770 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward3.hidden_balancer.prob, batch_count=8996.666666666666, ans=0.125 +2024-01-15 12:11:19,241 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.225e+02 2.688e+02 3.071e+02 4.061e+02 8.177e+02, threshold=6.142e+02, percent-clipped=7.0 +2024-01-15 12:11:40,173 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer2.prob, batch_count=9063.333333333334, ans=0.125 +2024-01-15 12:11:44,862 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.out_balancer.prob, batch_count=9063.333333333334, ans=0.125 +2024-01-15 12:11:44,939 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.ff3_skip_rate, batch_count=9063.333333333334, ans=0.00889927536231884 +2024-01-15 12:11:50,083 INFO [train.py:994] (1/2) Epoch 4, batch 200, loss[loss=0.296, simple_loss=0.346, pruned_loss=0.123, over 24458.00 frames. ], tot_loss[loss=0.2942, simple_loss=0.3364, pruned_loss=0.126, over 3051941.36 frames. 
], batch size: 216, lr: 4.27e-02, grad_scale: 32.0 +2024-01-15 12:11:55,165 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.ff2_skip_rate, batch_count=9096.666666666666, ans=0.008892028985507247 +2024-01-15 12:11:59,989 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.balancer1.prob, batch_count=9096.666666666666, ans=0.125 +2024-01-15 12:12:00,060 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=9096.666666666666, ans=0.20903333333333335 +2024-01-15 12:12:08,083 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=4.28 vs. limit=4.3695 +2024-01-15 12:12:08,871 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=9130.0, ans=0.125 +2024-01-15 12:12:20,347 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.balancer_ff2.min_abs, batch_count=9163.333333333334, ans=0.1 +2024-01-15 12:12:39,206 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=9230.0, ans=0.125 +2024-01-15 12:12:53,771 INFO [train.py:994] (1/2) Epoch 4, batch 250, loss[loss=0.3284, simple_loss=0.3646, pruned_loss=0.1461, over 23849.00 frames. ], tot_loss[loss=0.2925, simple_loss=0.3357, pruned_loss=0.1247, over 3445228.53 frames. ], batch size: 328, lr: 4.26e-02, grad_scale: 32.0 +2024-01-15 12:13:21,956 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff3_skip_rate, batch_count=9330.0, ans=0.008841304347826087 +2024-01-15 12:13:28,411 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.153e+02 2.732e+02 3.101e+02 3.691e+02 6.166e+02, threshold=6.201e+02, percent-clipped=1.0 +2024-01-15 12:13:31,221 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=9330.0, ans=0.125 +2024-01-15 12:13:42,905 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=9363.333333333334, ans=0.02765277777777778 +2024-01-15 12:13:43,412 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.36 vs. limit=4.4045000000000005 +2024-01-15 12:13:51,811 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=384, metric=2.50 vs. limit=11.02375 +2024-01-15 12:13:56,013 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=8.81 vs. limit=11.02375 +2024-01-15 12:13:59,095 INFO [train.py:994] (1/2) Epoch 4, batch 300, loss[loss=0.2758, simple_loss=0.3096, pruned_loss=0.121, over 23544.00 frames. ], tot_loss[loss=0.2918, simple_loss=0.335, pruned_loss=0.1243, over 3741429.12 frames. ], batch size: 119, lr: 4.26e-02, grad_scale: 32.0 +2024-01-15 12:14:07,696 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.39 vs. 
limit=4.4145 +2024-01-15 12:14:13,391 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.layerdrop_rate, batch_count=9463.333333333334, ans=0.11246416666666667 +2024-01-15 12:14:16,339 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=10.65 vs. limit=11.04875 +2024-01-15 12:14:48,258 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.attention_skip_rate, batch_count=9530.0, ans=0.026958333333333338 +2024-01-15 12:15:03,278 INFO [train.py:994] (1/2) Epoch 4, batch 350, loss[loss=0.2671, simple_loss=0.3191, pruned_loss=0.1076, over 24406.00 frames. ], tot_loss[loss=0.2901, simple_loss=0.334, pruned_loss=0.1231, over 3976303.51 frames. ], batch size: 159, lr: 4.26e-02, grad_scale: 32.0 +2024-01-15 12:15:28,925 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer2.min_abs, batch_count=9663.333333333334, ans=0.34495 +2024-01-15 12:15:35,760 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.316e+02 2.827e+02 3.214e+02 3.907e+02 6.387e+02, threshold=6.428e+02, percent-clipped=1.0 +2024-01-15 12:15:41,602 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=9696.666666666666, ans=0.125 +2024-01-15 12:15:44,783 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer2.prob, batch_count=9696.666666666666, ans=0.125 +2024-01-15 12:16:06,338 INFO [train.py:994] (1/2) Epoch 4, batch 400, loss[loss=0.2786, simple_loss=0.3294, pruned_loss=0.1139, over 24475.00 frames. ], tot_loss[loss=0.2899, simple_loss=0.3342, pruned_loss=0.1228, over 4170998.09 frames. ], batch size: 267, lr: 4.25e-02, grad_scale: 32.0 +2024-01-15 12:16:33,512 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=11.70 vs. limit=11.18625 +2024-01-15 12:16:51,261 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass_mid.scale_min, batch_count=9863.333333333334, ans=0.5547833333333334 +2024-01-15 12:16:51,272 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff3_skip_rate, batch_count=9863.333333333334, ans=0.00872536231884058 +2024-01-15 12:17:00,156 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass_mid.scale_min, batch_count=9896.666666666666, ans=0.5536166666666666 +2024-01-15 12:17:10,517 INFO [train.py:994] (1/2) Epoch 4, batch 450, loss[loss=0.2724, simple_loss=0.3219, pruned_loss=0.1115, over 24494.00 frames. ], tot_loss[loss=0.289, simple_loss=0.3339, pruned_loss=0.1221, over 4325622.82 frames. 
], batch size: 243, lr: 4.25e-02, grad_scale: 32.0 +2024-01-15 12:17:12,012 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer1.prob, batch_count=9930.0, ans=0.125 +2024-01-15 12:17:14,421 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward3.hidden_balancer.prob, batch_count=9930.0, ans=0.125 +2024-01-15 12:17:43,439 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.280e+02 2.708e+02 3.043e+02 3.598e+02 6.128e+02, threshold=6.087e+02, percent-clipped=0.0 +2024-01-15 12:17:54,990 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.hidden_balancer.prob, batch_count=10030.0, ans=0.125 +2024-01-15 12:17:58,508 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer1.max_abs, batch_count=10030.0, ans=10.0 +2024-01-15 12:18:13,757 INFO [train.py:994] (1/2) Epoch 4, batch 500, loss[loss=0.3001, simple_loss=0.344, pruned_loss=0.1281, over 24464.00 frames. ], tot_loss[loss=0.2881, simple_loss=0.333, pruned_loss=0.1215, over 4429460.94 frames. ], batch size: 250, lr: 4.24e-02, grad_scale: 32.0 +2024-01-15 12:18:23,918 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.skip_rate, batch_count=10096.666666666666, ans=0.04949747468305833 +2024-01-15 12:18:26,885 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.47 vs. limit=4.5195 +2024-01-15 12:18:26,919 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=14.44 vs. limit=15.0975 +2024-01-15 12:18:31,218 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.out_proj.dropout_p, batch_count=10130.0, ans=0.1987 +2024-01-15 12:18:34,752 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=10130.0, ans=0.024458333333333335 +2024-01-15 12:18:35,880 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer1.max_abs, batch_count=10130.0, ans=10.0 +2024-01-15 12:18:39,589 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=16.67 vs. limit=15.122499999999999 +2024-01-15 12:18:59,927 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.prob, batch_count=10196.666666666666, ans=0.125 +2024-01-15 12:19:17,902 INFO [train.py:994] (1/2) Epoch 4, batch 550, loss[loss=0.2765, simple_loss=0.3277, pruned_loss=0.1127, over 24586.00 frames. ], tot_loss[loss=0.2868, simple_loss=0.3325, pruned_loss=0.1205, over 4527858.64 frames. ], batch size: 199, lr: 4.24e-02, grad_scale: 32.0 +2024-01-15 12:19:47,222 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=10330.0, ans=0.125 +2024-01-15 12:19:48,805 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=512, metric=3.02 vs. 
limit=11.37375 +2024-01-15 12:19:50,509 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.184e+02 2.675e+02 3.062e+02 3.666e+02 5.960e+02, threshold=6.124e+02, percent-clipped=0.0 +2024-01-15 12:20:04,813 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=10363.333333333334, ans=0.19636666666666666 +2024-01-15 12:20:06,064 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass_mid.scale_min, batch_count=10363.333333333334, ans=0.5372833333333333 +2024-01-15 12:20:07,355 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.prob, batch_count=10396.666666666666, ans=0.125 +2024-01-15 12:20:10,511 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass.skip_rate, batch_count=10396.666666666666, ans=0.09899494936611666 +2024-01-15 12:20:20,940 INFO [train.py:994] (1/2) Epoch 4, batch 600, loss[loss=0.2888, simple_loss=0.3369, pruned_loss=0.1204, over 24578.00 frames. ], tot_loss[loss=0.2854, simple_loss=0.3315, pruned_loss=0.1197, over 4592915.01 frames. ], batch size: 176, lr: 4.23e-02, grad_scale: 32.0 +2024-01-15 12:20:26,522 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.out_balancer.prob, batch_count=10430.0, ans=0.125 +2024-01-15 12:20:40,819 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass_mid.scale_min, batch_count=10463.333333333334, ans=0.5337833333333334 +2024-01-15 12:20:43,426 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer2.prob, batch_count=10463.333333333334, ans=0.125 +2024-01-15 12:20:58,762 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer1.prob, batch_count=10530.0, ans=0.125 +2024-01-15 12:21:04,301 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer1.prob, batch_count=10530.0, ans=0.125 +2024-01-15 12:21:13,695 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer1.prob, batch_count=10563.333333333334, ans=0.125 +2024-01-15 12:21:24,610 INFO [train.py:994] (1/2) Epoch 4, batch 650, loss[loss=0.2967, simple_loss=0.3438, pruned_loss=0.1248, over 24298.00 frames. ], tot_loss[loss=0.2835, simple_loss=0.3297, pruned_loss=0.1186, over 4622360.95 frames. ], batch size: 285, lr: 4.23e-02, grad_scale: 32.0 +2024-01-15 12:21:53,712 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.hidden_balancer.prob, batch_count=10663.333333333334, ans=0.125 +2024-01-15 12:21:57,812 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=384, metric=15.42 vs. 
limit=15.497499999999999 +2024-01-15 12:21:58,959 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.008e+02 2.685e+02 3.057e+02 3.561e+02 6.746e+02, threshold=6.115e+02, percent-clipped=1.0 +2024-01-15 12:22:00,569 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer2.prob, batch_count=10663.333333333334, ans=0.125 +2024-01-15 12:22:15,606 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.min_abs, batch_count=10730.0, ans=0.36095 +2024-01-15 12:22:21,052 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.25 vs. limit=4.6095 +2024-01-15 12:22:21,986 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.14 vs. limit=4.6095 +2024-01-15 12:22:28,266 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.attention_skip_rate, batch_count=10763.333333333334, ans=0.02181944444444444 +2024-01-15 12:22:28,848 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.21 vs. limit=4.6145 +2024-01-15 12:22:29,226 INFO [train.py:994] (1/2) Epoch 4, batch 700, loss[loss=0.3024, simple_loss=0.3463, pruned_loss=0.1293, over 24510.00 frames. ], tot_loss[loss=0.2833, simple_loss=0.3297, pruned_loss=0.1184, over 4653376.85 frames. ], batch size: 187, lr: 4.22e-02, grad_scale: 32.0 +2024-01-15 12:22:33,259 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer1.prob, batch_count=10763.333333333334, ans=0.125 +2024-01-15 12:22:45,100 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=512, metric=17.92 vs. limit=15.5975 +2024-01-15 12:23:31,274 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.prob, batch_count=10930.0, ans=0.125 +2024-01-15 12:23:32,154 INFO [train.py:994] (1/2) Epoch 4, batch 750, loss[loss=0.2946, simple_loss=0.3412, pruned_loss=0.124, over 24352.00 frames. ], tot_loss[loss=0.2826, simple_loss=0.3293, pruned_loss=0.1179, over 4693749.53 frames. ], batch size: 285, lr: 4.22e-02, grad_scale: 64.0 +2024-01-15 12:23:48,944 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=7.60 vs. limit=11.61125 +2024-01-15 12:24:04,139 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.122e+02 2.570e+02 2.957e+02 3.374e+02 5.779e+02, threshold=5.914e+02, percent-clipped=0.0 +2024-01-15 12:24:09,063 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.skip_rate, batch_count=11030.0, ans=0.07 +2024-01-15 12:24:14,557 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer2.prob, batch_count=11030.0, ans=0.125 +2024-01-15 12:24:14,571 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=11030.0, ans=0.0 +2024-01-15 12:24:31,907 INFO [train.py:994] (1/2) Epoch 4, batch 800, loss[loss=0.2688, simple_loss=0.3052, pruned_loss=0.1162, over 23488.00 frames. 
], tot_loss[loss=0.2809, simple_loss=0.3281, pruned_loss=0.1168, over 4721034.20 frames. ], batch size: 119, lr: 4.21e-02, grad_scale: 64.0 +2024-01-15 12:24:33,876 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=11096.666666666666, ans=0.0 +2024-01-15 12:24:45,258 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer1.prob, batch_count=11130.0, ans=0.125 +2024-01-15 12:24:57,982 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_skip_rate, batch_count=11163.333333333334, ans=0.020152777777777773 +2024-01-15 12:25:00,274 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer1.prob, batch_count=11163.333333333334, ans=0.125 +2024-01-15 12:25:03,618 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer2.min_positive, batch_count=11163.333333333334, ans=0.05 +2024-01-15 12:25:19,593 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=384, metric=2.71 vs. limit=11.71125 +2024-01-15 12:25:45,405 INFO [train.py:994] (1/2) Epoch 5, batch 0, loss[loss=0.2637, simple_loss=0.3175, pruned_loss=0.105, over 24546.00 frames. ], tot_loss[loss=0.2637, simple_loss=0.3175, pruned_loss=0.105, over 24546.00 frames. ], batch size: 236, lr: 4.14e-02, grad_scale: 64.0 +2024-01-15 12:25:45,406 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 12:26:05,016 INFO [train.py:1026] (1/2) Epoch 5, validation: loss=0.2095, simple_loss=0.2939, pruned_loss=0.06254, over 1622729.00 frames. +2024-01-15 12:26:05,017 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 12:26:22,703 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=6.97 vs. limit=11.7275 +2024-01-15 12:26:24,428 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=11273.333333333334, ans=0.125 +2024-01-15 12:26:43,004 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer2.prob, batch_count=11340.0, ans=0.125 +2024-01-15 12:26:44,147 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff2_skip_rate, batch_count=11340.0, ans=0.008404347826086957 +2024-01-15 12:26:47,270 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.272e+02 2.795e+02 3.154e+02 3.646e+02 6.070e+02, threshold=6.308e+02, percent-clipped=1.0 +2024-01-15 12:27:08,763 INFO [train.py:994] (1/2) Epoch 5, batch 50, loss[loss=0.2598, simple_loss=0.306, pruned_loss=0.1068, over 24383.00 frames. ], tot_loss[loss=0.2753, simple_loss=0.3231, pruned_loss=0.1137, over 1071798.43 frames. ], batch size: 153, lr: 4.14e-02, grad_scale: 64.0 +2024-01-15 12:27:13,649 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=11406.666666666666, ans=0.125 +2024-01-15 12:27:32,556 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=4.60 vs. 
limit=8.589333333333332 +2024-01-15 12:27:35,649 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer2.prob, batch_count=11473.333333333334, ans=0.125 +2024-01-15 12:27:56,643 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=11540.0, ans=0.018583333333333334 +2024-01-15 12:27:59,036 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_skip_rate, batch_count=11540.0, ans=0.018583333333333334 +2024-01-15 12:28:05,114 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=384, metric=3.64 vs. limit=11.8275 +2024-01-15 12:28:06,045 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.hidden_balancer.prob, batch_count=11540.0, ans=0.125 +2024-01-15 12:28:09,356 INFO [train.py:994] (1/2) Epoch 5, batch 100, loss[loss=0.2529, simple_loss=0.3035, pruned_loss=0.1011, over 24035.00 frames. ], tot_loss[loss=0.2728, simple_loss=0.3218, pruned_loss=0.1119, over 1899280.03 frames. ], batch size: 131, lr: 4.13e-02, grad_scale: 32.0 +2024-01-15 12:28:33,087 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer2.min_positive, batch_count=11640.0, ans=0.05 +2024-01-15 12:28:36,062 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=192, metric=9.42 vs. limit=11.865 +2024-01-15 12:28:36,614 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=11640.0, ans=0.125 +2024-01-15 12:28:42,374 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer2.prob, batch_count=11640.0, ans=0.125 +2024-01-15 12:28:51,703 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.131e+02 2.763e+02 3.191e+02 3.657e+02 4.783e+02, threshold=6.382e+02, percent-clipped=0.0 +2024-01-15 12:28:53,257 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.skip_rate, batch_count=11673.333333333334, ans=0.09899494936611666 +2024-01-15 12:29:00,255 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=11706.666666666666, ans=0.125 +2024-01-15 12:29:11,263 INFO [train.py:994] (1/2) Epoch 5, batch 150, loss[loss=0.2694, simple_loss=0.3177, pruned_loss=0.1106, over 24415.00 frames. ], tot_loss[loss=0.2725, simple_loss=0.3218, pruned_loss=0.1117, over 2542066.83 frames. 
], batch size: 159, lr: 4.13e-02, grad_scale: 32.0 +2024-01-15 12:29:24,205 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=11773.333333333334, ans=0.125 +2024-01-15 12:29:28,857 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.ff3_skip_rate, batch_count=11773.333333333334, ans=0.008310144927536232 +2024-01-15 12:29:37,863 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.out_combiner.scale_min, batch_count=11806.666666666666, ans=0.48676666666666674 +2024-01-15 12:29:39,074 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer2.prob, batch_count=11806.666666666666, ans=0.125 +2024-01-15 12:29:53,757 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_skip_rate, batch_count=11840.0, ans=0.01733333333333334 +2024-01-15 12:29:58,249 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer2.min_positive, batch_count=11840.0, ans=0.05 +2024-01-15 12:30:00,211 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=11873.333333333334, ans=0.125 +2024-01-15 12:30:13,668 INFO [train.py:994] (1/2) Epoch 5, batch 200, loss[loss=0.2771, simple_loss=0.3274, pruned_loss=0.1134, over 24468.00 frames. ], tot_loss[loss=0.273, simple_loss=0.3231, pruned_loss=0.1115, over 3045161.93 frames. ], batch size: 267, lr: 4.12e-02, grad_scale: 32.0 +2024-01-15 12:30:18,731 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.skip_rate, batch_count=11906.666666666666, ans=0.04949747468305833 +2024-01-15 12:30:28,918 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=11940.0, ans=0.0 +2024-01-15 12:30:31,292 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer1.min_positive, batch_count=11940.0, ans=0.025 +2024-01-15 12:30:49,326 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=11.86 vs. limit=11.99 +2024-01-15 12:30:55,996 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=10.92 vs. limit=12.002500000000001 +2024-01-15 12:30:56,397 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.019e+02 2.655e+02 3.060e+02 3.593e+02 5.323e+02, threshold=6.121e+02, percent-clipped=0.0 +2024-01-15 12:31:11,397 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=12040.0, ans=0.17959999999999998 +2024-01-15 12:31:16,456 INFO [train.py:994] (1/2) Epoch 5, batch 250, loss[loss=0.2708, simple_loss=0.324, pruned_loss=0.1088, over 24539.00 frames. ], tot_loss[loss=0.2719, simple_loss=0.3224, pruned_loss=0.1106, over 3441956.90 frames. ], batch size: 176, lr: 4.11e-02, grad_scale: 32.0 +2024-01-15 12:32:08,403 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=5.31 vs. limit=8.051666666666666 +2024-01-15 12:32:18,807 INFO [train.py:994] (1/2) Epoch 5, batch 300, loss[loss=0.2496, simple_loss=0.3024, pruned_loss=0.09841, over 24194.00 frames. 
], tot_loss[loss=0.2706, simple_loss=0.3217, pruned_loss=0.1098, over 3752916.89 frames. ], batch size: 140, lr: 4.11e-02, grad_scale: 32.0 +2024-01-15 12:32:29,870 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.out_combiner.scale_min, batch_count=12273.333333333334, ans=0.4704333333333333 +2024-01-15 12:33:01,032 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.998e+02 2.817e+02 3.255e+02 3.789e+02 6.761e+02, threshold=6.509e+02, percent-clipped=1.0 +2024-01-15 12:33:01,898 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=192, metric=10.49 vs. limit=12.127500000000001 +2024-01-15 12:33:07,668 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=384, metric=2.98 vs. limit=12.14 +2024-01-15 12:33:20,946 INFO [train.py:994] (1/2) Epoch 5, batch 350, loss[loss=0.2818, simple_loss=0.3347, pruned_loss=0.1145, over 24497.00 frames. ], tot_loss[loss=0.2707, simple_loss=0.3219, pruned_loss=0.1098, over 3984437.39 frames. ], batch size: 187, lr: 4.10e-02, grad_scale: 32.0 +2024-01-15 12:33:28,405 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.attention_skip_rate, batch_count=12406.666666666666, ans=0.014972222222222227 +2024-01-15 12:33:35,790 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer1.prob, batch_count=12440.0, ans=0.125 +2024-01-15 12:33:42,314 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=12440.0, ans=0.125 +2024-01-15 12:33:42,364 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.skip_rate, batch_count=12440.0, ans=0.04949747468305833 +2024-01-15 12:33:57,926 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=18.90 vs. limit=12.1775 +2024-01-15 12:33:59,974 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=12506.666666666666, ans=0.125 +2024-01-15 12:34:10,388 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer1.prob, batch_count=12506.666666666666, ans=0.125 +2024-01-15 12:34:25,144 INFO [train.py:994] (1/2) Epoch 5, batch 400, loss[loss=0.2867, simple_loss=0.3354, pruned_loss=0.119, over 22494.00 frames. ], tot_loss[loss=0.2689, simple_loss=0.3204, pruned_loss=0.1087, over 4158502.12 frames. ], batch size: 357, lr: 4.10e-02, grad_scale: 32.0 +2024-01-15 12:34:27,188 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=9.02 vs. 
limit=12.215 +2024-01-15 12:34:52,010 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.ff2_skip_rate, batch_count=12640.0, ans=0.008121739130434782 +2024-01-15 12:34:53,174 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=12640.0, ans=0.125 +2024-01-15 12:35:08,726 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.026e+02 2.507e+02 2.821e+02 3.302e+02 6.163e+02, threshold=5.641e+02, percent-clipped=0.0 +2024-01-15 12:35:20,040 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=12706.666666666666, ans=0.0 +2024-01-15 12:35:27,625 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=16.72 vs. limit=17.055 +2024-01-15 12:35:28,865 INFO [train.py:994] (1/2) Epoch 5, batch 450, loss[loss=0.2744, simple_loss=0.3288, pruned_loss=0.11, over 24472.00 frames. ], tot_loss[loss=0.268, simple_loss=0.3197, pruned_loss=0.1082, over 4297877.55 frames. ], batch size: 222, lr: 4.09e-02, grad_scale: 32.0 +2024-01-15 12:35:29,210 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass_mid.scale_min, batch_count=12740.0, ans=0.4541 +2024-01-15 12:35:31,560 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.attention_skip_rate, batch_count=12740.0, ans=0.013583333333333336 +2024-01-15 12:35:51,457 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_skip_rate, batch_count=12773.333333333334, ans=0.01344444444444444 +2024-01-15 12:36:14,884 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer2.prob, batch_count=12840.0, ans=0.125 +2024-01-15 12:36:32,503 INFO [train.py:994] (1/2) Epoch 5, batch 500, loss[loss=0.3016, simple_loss=0.3479, pruned_loss=0.1277, over 23863.00 frames. ], tot_loss[loss=0.2678, simple_loss=0.3197, pruned_loss=0.1079, over 4414332.81 frames. ], batch size: 328, lr: 4.09e-02, grad_scale: 32.0 +2024-01-15 12:36:32,808 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=12906.666666666666, ans=0.125 +2024-01-15 12:36:42,523 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff2_skip_rate, batch_count=12906.666666666666, ans=0.00806376811594203 +2024-01-15 12:36:49,251 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=384, metric=18.83 vs. limit=17.205 +2024-01-15 12:36:56,029 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=12940.0, ans=0.0 +2024-01-15 12:37:04,347 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=12973.333333333334, ans=0.012611111111111108 +2024-01-15 12:37:04,409 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.attention_skip_rate, batch_count=12973.333333333334, ans=0.012611111111111108 +2024-01-15 12:37:13,166 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=256, metric=16.68 vs. 
limit=17.255000000000003 +2024-01-15 12:37:15,669 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.000e+02 2.645e+02 3.056e+02 3.686e+02 5.447e+02, threshold=6.112e+02, percent-clipped=0.0 +2024-01-15 12:37:26,336 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.skip_rate, batch_count=13040.0, ans=0.04949747468305833 +2024-01-15 12:37:32,252 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_skip_rate, batch_count=13040.0, ans=0.012333333333333335 +2024-01-15 12:37:35,485 INFO [train.py:994] (1/2) Epoch 5, batch 550, loss[loss=0.2674, simple_loss=0.3273, pruned_loss=0.1038, over 24362.00 frames. ], tot_loss[loss=0.2676, simple_loss=0.3198, pruned_loss=0.1077, over 4510904.48 frames. ], batch size: 298, lr: 4.08e-02, grad_scale: 32.0 +2024-01-15 12:37:37,692 INFO [scaling.py:1022] (1/2) Whitening: name=encoder_embed.out_whiten, num_groups=1, num_channels=192, metric=6.15 vs. limit=6.6146666666666665 +2024-01-15 12:37:47,077 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=384, metric=19.12 vs. limit=17.305 +2024-01-15 12:38:01,346 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=13140.0, ans=0.1686 +2024-01-15 12:38:12,159 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer1.prob, batch_count=13140.0, ans=0.125 +2024-01-15 12:38:13,440 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.ff3_skip_rate, batch_count=13173.333333333334, ans=0.008005797101449275 +2024-01-15 12:38:21,983 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=12.09 vs. limit=12.440000000000001 +2024-01-15 12:38:35,310 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=16.04 vs. limit=11.603333333333332 +2024-01-15 12:38:39,361 INFO [train.py:994] (1/2) Epoch 5, batch 600, loss[loss=0.2401, simple_loss=0.2959, pruned_loss=0.0922, over 24141.00 frames. ], tot_loss[loss=0.266, simple_loss=0.3186, pruned_loss=0.1067, over 4577292.70 frames. ], batch size: 140, lr: 4.08e-02, grad_scale: 32.0 +2024-01-15 12:38:48,022 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.prob, batch_count=13240.0, ans=0.125 +2024-01-15 12:39:06,008 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=13306.666666666666, ans=0.16693333333333335 +2024-01-15 12:39:23,501 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.093e+02 2.526e+02 2.863e+02 3.334e+02 5.005e+02, threshold=5.725e+02, percent-clipped=0.0 +2024-01-15 12:39:42,138 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=13373.333333333334, ans=0.010944444444444444 +2024-01-15 12:39:44,241 INFO [train.py:994] (1/2) Epoch 5, batch 650, loss[loss=0.2659, simple_loss=0.3206, pruned_loss=0.1055, over 24460.00 frames. ], tot_loss[loss=0.2651, simple_loss=0.3181, pruned_loss=0.1061, over 4631938.37 frames. 
], batch size: 222, lr: 4.07e-02, grad_scale: 32.0 +2024-01-15 12:39:44,566 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=13406.666666666666, ans=0.16593333333333335 +2024-01-15 12:39:44,890 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=10.59 vs. limit=12.5275 +2024-01-15 12:39:47,334 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=4.76 vs. limit=5.011 +2024-01-15 12:39:54,810 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=384, metric=3.71 vs. limit=12.5275 +2024-01-15 12:39:59,111 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 12:39:59,138 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.balancer1.prob, batch_count=13440.0, ans=0.125 +2024-01-15 12:40:08,411 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_skip_rate, batch_count=13473.333333333334, ans=0.010527777777777775 +2024-01-15 12:40:11,322 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=384, metric=18.58 vs. limit=17.605 +2024-01-15 12:40:12,133 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass_mid.scale_min, batch_count=13473.333333333334, ans=0.42843333333333333 +2024-01-15 12:40:17,474 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.62 vs. limit=5.021 +2024-01-15 12:40:26,708 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer2.prob, batch_count=13506.666666666666, ans=0.125 +2024-01-15 12:40:29,147 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer1.prob, batch_count=13506.666666666666, ans=0.125 +2024-01-15 12:40:48,231 INFO [train.py:994] (1/2) Epoch 5, batch 700, loss[loss=0.27, simple_loss=0.3245, pruned_loss=0.1078, over 24603.00 frames. ], tot_loss[loss=0.264, simple_loss=0.3172, pruned_loss=0.1054, over 4670020.93 frames. ], batch size: 199, lr: 4.06e-02, grad_scale: 32.0 +2024-01-15 12:41:13,895 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=13640.0, ans=0.1636 +2024-01-15 12:41:30,114 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.027e+02 2.828e+02 3.267e+02 3.789e+02 7.603e+02, threshold=6.534e+02, percent-clipped=3.0 +2024-01-15 12:41:50,755 INFO [train.py:994] (1/2) Epoch 5, batch 750, loss[loss=0.2622, simple_loss=0.3093, pruned_loss=0.1076, over 24493.00 frames. ], tot_loss[loss=0.2637, simple_loss=0.3167, pruned_loss=0.1054, over 4698575.82 frames. ], batch size: 165, lr: 4.06e-02, grad_scale: 32.0 +2024-01-15 12:42:09,250 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=16.91 vs. limit=17.83 +2024-01-15 12:42:21,422 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=5.12 vs. 
limit=11.903333333333332 +2024-01-15 12:42:42,895 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.scale_min, batch_count=13873.333333333334, ans=0.4144333333333333 +2024-01-15 12:42:51,444 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=15.27 vs. limit=12.715 +2024-01-15 12:42:51,799 INFO [train.py:994] (1/2) Epoch 5, batch 800, loss[loss=0.2625, simple_loss=0.3248, pruned_loss=0.1001, over 24512.00 frames. ], tot_loss[loss=0.2622, simple_loss=0.3158, pruned_loss=0.1044, over 4731196.42 frames. ], batch size: 229, lr: 4.05e-02, grad_scale: 32.0 +2024-01-15 12:42:58,951 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer2.prob, batch_count=13906.666666666666, ans=0.125 +2024-01-15 12:43:07,834 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=13940.0, ans=0.0 +2024-01-15 12:43:18,971 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff3_skip_rate, batch_count=13973.333333333334, ans=0.007831884057971014 +2024-01-15 12:43:23,651 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=13973.333333333334, ans=0.16026666666666667 +2024-01-15 12:43:31,528 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.918e+02 2.625e+02 2.874e+02 3.417e+02 5.842e+02, threshold=5.748e+02, percent-clipped=0.0 +2024-01-15 12:43:37,772 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=5.05 vs. limit=5.106 +2024-01-15 12:44:05,337 INFO [train.py:994] (1/2) Epoch 6, batch 0, loss[loss=0.2492, simple_loss=0.3105, pruned_loss=0.09396, over 24442.00 frames. ], tot_loss[loss=0.2492, simple_loss=0.3105, pruned_loss=0.09396, over 24442.00 frames. ], batch size: 250, lr: 3.97e-02, grad_scale: 32.0 +2024-01-15 12:44:05,337 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 12:44:25,849 INFO [train.py:1026] (1/2) Epoch 6, validation: loss=0.2018, simple_loss=0.2869, pruned_loss=0.05834, over 1622729.00 frames. +2024-01-15 12:44:25,850 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 12:44:30,666 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=14.52 vs. limit=12.76875 +2024-01-15 12:44:45,871 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=14083.333333333334, ans=0.0 +2024-01-15 12:44:59,084 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.nonlin_attention.balancer.prob, batch_count=14116.666666666666, ans=0.125 +2024-01-15 12:44:59,114 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=14116.666666666666, ans=0.007847222222222228 +2024-01-15 12:45:06,248 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer1.prob, batch_count=14150.0, ans=0.125 +2024-01-15 12:45:14,451 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=12.39 vs. 
limit=12.80625 +2024-01-15 12:45:20,505 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=14183.333333333334, ans=0.15816666666666665 +2024-01-15 12:45:21,737 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module1.balancer2.prob, batch_count=14183.333333333334, ans=0.125 +2024-01-15 12:45:28,465 INFO [train.py:994] (1/2) Epoch 6, batch 50, loss[loss=0.2285, simple_loss=0.2846, pruned_loss=0.08617, over 23965.00 frames. ], tot_loss[loss=0.2582, simple_loss=0.3131, pruned_loss=0.1016, over 1087660.08 frames. ], batch size: 131, lr: 3.97e-02, grad_scale: 32.0 +2024-01-15 12:45:45,345 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff3_skip_rate, batch_count=14250.0, ans=0.0077717391304347825 +2024-01-15 12:45:59,662 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=14283.333333333334, ans=0.15716666666666668 +2024-01-15 12:46:03,046 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.balancer.prob, batch_count=14283.333333333334, ans=0.125 +2024-01-15 12:46:09,632 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.2.prob, batch_count=14316.666666666666, ans=0.125 +2024-01-15 12:46:09,752 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=14316.666666666666, ans=0.125 +2024-01-15 12:46:19,403 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.144e+02 2.532e+02 2.872e+02 3.378e+02 9.523e+02, threshold=5.745e+02, percent-clipped=4.0 +2024-01-15 12:46:29,988 INFO [train.py:994] (1/2) Epoch 6, batch 100, loss[loss=0.22, simple_loss=0.2772, pruned_loss=0.08143, over 23979.00 frames. ], tot_loss[loss=0.2578, simple_loss=0.3124, pruned_loss=0.1016, over 1907818.33 frames. ], batch size: 131, lr: 3.96e-02, grad_scale: 32.0 +2024-01-15 12:47:33,312 INFO [train.py:994] (1/2) Epoch 6, batch 150, loss[loss=0.2478, simple_loss=0.3092, pruned_loss=0.09315, over 24293.00 frames. ], tot_loss[loss=0.2572, simple_loss=0.3121, pruned_loss=0.1012, over 2544331.80 frames. ], batch size: 285, lr: 3.96e-02, grad_scale: 32.0 +2024-01-15 12:47:33,552 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer2.prob, batch_count=14550.0, ans=0.125 +2024-01-15 12:47:39,695 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=14550.0, ans=0.1545 +2024-01-15 12:47:39,964 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=12.12 vs. limit=12.95625 +2024-01-15 12:48:01,373 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.whiten, num_groups=1, num_channels=512, metric=5.58 vs. 
limit=9.846666666666668 +2024-01-15 12:48:04,511 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.0.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 12:48:24,975 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.141e+02 2.520e+02 2.957e+02 3.469e+02 5.327e+02, threshold=5.915e+02, percent-clipped=0.0 +2024-01-15 12:48:29,934 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.ff3_skip_rate, batch_count=14683.333333333334, ans=0.0076775362318840575 +2024-01-15 12:48:34,003 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=512, metric=19.80 vs. limit=18.5125 +2024-01-15 12:48:34,943 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.skip_rate, batch_count=14716.666666666666, ans=0.04949747468305833 +2024-01-15 12:48:35,716 INFO [train.py:994] (1/2) Epoch 6, batch 200, loss[loss=0.2601, simple_loss=0.3166, pruned_loss=0.1018, over 24441.00 frames. ], tot_loss[loss=0.2578, simple_loss=0.3134, pruned_loss=0.1012, over 3059147.16 frames. ], batch size: 250, lr: 3.95e-02, grad_scale: 32.0 +2024-01-15 12:48:41,065 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=384, metric=2.33 vs. limit=13.01875 +2024-01-15 12:48:55,746 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=17.40 vs. limit=13.03125 +2024-01-15 12:49:16,169 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.conv_module1.whiten, num_groups=1, num_channels=192, metric=6.86 vs. limit=13.05625 +2024-01-15 12:49:25,445 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=14850.0, ans=0.15150000000000002 +2024-01-15 12:49:34,451 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer2.prob, batch_count=14850.0, ans=0.125 +2024-01-15 12:49:36,885 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer1.prob, batch_count=14850.0, ans=0.125 +2024-01-15 12:49:39,771 INFO [train.py:994] (1/2) Epoch 6, batch 250, loss[loss=0.2571, simple_loss=0.3097, pruned_loss=0.1022, over 24615.00 frames. ], tot_loss[loss=0.2569, simple_loss=0.3125, pruned_loss=0.1007, over 3450830.51 frames. ], batch size: 199, lr: 3.94e-02, grad_scale: 16.0 +2024-01-15 12:50:17,204 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.out_proj.dropout_p, batch_count=14983.333333333334, ans=0.15016666666666667 +2024-01-15 12:50:19,529 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module1.balancer2.min_positive, batch_count=14983.333333333334, ans=0.05 +2024-01-15 12:50:32,176 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.898e+02 2.584e+02 2.958e+02 3.363e+02 5.470e+02, threshold=5.917e+02, percent-clipped=0.0 +2024-01-15 12:50:41,680 INFO [train.py:994] (1/2) Epoch 6, batch 300, loss[loss=0.258, simple_loss=0.3144, pruned_loss=0.1008, over 24438.00 frames. ], tot_loss[loss=0.2566, simple_loss=0.312, pruned_loss=0.1006, over 3742171.23 frames. 
], batch size: 250, lr: 3.94e-02, grad_scale: 16.0 +2024-01-15 12:50:41,987 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=15050.0, ans=0.0039583333333333345 +2024-01-15 12:50:52,068 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.hidden_balancer.prob, batch_count=15050.0, ans=0.125 +2024-01-15 12:51:33,295 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer2.prob, batch_count=15183.333333333334, ans=0.125 +2024-01-15 12:51:44,449 INFO [train.py:994] (1/2) Epoch 6, batch 350, loss[loss=0.2577, simple_loss=0.311, pruned_loss=0.1022, over 24568.00 frames. ], tot_loss[loss=0.2558, simple_loss=0.3114, pruned_loss=0.1001, over 3971098.40 frames. ], batch size: 193, lr: 3.93e-02, grad_scale: 16.0 +2024-01-15 12:51:54,052 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.whiten, num_groups=1, num_channels=384, metric=4.18 vs. limit=10.086666666666666 +2024-01-15 12:52:26,288 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.attention_skip_rate, batch_count=15316.666666666666, ans=0.002847222222222223 +2024-01-15 12:52:26,314 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer1.prob, batch_count=15316.666666666666, ans=0.125 +2024-01-15 12:52:37,732 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.145e+02 2.642e+02 3.065e+02 3.528e+02 6.043e+02, threshold=6.131e+02, percent-clipped=0.0 +2024-01-15 12:52:44,525 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.scale_min, batch_count=15350.0, ans=0.36275 +2024-01-15 12:52:47,772 INFO [train.py:994] (1/2) Epoch 6, batch 400, loss[loss=0.2692, simple_loss=0.3255, pruned_loss=0.1064, over 24515.00 frames. ], tot_loss[loss=0.2558, simple_loss=0.3114, pruned_loss=0.1001, over 4162772.33 frames. ], batch size: 187, lr: 3.93e-02, grad_scale: 32.0 +2024-01-15 12:52:51,638 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass.scale_min, batch_count=15383.333333333334, ans=0.36158333333333337 +2024-01-15 12:52:55,261 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.prob, batch_count=15383.333333333334, ans=0.125 +2024-01-15 12:53:23,114 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=15450.0, ans=0.14550000000000002 +2024-01-15 12:53:50,315 INFO [train.py:994] (1/2) Epoch 6, batch 450, loss[loss=0.2636, simple_loss=0.3207, pruned_loss=0.1033, over 24531.00 frames. ], tot_loss[loss=0.2552, simple_loss=0.311, pruned_loss=0.09975, over 4308719.34 frames. ], batch size: 193, lr: 3.92e-02, grad_scale: 32.0 +2024-01-15 12:54:24,574 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=192, metric=8.92 vs. 
limit=13.35625 +2024-01-15 12:54:32,959 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=15650.0, ans=0.125 +2024-01-15 12:54:43,350 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.107e+02 2.574e+02 3.083e+02 3.661e+02 5.434e+02, threshold=6.165e+02, percent-clipped=0.0 +2024-01-15 12:54:54,291 INFO [train.py:994] (1/2) Epoch 6, batch 500, loss[loss=0.2204, simple_loss=0.2793, pruned_loss=0.08076, over 24200.00 frames. ], tot_loss[loss=0.2541, simple_loss=0.31, pruned_loss=0.0991, over 4411975.84 frames. ], batch size: 140, lr: 3.92e-02, grad_scale: 32.0 +2024-01-15 12:54:57,311 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=512, metric=19.92 vs. limit=19.2875 +2024-01-15 12:55:05,350 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.balancer1.prob, batch_count=15750.0, ans=0.125 +2024-01-15 12:55:13,486 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=4.60 vs. limit=8.9375 +2024-01-15 12:55:21,025 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.scale_min, batch_count=15783.333333333334, ans=0.34758333333333336 +2024-01-15 12:55:24,599 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.scale_min, batch_count=15783.333333333334, ans=0.34758333333333336 +2024-01-15 12:55:31,657 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 12:55:38,754 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=512, metric=20.42 vs. limit=19.3625 +2024-01-15 12:55:43,418 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=512, metric=20.44 vs. limit=19.3875 +2024-01-15 12:55:48,905 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=12.61 vs. limit=12.925 +2024-01-15 12:55:55,453 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff3_skip_rate, batch_count=15883.333333333334, ans=0.007416666666666667 +2024-01-15 12:55:56,331 INFO [train.py:994] (1/2) Epoch 6, batch 550, loss[loss=0.2509, simple_loss=0.3075, pruned_loss=0.09715, over 24513.00 frames. ], tot_loss[loss=0.254, simple_loss=0.3102, pruned_loss=0.0989, over 4503285.29 frames. 
], batch size: 243, lr: 3.91e-02, grad_scale: 32.0 +2024-01-15 12:55:57,844 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer2.prob, batch_count=15883.333333333334, ans=0.125 +2024-01-15 12:56:25,364 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer2.min_abs, batch_count=15950.0, ans=0.43925000000000003 +2024-01-15 12:56:44,789 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=15983.333333333334, ans=0.125 +2024-01-15 12:56:50,483 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.044e+02 2.391e+02 2.809e+02 3.171e+02 7.983e+02, threshold=5.617e+02, percent-clipped=2.0 +2024-01-15 12:56:51,902 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=16016.666666666666, ans=0.13983333333333334 +2024-01-15 12:56:59,323 INFO [train.py:994] (1/2) Epoch 6, batch 600, loss[loss=0.233, simple_loss=0.2982, pruned_loss=0.0839, over 24567.00 frames. ], tot_loss[loss=0.2525, simple_loss=0.309, pruned_loss=0.09801, over 4571473.30 frames. ], batch size: 176, lr: 3.90e-02, grad_scale: 16.0 +2024-01-15 12:57:10,948 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer1.prob, batch_count=16083.333333333334, ans=0.125 +2024-01-15 12:57:16,959 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.scale_min, batch_count=16083.333333333334, ans=0.3370833333333333 +2024-01-15 12:57:19,794 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=16083.333333333334, ans=0.0 +2024-01-15 12:57:19,851 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.skip_rate, batch_count=16083.333333333334, ans=0.09899494936611666 +2024-01-15 12:57:53,375 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.balancer1.prob, batch_count=16183.333333333334, ans=0.125 +2024-01-15 12:58:02,125 INFO [train.py:994] (1/2) Epoch 6, batch 650, loss[loss=0.2689, simple_loss=0.3244, pruned_loss=0.1067, over 24514.00 frames. ], tot_loss[loss=0.2516, simple_loss=0.3085, pruned_loss=0.09735, over 4640525.92 frames. ], batch size: 243, lr: 3.90e-02, grad_scale: 16.0 +2024-01-15 12:58:08,695 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=5.36 vs. limit=13.108333333333333 +2024-01-15 12:58:12,395 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=16216.666666666666, ans=0.125 +2024-01-15 12:58:19,677 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=16.70 vs. limit=19.6875 +2024-01-15 12:58:45,805 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=16316.666666666666, ans=0.13683333333333333 +2024-01-15 12:58:55,215 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=512, metric=19.89 vs. 
limit=19.7625 +2024-01-15 12:58:55,715 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.004e+02 2.577e+02 2.996e+02 4.100e+02 6.936e+02, threshold=5.993e+02, percent-clipped=2.0 +2024-01-15 12:59:04,786 INFO [train.py:994] (1/2) Epoch 6, batch 700, loss[loss=0.2528, simple_loss=0.3157, pruned_loss=0.09495, over 24325.00 frames. ], tot_loss[loss=0.2499, simple_loss=0.3072, pruned_loss=0.09631, over 4680000.49 frames. ], batch size: 298, lr: 3.89e-02, grad_scale: 16.0 +2024-01-15 12:59:09,778 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer2.prob, batch_count=16383.333333333334, ans=0.125 +2024-01-15 12:59:55,853 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer2.prob, batch_count=16516.666666666668, ans=0.125 +2024-01-15 13:00:02,960 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=5.50 vs. limit=13.258333333333335 +2024-01-15 13:00:07,680 INFO [train.py:994] (1/2) Epoch 6, batch 750, loss[loss=0.2529, simple_loss=0.3103, pruned_loss=0.09772, over 24505.00 frames. ], tot_loss[loss=0.2492, simple_loss=0.3066, pruned_loss=0.09585, over 4703700.04 frames. ], batch size: 229, lr: 3.89e-02, grad_scale: 16.0 +2024-01-15 13:00:12,638 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward2.hidden_balancer.prob, batch_count=16550.0, ans=0.125 +2024-01-15 13:00:20,074 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.5.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 13:00:24,206 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer2.prob, batch_count=16583.333333333332, ans=0.125 +2024-01-15 13:00:45,846 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.skip_rate, batch_count=16650.0, ans=0.07 +2024-01-15 13:01:00,505 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.978e+02 2.440e+02 2.798e+02 3.396e+02 5.464e+02, threshold=5.595e+02, percent-clipped=0.0 +2024-01-15 13:01:08,972 INFO [train.py:994] (1/2) Epoch 6, batch 800, loss[loss=0.2767, simple_loss=0.3279, pruned_loss=0.1128, over 24549.00 frames. ], tot_loss[loss=0.2485, simple_loss=0.3065, pruned_loss=0.0953, over 4737809.07 frames. ], batch size: 176, lr: 3.88e-02, grad_scale: 32.0 +2024-01-15 13:01:10,314 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer1.min_positive, batch_count=16716.666666666668, ans=0.025 +2024-01-15 13:01:25,004 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer1.prob, batch_count=16750.0, ans=0.125 +2024-01-15 13:01:39,145 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=3.97 vs. limit=13.391666666666666 +2024-01-15 13:02:20,525 INFO [train.py:994] (1/2) Epoch 7, batch 0, loss[loss=0.2749, simple_loss=0.3307, pruned_loss=0.1096, over 24489.00 frames. ], tot_loss[loss=0.2749, simple_loss=0.3307, pruned_loss=0.1096, over 24489.00 frames. ], batch size: 187, lr: 3.79e-02, grad_scale: 32.0 +2024-01-15 13:02:20,525 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 13:02:40,095 INFO [train.py:1026] (1/2) Epoch 7, validation: loss=0.1997, simple_loss=0.2857, pruned_loss=0.05682, over 1622729.00 frames. 
+2024-01-15 13:02:40,096 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 13:02:54,622 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.out_combiner.scale_min, batch_count=16893.333333333332, ans=0.3087333333333334 +2024-01-15 13:03:22,499 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.self_attn_weights.pos_emb_skip_rate, batch_count=16960.0, ans=0.0 +2024-01-15 13:03:39,622 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=16993.333333333332, ans=0.0 +2024-01-15 13:03:42,737 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.863e+02 2.614e+02 2.980e+02 3.791e+02 6.611e+02, threshold=5.960e+02, percent-clipped=2.0 +2024-01-15 13:03:42,765 INFO [train.py:994] (1/2) Epoch 7, batch 50, loss[loss=0.2397, simple_loss=0.3009, pruned_loss=0.08923, over 24458.00 frames. ], tot_loss[loss=0.2439, simple_loss=0.3027, pruned_loss=0.09259, over 1084026.50 frames. ], batch size: 181, lr: 3.79e-02, grad_scale: 32.0 +2024-01-15 13:04:00,949 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=11.95 vs. limit=13.8975 +2024-01-15 13:04:27,942 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.attention_skip_rate, batch_count=17126.666666666668, ans=0.0 +2024-01-15 13:04:38,213 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.whiten, num_groups=1, num_channels=384, metric=3.29 vs. limit=10.864 +2024-01-15 13:04:40,651 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=512, metric=20.68 vs. limit=20.369999999999997 +2024-01-15 13:04:44,719 INFO [train.py:994] (1/2) Epoch 7, batch 100, loss[loss=0.2414, simple_loss=0.3036, pruned_loss=0.0896, over 24526.00 frames. ], tot_loss[loss=0.2432, simple_loss=0.3023, pruned_loss=0.09207, over 1913338.16 frames. ], batch size: 229, lr: 3.78e-02, grad_scale: 16.0 +2024-01-15 13:04:47,743 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.whiten, num_groups=1, num_channels=512, metric=5.57 vs. limit=10.877333333333333 +2024-01-15 13:05:26,319 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer2.prob, batch_count=17293.333333333332, ans=0.125 +2024-01-15 13:05:34,294 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.out_proj.dropout_p, batch_count=17326.666666666668, ans=0.1267333333333333 +2024-01-15 13:05:48,207 INFO [train.py:994] (1/2) Epoch 7, batch 150, loss[loss=0.2506, simple_loss=0.3109, pruned_loss=0.09514, over 24343.00 frames. ], tot_loss[loss=0.2439, simple_loss=0.3031, pruned_loss=0.09235, over 2557815.79 frames. 
], batch size: 298, lr: 3.78e-02, grad_scale: 16.0 +2024-01-15 13:05:49,388 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.035e+02 2.480e+02 2.785e+02 3.316e+02 4.883e+02, threshold=5.571e+02, percent-clipped=0.0 +2024-01-15 13:05:55,884 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.balancer2.prob, batch_count=17360.0, ans=0.125 +2024-01-15 13:06:03,945 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=17393.333333333332, ans=0.1260666666666667 +2024-01-15 13:06:12,680 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer1.prob, batch_count=17426.666666666668, ans=0.125 +2024-01-15 13:06:26,054 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.conv_module2.whiten, num_groups=1, num_channels=192, metric=3.28 vs. limit=14.0475 +2024-01-15 13:06:28,983 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.2.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 13:06:33,752 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=12.10 vs. limit=14.0475 +2024-01-15 13:06:35,717 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer1.prob, batch_count=17460.0, ans=0.125 +2024-01-15 13:06:45,324 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.ff2_skip_rate, batch_count=17493.333333333332, ans=0.007066666666666667 +2024-01-15 13:06:47,828 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=17493.333333333332, ans=0.1250666666666667 +2024-01-15 13:06:51,740 INFO [train.py:994] (1/2) Epoch 7, batch 200, loss[loss=0.2054, simple_loss=0.2478, pruned_loss=0.08146, over 18691.00 frames. ], tot_loss[loss=0.2435, simple_loss=0.3022, pruned_loss=0.09234, over 3046965.04 frames. ], batch size: 81, lr: 3.77e-02, grad_scale: 16.0 +2024-01-15 13:06:55,694 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer2.prob, batch_count=17526.666666666668, ans=0.125 +2024-01-15 13:07:00,890 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer2.min_positive, batch_count=17526.666666666668, ans=0.05 +2024-01-15 13:07:09,031 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=17560.0, ans=0.12440000000000001 +2024-01-15 13:07:19,632 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=9.27 vs. limit=14.0975 +2024-01-15 13:07:38,752 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=17626.666666666668, ans=0.1237333333333333 +2024-01-15 13:07:45,547 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=17660.0, ans=0.12340000000000001 +2024-01-15 13:07:54,126 INFO [train.py:994] (1/2) Epoch 7, batch 250, loss[loss=0.2674, simple_loss=0.3221, pruned_loss=0.1063, over 24569.00 frames. ], tot_loss[loss=0.2441, simple_loss=0.3029, pruned_loss=0.09261, over 3439370.86 frames. 
], batch size: 176, lr: 3.76e-02, grad_scale: 16.0 +2024-01-15 13:07:55,375 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.023e+02 2.392e+02 2.709e+02 3.243e+02 5.140e+02, threshold=5.418e+02, percent-clipped=0.0 +2024-01-15 13:08:30,811 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=14.94 vs. limit=14.1725 +2024-01-15 13:08:36,193 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.scale_min, batch_count=17793.333333333332, ans=0.27723333333333344 +2024-01-15 13:08:38,125 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.ff2_skip_rate, batch_count=17793.333333333332, ans=0.007001449275362319 +2024-01-15 13:08:41,057 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=17793.333333333332, ans=0.12206666666666668 +2024-01-15 13:08:54,226 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=4.50 vs. limit=9.456666666666667 +2024-01-15 13:08:55,957 INFO [train.py:994] (1/2) Epoch 7, batch 300, loss[loss=0.2504, simple_loss=0.3098, pruned_loss=0.0955, over 23941.00 frames. ], tot_loss[loss=0.2439, simple_loss=0.3029, pruned_loss=0.09243, over 3756803.50 frames. ], batch size: 328, lr: 3.76e-02, grad_scale: 8.0 +2024-01-15 13:08:59,808 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=17860.0, ans=0.12140000000000001 +2024-01-15 13:09:12,828 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=17893.333333333332, ans=0.12106666666666668 +2024-01-15 13:09:15,194 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=17893.333333333332, ans=0.125 +2024-01-15 13:09:28,796 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer2.min_abs, batch_count=17926.666666666668, ans=0.4689 +2024-01-15 13:09:34,139 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.out_proj.dropout_p, batch_count=17960.0, ans=0.12040000000000001 +2024-01-15 13:09:47,285 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=14.34 vs. limit=14.247499999999999 +2024-01-15 13:09:53,223 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=17993.333333333332, ans=0.0 +2024-01-15 13:09:58,907 INFO [train.py:994] (1/2) Epoch 7, batch 350, loss[loss=0.2611, simple_loss=0.3182, pruned_loss=0.102, over 24234.00 frames. ], tot_loss[loss=0.2433, simple_loss=0.3025, pruned_loss=0.09199, over 3988854.00 frames. 
], batch size: 311, lr: 3.75e-02, grad_scale: 8.0 +2024-01-15 13:09:59,164 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff2_skip_rate, batch_count=18026.666666666668, ans=0.00695072463768116 +2024-01-15 13:10:01,280 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.851e+02 2.448e+02 2.766e+02 3.076e+02 2.159e+03, threshold=5.532e+02, percent-clipped=4.0 +2024-01-15 13:10:11,246 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=9.77 vs. limit=14.2725 +2024-01-15 13:10:12,181 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer2.prob, batch_count=18060.0, ans=0.125 +2024-01-15 13:11:00,524 INFO [train.py:994] (1/2) Epoch 7, batch 400, loss[loss=0.2471, simple_loss=0.3087, pruned_loss=0.09275, over 24438.00 frames. ], tot_loss[loss=0.2429, simple_loss=0.3025, pruned_loss=0.0917, over 4175915.87 frames. ], batch size: 170, lr: 3.75e-02, grad_scale: 16.0 +2024-01-15 13:11:09,238 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=18193.333333333332, ans=0.11806666666666668 +2024-01-15 13:11:37,985 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=18293.333333333332, ans=0.11706666666666668 +2024-01-15 13:11:58,021 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=12.64 vs. limit=14.163333333333334 +2024-01-15 13:12:02,129 INFO [train.py:994] (1/2) Epoch 7, batch 450, loss[loss=0.229, simple_loss=0.2892, pruned_loss=0.08443, over 24176.00 frames. ], tot_loss[loss=0.2424, simple_loss=0.3019, pruned_loss=0.09142, over 4308077.48 frames. ], batch size: 140, lr: 3.74e-02, grad_scale: 16.0 +2024-01-15 13:12:05,170 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.955e+02 2.388e+02 2.626e+02 2.940e+02 4.634e+02, threshold=5.253e+02, percent-clipped=0.0 +2024-01-15 13:12:18,067 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=18393.333333333332, ans=0.0 +2024-01-15 13:12:20,540 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer2.min_abs, batch_count=18393.333333333332, ans=0.4759 +2024-01-15 13:12:29,534 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer1.prob, batch_count=18426.666666666668, ans=0.125 +2024-01-15 13:12:36,048 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff3_skip_rate, batch_count=18426.666666666668, ans=0.0068637681159420295 +2024-01-15 13:12:36,503 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=13.52 vs. limit=14.41 +2024-01-15 13:12:38,805 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=13.30 vs. limit=14.41 +2024-01-15 13:13:05,238 INFO [train.py:994] (1/2) Epoch 7, batch 500, loss[loss=0.2578, simple_loss=0.3132, pruned_loss=0.1012, over 24474.00 frames. ], tot_loss[loss=0.2417, simple_loss=0.3015, pruned_loss=0.09098, over 4418421.79 frames. 
], batch size: 170, lr: 3.73e-02, grad_scale: 16.0 +2024-01-15 13:13:13,788 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer1.prob, batch_count=18526.666666666668, ans=0.125 +2024-01-15 13:13:22,571 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=18560.0, ans=0.1144 +2024-01-15 13:13:23,836 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass_mid.scale_min, batch_count=18560.0, ans=0.25040000000000007 +2024-01-15 13:13:30,337 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass.scale_min, batch_count=18593.333333333332, ans=0.24923333333333342 +2024-01-15 13:13:40,708 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=18626.666666666668, ans=0.0 +2024-01-15 13:13:58,923 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=18660.0, ans=0.125 +2024-01-15 13:14:06,842 INFO [train.py:994] (1/2) Epoch 7, batch 550, loss[loss=0.2334, simple_loss=0.2944, pruned_loss=0.08619, over 24536.00 frames. ], tot_loss[loss=0.2411, simple_loss=0.301, pruned_loss=0.09063, over 4492783.20 frames. ], batch size: 236, lr: 3.73e-02, grad_scale: 16.0 +2024-01-15 13:14:09,603 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.822e+02 2.418e+02 2.722e+02 3.262e+02 6.493e+02, threshold=5.445e+02, percent-clipped=3.0 +2024-01-15 13:14:16,640 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer2.min_abs, batch_count=18693.333333333332, ans=0.48039999999999994 +2024-01-15 13:14:31,609 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.ff2_skip_rate, batch_count=18760.0, ans=0.006791304347826087 +2024-01-15 13:14:39,391 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.bypass.scale_min, batch_count=18760.0, ans=0.24340000000000006 +2024-01-15 13:14:42,978 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=5.10 vs. limit=11.504 +2024-01-15 13:14:43,626 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.scale_min, batch_count=18793.333333333332, ans=0.2422333333333334 +2024-01-15 13:14:51,883 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.attention_skip_rate, batch_count=18793.333333333332, ans=0.0 +2024-01-15 13:14:56,622 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer2.prob, batch_count=18826.666666666668, ans=0.125 +2024-01-15 13:15:08,842 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.2.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 13:15:09,669 INFO [train.py:994] (1/2) Epoch 7, batch 600, loss[loss=0.2019, simple_loss=0.2471, pruned_loss=0.0784, over 19067.00 frames. ], tot_loss[loss=0.2404, simple_loss=0.3006, pruned_loss=0.09009, over 4564575.88 frames. 
], batch size: 82, lr: 3.72e-02, grad_scale: 16.0 +2024-01-15 13:15:27,213 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=18893.333333333332, ans=0.125 +2024-01-15 13:15:33,457 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module1.balancer1.prob, batch_count=18926.666666666668, ans=0.125 +2024-01-15 13:15:47,037 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=18960.0, ans=0.0 +2024-01-15 13:16:11,528 INFO [train.py:994] (1/2) Epoch 7, batch 650, loss[loss=0.2553, simple_loss=0.3163, pruned_loss=0.09711, over 24480.00 frames. ], tot_loss[loss=0.2399, simple_loss=0.3003, pruned_loss=0.08982, over 4619512.30 frames. ], batch size: 181, lr: 3.72e-02, grad_scale: 16.0 +2024-01-15 13:16:13,841 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.002e+02 2.481e+02 2.819e+02 3.403e+02 1.032e+03, threshold=5.637e+02, percent-clipped=2.0 +2024-01-15 13:16:39,190 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=19093.333333333332, ans=0.125 +2024-01-15 13:17:03,810 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.5.prob, batch_count=19160.0, ans=0.125 +2024-01-15 13:17:13,785 INFO [train.py:994] (1/2) Epoch 7, batch 700, loss[loss=0.247, simple_loss=0.3102, pruned_loss=0.09194, over 24479.00 frames. ], tot_loss[loss=0.24, simple_loss=0.3007, pruned_loss=0.08962, over 4676506.96 frames. ], batch size: 181, lr: 3.71e-02, grad_scale: 16.0 +2024-01-15 13:17:15,233 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=19193.333333333332, ans=0.125 +2024-01-15 13:17:47,941 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=19260.0, ans=0.10740000000000002 +2024-01-15 13:17:56,315 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer1.prob, batch_count=19293.333333333332, ans=0.125 +2024-01-15 13:18:16,523 INFO [train.py:994] (1/2) Epoch 7, batch 750, loss[loss=0.2329, simple_loss=0.2968, pruned_loss=0.08447, over 24523.00 frames. ], tot_loss[loss=0.24, simple_loss=0.3008, pruned_loss=0.08964, over 4700773.73 frames. 
], batch size: 165, lr: 3.71e-02, grad_scale: 16.0 +2024-01-15 13:18:18,846 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.030e+02 2.484e+02 2.733e+02 3.222e+02 4.395e+02, threshold=5.467e+02, percent-clipped=0.0 +2024-01-15 13:18:23,351 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer2.prob, batch_count=19360.0, ans=0.125 +2024-01-15 13:18:36,148 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.out_combiner.scale_min, batch_count=19393.333333333332, ans=0.2212333333333334 +2024-01-15 13:18:47,533 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer2.prob, batch_count=19426.666666666668, ans=0.125 +2024-01-15 13:18:50,317 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=19426.666666666668, ans=0.10573333333333335 +2024-01-15 13:18:54,845 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=19460.0, ans=0.125 +2024-01-15 13:19:15,983 INFO [train.py:994] (1/2) Epoch 7, batch 800, loss[loss=0.2525, simple_loss=0.3135, pruned_loss=0.09578, over 24506.00 frames. ], tot_loss[loss=0.2394, simple_loss=0.3001, pruned_loss=0.08931, over 4730101.68 frames. ], batch size: 181, lr: 3.70e-02, grad_scale: 32.0 +2024-01-15 13:19:26,817 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=19560.0, ans=0.10440000000000002 +2024-01-15 13:19:38,287 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.5.prob, batch_count=19593.333333333332, ans=0.125 +2024-01-15 13:19:49,960 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=9.28 vs. limit=14.86 +2024-01-15 13:19:59,642 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.nonlin_attention.balancer.prob, batch_count=19626.666666666668, ans=0.125 +2024-01-15 13:20:00,032 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=18.50 vs. limit=14.86 +2024-01-15 13:20:01,869 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.prob, batch_count=19660.0, ans=0.125 +2024-01-15 13:20:28,048 INFO [train.py:994] (1/2) Epoch 8, batch 0, loss[loss=0.2232, simple_loss=0.2914, pruned_loss=0.07754, over 24348.00 frames. ], tot_loss[loss=0.2232, simple_loss=0.2914, pruned_loss=0.07754, over 24348.00 frames. ], batch size: 285, lr: 3.61e-02, grad_scale: 32.0 +2024-01-15 13:20:28,049 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 13:20:45,557 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([3.9755, 3.7558, 3.1738, 3.5943], device='cuda:1') +2024-01-15 13:20:46,311 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([3.8050, 3.6394, 2.9857, 3.4719], device='cuda:1') +2024-01-15 13:20:48,230 INFO [train.py:1026] (1/2) Epoch 8, validation: loss=0.1914, simple_loss=0.2786, pruned_loss=0.05211, over 1622729.00 frames. 
+2024-01-15 13:20:48,231 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 13:20:59,656 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.013e+02 2.481e+02 2.846e+02 3.388e+02 4.975e+02, threshold=5.693e+02, percent-clipped=0.0 +2024-01-15 13:21:22,915 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=19736.666666666668, ans=0.0 +2024-01-15 13:21:35,403 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.ff2_skip_rate, batch_count=19770.0, ans=0.006571739130434783 +2024-01-15 13:21:39,582 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff2_skip_rate, batch_count=19803.333333333332, ans=0.006564492753623188 +2024-01-15 13:21:46,635 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff3_skip_rate, batch_count=19803.333333333332, ans=0.006564492753623188 +2024-01-15 13:21:49,910 INFO [train.py:994] (1/2) Epoch 8, batch 50, loss[loss=0.2481, simple_loss=0.3106, pruned_loss=0.09282, over 24230.00 frames. ], tot_loss[loss=0.233, simple_loss=0.295, pruned_loss=0.08554, over 1094745.95 frames. ], batch size: 311, lr: 3.60e-02, grad_scale: 32.0 +2024-01-15 13:21:50,554 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.conv_module2.whiten, num_groups=1, num_channels=512, metric=5.91 vs. limit=14.938749999999999 +2024-01-15 13:21:55,637 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.ff2_skip_rate, batch_count=19836.666666666668, ans=0.006557246376811594 +2024-01-15 13:22:30,289 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.dropout.p, batch_count=19936.666666666668, ans=0.10063333333333332 +2024-01-15 13:22:33,092 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=13.24 vs. limit=14.97625 +2024-01-15 13:22:52,455 INFO [train.py:994] (1/2) Epoch 8, batch 100, loss[loss=0.2595, simple_loss=0.3153, pruned_loss=0.1018, over 24392.00 frames. ], tot_loss[loss=0.2342, simple_loss=0.2961, pruned_loss=0.08616, over 1917736.91 frames. ], batch size: 159, lr: 3.60e-02, grad_scale: 32.0 +2024-01-15 13:23:00,346 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=20003.333333333332, ans=0.125 +2024-01-15 13:23:03,536 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.852e+02 2.423e+02 2.800e+02 3.261e+02 6.993e+02, threshold=5.599e+02, percent-clipped=2.0 +2024-01-15 13:23:17,665 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=20070.0, ans=0.0 +2024-01-15 13:23:40,287 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=20103.333333333332, ans=0.125 +2024-01-15 13:23:42,755 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer2.prob, batch_count=20136.666666666668, ans=0.125 +2024-01-15 13:23:47,149 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=12.48 vs. limit=15.0 +2024-01-15 13:23:54,920 INFO [train.py:994] (1/2) Epoch 8, batch 150, loss[loss=0.2196, simple_loss=0.2765, pruned_loss=0.08131, over 24224.00 frames. 
], tot_loss[loss=0.2342, simple_loss=0.2963, pruned_loss=0.08607, over 2558289.29 frames. ], batch size: 140, lr: 3.59e-02, grad_scale: 32.0 +2024-01-15 13:23:56,558 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=5.94 vs. limit=15.0 +2024-01-15 13:24:00,996 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=20170.0, ans=0.125 +2024-01-15 13:24:19,802 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.balancer1.prob, batch_count=20236.666666666668, ans=0.125 +2024-01-15 13:24:19,877 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=20236.666666666668, ans=0.125 +2024-01-15 13:24:31,678 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=384, metric=22.04 vs. limit=22.5 +2024-01-15 13:24:42,044 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.16 vs. limit=6.0 +2024-01-15 13:24:47,572 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff2_skip_rate, batch_count=20303.333333333332, ans=0.006455797101449275 +2024-01-15 13:24:57,370 INFO [train.py:994] (1/2) Epoch 8, batch 200, loss[loss=0.2348, simple_loss=0.2921, pruned_loss=0.08878, over 24343.00 frames. ], tot_loss[loss=0.2344, simple_loss=0.2967, pruned_loss=0.08611, over 3066395.72 frames. ], batch size: 153, lr: 3.59e-02, grad_scale: 32.0 +2024-01-15 13:25:06,628 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.prob, batch_count=20336.666666666668, ans=0.125 +2024-01-15 13:25:08,734 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.990e+02 2.315e+02 2.435e+02 2.815e+02 4.055e+02, threshold=4.871e+02, percent-clipped=0.0 +2024-01-15 13:25:12,651 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.scale_min, batch_count=20370.0, ans=0.2 +2024-01-15 13:25:29,369 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.attention_skip_rate, batch_count=20403.333333333332, ans=0.0 +2024-01-15 13:25:34,035 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer1.prob, batch_count=20436.666666666668, ans=0.125 +2024-01-15 13:25:34,056 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer2.prob, batch_count=20436.666666666668, ans=0.125 +2024-01-15 13:25:39,905 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.balancer_ff3.min_abs, batch_count=20436.666666666668, ans=0.2 +2024-01-15 13:25:59,667 INFO [train.py:994] (1/2) Epoch 8, batch 250, loss[loss=0.2378, simple_loss=0.298, pruned_loss=0.08879, over 24477.00 frames. ], tot_loss[loss=0.2348, simple_loss=0.2967, pruned_loss=0.08644, over 3448579.01 frames. ], batch size: 170, lr: 3.58e-02, grad_scale: 32.0 +2024-01-15 13:26:34,057 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module2.balancer1.max_abs, batch_count=20570.0, ans=10.0 +2024-01-15 13:26:34,275 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=384, metric=21.69 vs. 
limit=22.5 +2024-01-15 13:26:47,282 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=20603.333333333332, ans=0.1 +2024-01-15 13:26:54,527 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=20636.666666666668, ans=0.1 +2024-01-15 13:27:01,722 INFO [train.py:994] (1/2) Epoch 8, batch 300, loss[loss=0.2487, simple_loss=0.3156, pruned_loss=0.09095, over 24347.00 frames. ], tot_loss[loss=0.2328, simple_loss=0.295, pruned_loss=0.08533, over 3742155.79 frames. ], batch size: 298, lr: 3.58e-02, grad_scale: 32.0 +2024-01-15 13:27:13,455 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.844e+02 2.481e+02 2.795e+02 3.167e+02 5.927e+02, threshold=5.590e+02, percent-clipped=2.0 +2024-01-15 13:27:17,911 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=6.10 vs. limit=10.0 +2024-01-15 13:27:27,889 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_skip_rate, batch_count=20736.666666666668, ans=0.0 +2024-01-15 13:27:41,089 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer2.prob, batch_count=20770.0, ans=0.125 +2024-01-15 13:27:42,299 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.self_attn_weights.pos_emb_skip_rate, batch_count=20770.0, ans=0.0 +2024-01-15 13:27:55,264 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff2_skip_rate, batch_count=20803.333333333332, ans=0.0063471014492753625 +2024-01-15 13:27:56,403 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 13:28:04,381 INFO [train.py:994] (1/2) Epoch 8, batch 350, loss[loss=0.2156, simple_loss=0.2813, pruned_loss=0.07501, over 24335.00 frames. ], tot_loss[loss=0.2332, simple_loss=0.2955, pruned_loss=0.08546, over 3984547.47 frames. ], batch size: 147, lr: 3.57e-02, grad_scale: 32.0 +2024-01-15 13:28:28,713 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=20903.333333333332, ans=0.1 +2024-01-15 13:28:28,744 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer1.prob, batch_count=20903.333333333332, ans=0.125 +2024-01-15 13:28:30,371 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=10.98 vs. limit=15.0 +2024-01-15 13:28:37,099 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.balancer1.prob, batch_count=20903.333333333332, ans=0.125 +2024-01-15 13:28:44,082 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer2.prob, batch_count=20936.666666666668, ans=0.125 +2024-01-15 13:29:05,929 INFO [train.py:994] (1/2) Epoch 8, batch 400, loss[loss=0.2058, simple_loss=0.2665, pruned_loss=0.07259, over 24089.00 frames. ], tot_loss[loss=0.2325, simple_loss=0.2949, pruned_loss=0.08506, over 4181530.75 frames. 
], batch size: 131, lr: 3.56e-02, grad_scale: 32.0 +2024-01-15 13:29:17,211 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.964e+02 2.300e+02 2.666e+02 3.173e+02 4.906e+02, threshold=5.331e+02, percent-clipped=0.0 +2024-01-15 13:29:19,380 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=6.36 vs. limit=10.0 +2024-01-15 13:29:20,620 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=21036.666666666668, ans=0.1 +2024-01-15 13:29:20,627 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=21036.666666666668, ans=0.125 +2024-01-15 13:29:36,253 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.skip_rate, batch_count=21070.0, ans=0.07 +2024-01-15 13:29:39,397 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=21070.0, ans=0.1 +2024-01-15 13:29:49,493 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=21103.333333333332, ans=0.1 +2024-01-15 13:30:08,881 INFO [train.py:994] (1/2) Epoch 8, batch 450, loss[loss=0.2405, simple_loss=0.3062, pruned_loss=0.08741, over 24353.00 frames. ], tot_loss[loss=0.2326, simple_loss=0.295, pruned_loss=0.08515, over 4311691.54 frames. ], batch size: 298, lr: 3.56e-02, grad_scale: 32.0 +2024-01-15 13:30:24,049 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer2.prob, batch_count=21203.333333333332, ans=0.125 +2024-01-15 13:30:26,347 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer2.prob, batch_count=21203.333333333332, ans=0.125 +2024-01-15 13:30:34,600 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=21236.666666666668, ans=0.125 +2024-01-15 13:31:11,016 INFO [train.py:994] (1/2) Epoch 8, batch 500, loss[loss=0.2447, simple_loss=0.309, pruned_loss=0.09024, over 24421.00 frames. ], tot_loss[loss=0.2323, simple_loss=0.2946, pruned_loss=0.08499, over 4427661.16 frames. ], batch size: 181, lr: 3.55e-02, grad_scale: 32.0 +2024-01-15 13:31:12,421 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.skip_rate, batch_count=21336.666666666668, ans=0.04949747468305833 +2024-01-15 13:31:19,598 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer2.prob, batch_count=21336.666666666668, ans=0.125 +2024-01-15 13:31:22,111 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.874e+02 2.351e+02 2.729e+02 3.218e+02 7.359e+02, threshold=5.457e+02, percent-clipped=2.0 +2024-01-15 13:31:35,182 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=512, metric=23.62 vs. 
limit=22.5 +2024-01-15 13:31:36,009 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass_mid.scale_min, batch_count=21403.333333333332, ans=0.2 +2024-01-15 13:31:58,730 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=21436.666666666668, ans=0.125 +2024-01-15 13:32:13,254 INFO [train.py:994] (1/2) Epoch 8, batch 550, loss[loss=0.2486, simple_loss=0.3126, pruned_loss=0.09232, over 23838.00 frames. ], tot_loss[loss=0.2318, simple_loss=0.2942, pruned_loss=0.08471, over 4510762.09 frames. ], batch size: 328, lr: 3.55e-02, grad_scale: 32.0 +2024-01-15 13:32:29,320 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.attention_skip_rate, batch_count=21536.666666666668, ans=0.0 +2024-01-15 13:32:30,520 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer2.min_abs, batch_count=21536.666666666668, ans=0.5 +2024-01-15 13:32:33,430 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=192, metric=8.93 vs. limit=15.0 +2024-01-15 13:32:40,803 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.balancer.prob, batch_count=21570.0, ans=0.125 +2024-01-15 13:33:15,487 INFO [train.py:994] (1/2) Epoch 8, batch 600, loss[loss=0.2437, simple_loss=0.3009, pruned_loss=0.09326, over 24579.00 frames. ], tot_loss[loss=0.2306, simple_loss=0.2932, pruned_loss=0.08399, over 4565145.46 frames. ], batch size: 176, lr: 3.54e-02, grad_scale: 32.0 +2024-01-15 13:33:25,941 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.032e+02 2.417e+02 2.801e+02 3.609e+02 5.540e+02, threshold=5.603e+02, percent-clipped=1.0 +2024-01-15 13:34:17,520 INFO [train.py:994] (1/2) Epoch 8, batch 650, loss[loss=0.2548, simple_loss=0.3137, pruned_loss=0.09794, over 24468.00 frames. ], tot_loss[loss=0.231, simple_loss=0.2937, pruned_loss=0.08416, over 4622358.25 frames. ], batch size: 187, lr: 3.53e-02, grad_scale: 32.0 +2024-01-15 13:34:39,139 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=21870.0, ans=0.1 +2024-01-15 13:34:40,913 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=3.56 vs. limit=15.0 +2024-01-15 13:35:00,573 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=4.51 vs. limit=15.0 +2024-01-15 13:35:15,071 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn2.whiten, num_groups=1, num_channels=192, metric=15.06 vs. limit=22.5 +2024-01-15 13:35:19,815 INFO [train.py:994] (1/2) Epoch 8, batch 700, loss[loss=0.2207, simple_loss=0.2748, pruned_loss=0.08331, over 23697.00 frames. ], tot_loss[loss=0.2306, simple_loss=0.2931, pruned_loss=0.08398, over 4662444.41 frames. 
], batch size: 119, lr: 3.53e-02, grad_scale: 32.0 +2024-01-15 13:35:29,244 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward3.out_whiten.whitening_limit, batch_count=22003.333333333332, ans=15.0 +2024-01-15 13:35:30,858 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.817e+02 2.323e+02 2.652e+02 3.070e+02 7.446e+02, threshold=5.305e+02, percent-clipped=1.0 +2024-01-15 13:35:35,901 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=22036.666666666668, ans=0.125 +2024-01-15 13:35:53,549 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff2_skip_rate, batch_count=22070.0, ans=0.006071739130434782 +2024-01-15 13:35:53,779 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=384, metric=21.71 vs. limit=22.5 +2024-01-15 13:36:13,626 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass_mid.scale_min, batch_count=22136.666666666668, ans=0.2 +2024-01-15 13:36:21,538 INFO [train.py:994] (1/2) Epoch 8, batch 750, loss[loss=0.242, simple_loss=0.3033, pruned_loss=0.09036, over 24516.00 frames. ], tot_loss[loss=0.2308, simple_loss=0.2934, pruned_loss=0.08408, over 4694993.12 frames. ], batch size: 187, lr: 3.52e-02, grad_scale: 32.0 +2024-01-15 13:36:33,049 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass_mid.scale_min, batch_count=22203.333333333332, ans=0.2 +2024-01-15 13:36:33,451 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=12.54 vs. limit=15.0 +2024-01-15 13:37:05,327 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=22270.0, ans=0.1 +2024-01-15 13:37:20,730 INFO [train.py:994] (1/2) Epoch 8, batch 800, loss[loss=0.2296, simple_loss=0.2964, pruned_loss=0.08141, over 24486.00 frames. ], tot_loss[loss=0.2308, simple_loss=0.2937, pruned_loss=0.08396, over 4728021.85 frames. ], batch size: 210, lr: 3.52e-02, grad_scale: 32.0 +2024-01-15 13:37:30,747 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.814e+02 2.312e+02 2.631e+02 3.113e+02 5.409e+02, threshold=5.262e+02, percent-clipped=1.0 +2024-01-15 13:37:32,150 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff3_skip_rate, batch_count=22370.0, ans=0.006006521739130435 +2024-01-15 13:37:54,489 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass_mid.scale_min, batch_count=22436.666666666668, ans=0.2 +2024-01-15 13:37:57,991 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=22436.666666666668, ans=0.1 +2024-01-15 13:38:00,237 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.scale_min, batch_count=22436.666666666668, ans=0.2 +2024-01-15 13:38:03,491 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward2.hidden_balancer.prob, batch_count=22436.666666666668, ans=0.125 +2024-01-15 13:38:32,542 INFO [train.py:994] (1/2) Epoch 9, batch 0, loss[loss=0.2346, simple_loss=0.2981, pruned_loss=0.08553, over 24460.00 frames. 
], tot_loss[loss=0.2346, simple_loss=0.2981, pruned_loss=0.08553, over 24460.00 frames. ], batch size: 222, lr: 3.43e-02, grad_scale: 32.0 +2024-01-15 13:38:32,543 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 13:38:52,995 INFO [train.py:1026] (1/2) Epoch 9, validation: loss=0.1878, simple_loss=0.275, pruned_loss=0.05027, over 1622729.00 frames. +2024-01-15 13:38:52,996 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 13:39:11,614 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=22513.333333333332, ans=0.125 +2024-01-15 13:39:26,602 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=22546.666666666668, ans=0.125 +2024-01-15 13:39:28,898 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass_mid.scale_min, batch_count=22580.0, ans=0.2 +2024-01-15 13:39:43,606 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=256, metric=19.67 vs. limit=22.5 +2024-01-15 13:39:49,842 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=16.00 vs. limit=22.5 +2024-01-15 13:39:53,820 INFO [train.py:994] (1/2) Epoch 9, batch 50, loss[loss=0.2337, simple_loss=0.2986, pruned_loss=0.08446, over 24397.00 frames. ], tot_loss[loss=0.2255, simple_loss=0.2883, pruned_loss=0.08135, over 1088162.95 frames. ], batch size: 275, lr: 3.42e-02, grad_scale: 32.0 +2024-01-15 13:40:02,813 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.hidden_balancer.prob, batch_count=22646.666666666668, ans=0.125 +2024-01-15 13:40:06,329 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer1.prob, batch_count=22680.0, ans=0.125 +2024-01-15 13:40:14,528 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.991e+02 2.374e+02 2.681e+02 3.103e+02 5.812e+02, threshold=5.362e+02, percent-clipped=2.0 +2024-01-15 13:40:34,414 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=22746.666666666668, ans=0.125 +2024-01-15 13:40:44,467 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=22780.0, ans=0.125 +2024-01-15 13:40:44,496 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=22780.0, ans=0.1 +2024-01-15 13:40:49,258 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer1.prob, batch_count=22780.0, ans=0.125 +2024-01-15 13:40:55,829 INFO [train.py:994] (1/2) Epoch 9, batch 100, loss[loss=0.2042, simple_loss=0.2723, pruned_loss=0.06801, over 24304.00 frames. ], tot_loss[loss=0.2262, simple_loss=0.2892, pruned_loss=0.08165, over 1905012.60 frames. 
], batch size: 147, lr: 3.42e-02, grad_scale: 32.0 +2024-01-15 13:40:58,457 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=22813.333333333332, ans=0.1 +2024-01-15 13:41:05,583 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer1.min_positive, batch_count=22813.333333333332, ans=0.025 +2024-01-15 13:41:07,969 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_skip_rate, batch_count=22846.666666666668, ans=0.0 +2024-01-15 13:41:25,755 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.scale_min, batch_count=22880.0, ans=0.2 +2024-01-15 13:41:27,714 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.balancer1.prob, batch_count=22880.0, ans=0.125 +2024-01-15 13:41:30,636 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_skip_rate, batch_count=22880.0, ans=0.0 +2024-01-15 13:41:31,941 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer1.prob, batch_count=22880.0, ans=0.125 +2024-01-15 13:41:35,330 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.scale_min, batch_count=22913.333333333332, ans=0.2 +2024-01-15 13:41:37,983 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.50 vs. limit=6.0 +2024-01-15 13:41:40,176 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer1.prob, batch_count=22913.333333333332, ans=0.125 +2024-01-15 13:41:45,971 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer2.prob, batch_count=22946.666666666668, ans=0.125 +2024-01-15 13:41:52,533 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=22946.666666666668, ans=0.125 +2024-01-15 13:41:59,118 INFO [train.py:994] (1/2) Epoch 9, batch 150, loss[loss=0.2328, simple_loss=0.3012, pruned_loss=0.08214, over 24368.00 frames. ], tot_loss[loss=0.2241, simple_loss=0.2874, pruned_loss=0.08039, over 2540526.42 frames. ], batch size: 275, lr: 3.41e-02, grad_scale: 32.0 +2024-01-15 13:42:05,180 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=22980.0, ans=0.1 +2024-01-15 13:42:12,465 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer2.min_abs, batch_count=23013.333333333332, ans=0.5 +2024-01-15 13:42:16,287 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=5.49 vs. limit=10.0 +2024-01-15 13:42:18,648 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.883e+02 2.295e+02 2.541e+02 2.854e+02 4.375e+02, threshold=5.082e+02, percent-clipped=0.0 +2024-01-15 13:42:25,317 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer2.min_abs, batch_count=23046.666666666668, ans=0.5 +2024-01-15 13:42:47,274 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=10.28 vs. 
limit=15.0 +2024-01-15 13:42:54,839 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass.scale_min, batch_count=23113.333333333332, ans=0.2 +2024-01-15 13:42:59,898 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=13.83 vs. limit=15.0 +2024-01-15 13:43:00,458 INFO [train.py:994] (1/2) Epoch 9, batch 200, loss[loss=0.2197, simple_loss=0.2836, pruned_loss=0.07787, over 24489.00 frames. ], tot_loss[loss=0.2244, simple_loss=0.2882, pruned_loss=0.08029, over 3038289.91 frames. ], batch size: 229, lr: 3.41e-02, grad_scale: 32.0 +2024-01-15 13:43:05,457 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass.skip_rate, batch_count=23146.666666666668, ans=0.09899494936611666 +2024-01-15 13:43:10,897 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_skip_rate, batch_count=23146.666666666668, ans=0.0 +2024-01-15 13:43:28,929 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.layerdrop_rate, batch_count=23213.333333333332, ans=0.015 +2024-01-15 13:43:44,926 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff3_skip_rate, batch_count=23246.666666666668, ans=0.005815942028985507 +2024-01-15 13:44:03,116 INFO [train.py:994] (1/2) Epoch 9, batch 250, loss[loss=0.2305, simple_loss=0.2923, pruned_loss=0.08434, over 24472.00 frames. ], tot_loss[loss=0.2253, simple_loss=0.2897, pruned_loss=0.08044, over 3442357.78 frames. ], batch size: 267, lr: 3.40e-02, grad_scale: 32.0 +2024-01-15 13:44:07,816 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer2.prob, batch_count=23313.333333333332, ans=0.125 +2024-01-15 13:44:16,491 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=23346.666666666668, ans=0.0 +2024-01-15 13:44:23,382 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.928e+02 2.470e+02 2.801e+02 3.237e+02 5.591e+02, threshold=5.602e+02, percent-clipped=1.0 +2024-01-15 13:44:31,203 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=23380.0, ans=0.125 +2024-01-15 13:44:51,438 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer2.prob, batch_count=23413.333333333332, ans=0.125 +2024-01-15 13:44:54,279 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=384, metric=21.97 vs. limit=22.5 +2024-01-15 13:45:06,115 INFO [train.py:994] (1/2) Epoch 9, batch 300, loss[loss=0.2327, simple_loss=0.2968, pruned_loss=0.08436, over 24604.00 frames. ], tot_loss[loss=0.2256, simple_loss=0.2897, pruned_loss=0.08075, over 3741680.93 frames. 
], batch size: 199, lr: 3.39e-02, grad_scale: 32.0 +2024-01-15 13:45:08,659 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.nonlin_attention.balancer.prob, batch_count=23480.0, ans=0.125 +2024-01-15 13:45:27,774 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.balancer.max_positive, batch_count=23513.333333333332, ans=0.95 +2024-01-15 13:45:33,834 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer1.prob, batch_count=23546.666666666668, ans=0.125 +2024-01-15 13:45:39,528 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=23546.666666666668, ans=0.1 +2024-01-15 13:45:55,228 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=5.23 vs. limit=6.0 +2024-01-15 13:45:59,676 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.skip_rate, batch_count=23613.333333333332, ans=0.04949747468305833 +2024-01-15 13:46:04,341 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward2.hidden_balancer.prob, batch_count=23613.333333333332, ans=0.125 +2024-01-15 13:46:08,248 INFO [train.py:994] (1/2) Epoch 9, batch 350, loss[loss=0.1906, simple_loss=0.2567, pruned_loss=0.06227, over 24206.00 frames. ], tot_loss[loss=0.2251, simple_loss=0.2896, pruned_loss=0.08031, over 3987918.97 frames. ], batch size: 132, lr: 3.39e-02, grad_scale: 32.0 +2024-01-15 13:46:25,776 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass_mid.scale_min, batch_count=23680.0, ans=0.2 +2024-01-15 13:46:27,747 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.923e+02 2.389e+02 2.601e+02 3.203e+02 4.963e+02, threshold=5.202e+02, percent-clipped=0.0 +2024-01-15 13:46:31,579 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=23713.333333333332, ans=0.125 +2024-01-15 13:46:40,325 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer1.prob, batch_count=23713.333333333332, ans=0.125 +2024-01-15 13:46:49,194 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer2.prob, batch_count=23746.666666666668, ans=0.125 +2024-01-15 13:47:03,864 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=23780.0, ans=0.125 +2024-01-15 13:47:09,690 INFO [train.py:994] (1/2) Epoch 9, batch 400, loss[loss=0.2553, simple_loss=0.3137, pruned_loss=0.09846, over 22436.00 frames. ], tot_loss[loss=0.2246, simple_loss=0.2889, pruned_loss=0.08015, over 4152418.10 frames. ], batch size: 357, lr: 3.38e-02, grad_scale: 32.0 +2024-01-15 13:47:14,617 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=11.37 vs. 
limit=15.0 +2024-01-15 13:47:15,534 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.scale_min, batch_count=23813.333333333332, ans=0.2 +2024-01-15 13:47:29,400 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer1.max_abs, batch_count=23846.666666666668, ans=10.0 +2024-01-15 13:47:45,634 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=4.31 vs. limit=12.0 +2024-01-15 13:48:12,639 INFO [train.py:994] (1/2) Epoch 9, batch 450, loss[loss=0.2402, simple_loss=0.3101, pruned_loss=0.08518, over 23842.00 frames. ], tot_loss[loss=0.2246, simple_loss=0.2889, pruned_loss=0.08016, over 4294968.87 frames. ], batch size: 328, lr: 3.38e-02, grad_scale: 32.0 +2024-01-15 13:48:18,308 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer1.min_positive, batch_count=23980.0, ans=0.025 +2024-01-15 13:48:32,118 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.prob, batch_count=24013.333333333332, ans=0.125 +2024-01-15 13:48:33,062 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.903e+02 2.345e+02 2.628e+02 3.128e+02 5.089e+02, threshold=5.255e+02, percent-clipped=0.0 +2024-01-15 13:48:36,880 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff3_skip_rate, batch_count=24046.666666666668, ans=0.005642028985507246 +2024-01-15 13:48:57,353 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=24080.0, ans=0.0 +2024-01-15 13:49:15,881 INFO [train.py:994] (1/2) Epoch 9, batch 500, loss[loss=0.1984, simple_loss=0.2477, pruned_loss=0.07461, over 19079.00 frames. ], tot_loss[loss=0.224, simple_loss=0.2882, pruned_loss=0.07993, over 4400315.87 frames. ], batch size: 83, lr: 3.37e-02, grad_scale: 32.0 +2024-01-15 13:49:25,065 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer2.prob, batch_count=24146.666666666668, ans=0.125 +2024-01-15 13:49:26,385 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=24146.666666666668, ans=0.125 +2024-01-15 13:49:27,553 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer1.prob, batch_count=24180.0, ans=0.125 +2024-01-15 13:50:08,314 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=9.26 vs. limit=10.0 +2024-01-15 13:50:18,513 INFO [train.py:994] (1/2) Epoch 9, batch 550, loss[loss=0.2558, simple_loss=0.3175, pruned_loss=0.0971, over 22604.00 frames. ], tot_loss[loss=0.2244, simple_loss=0.2885, pruned_loss=0.08014, over 4493694.07 frames. 
], batch size: 357, lr: 3.37e-02, grad_scale: 32.0 +2024-01-15 13:50:22,368 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff3_skip_rate, batch_count=24313.333333333332, ans=0.0055840579710144935 +2024-01-15 13:50:31,244 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=24346.666666666668, ans=0.1 +2024-01-15 13:50:38,522 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.063e+02 2.333e+02 2.583e+02 2.991e+02 4.085e+02, threshold=5.165e+02, percent-clipped=0.0 +2024-01-15 13:50:56,349 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.prob, batch_count=24413.333333333332, ans=0.125 +2024-01-15 13:51:20,803 INFO [train.py:994] (1/2) Epoch 9, batch 600, loss[loss=0.2168, simple_loss=0.2807, pruned_loss=0.0765, over 24316.00 frames. ], tot_loss[loss=0.2241, simple_loss=0.2881, pruned_loss=0.08001, over 4554353.35 frames. ], batch size: 285, lr: 3.36e-02, grad_scale: 64.0 +2024-01-15 13:51:21,070 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer1.prob, batch_count=24480.0, ans=0.125 +2024-01-15 13:51:25,616 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.scale_min, batch_count=24480.0, ans=0.2 +2024-01-15 13:51:31,243 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer2.prob, batch_count=24480.0, ans=0.125 +2024-01-15 13:51:42,493 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=384, metric=25.59 vs. limit=22.5 +2024-01-15 13:52:12,460 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer2.prob, batch_count=24613.333333333332, ans=0.125 +2024-01-15 13:52:22,695 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=24646.666666666668, ans=0.0 +2024-01-15 13:52:23,554 INFO [train.py:994] (1/2) Epoch 9, batch 650, loss[loss=0.1907, simple_loss=0.2521, pruned_loss=0.06464, over 23540.00 frames. ], tot_loss[loss=0.224, simple_loss=0.2883, pruned_loss=0.07989, over 4605721.37 frames. ], batch size: 119, lr: 3.36e-02, grad_scale: 64.0 +2024-01-15 13:52:32,494 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.prob, batch_count=24646.666666666668, ans=0.125 +2024-01-15 13:52:43,603 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.958e+02 2.423e+02 2.648e+02 3.293e+02 5.337e+02, threshold=5.296e+02, percent-clipped=1.0 +2024-01-15 13:53:13,833 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=12.05 vs. limit=15.0 +2024-01-15 13:53:13,939 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=5.34 vs. limit=10.0 +2024-01-15 13:53:26,553 INFO [train.py:994] (1/2) Epoch 9, batch 700, loss[loss=0.2323, simple_loss=0.2891, pruned_loss=0.08772, over 24544.00 frames. ], tot_loss[loss=0.2235, simple_loss=0.2879, pruned_loss=0.07956, over 4648896.04 frames. 
], batch size: 176, lr: 3.35e-02, grad_scale: 64.0 +2024-01-15 13:53:32,930 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass_mid.scale_min, batch_count=24813.333333333332, ans=0.2 +2024-01-15 13:53:34,670 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=5.05 vs. limit=6.0 +2024-01-15 13:53:48,976 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward3.hidden_balancer.prob, batch_count=24846.666666666668, ans=0.125 +2024-01-15 13:54:09,445 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer1.prob, batch_count=24913.333333333332, ans=0.125 +2024-01-15 13:54:09,510 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 13:54:17,778 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=6.12 vs. limit=10.0 +2024-01-15 13:54:20,654 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=24946.666666666668, ans=0.0 +2024-01-15 13:54:28,937 INFO [train.py:994] (1/2) Epoch 9, batch 750, loss[loss=0.2133, simple_loss=0.2836, pruned_loss=0.07149, over 24483.00 frames. ], tot_loss[loss=0.2234, simple_loss=0.2879, pruned_loss=0.07943, over 4687005.93 frames. ], batch size: 165, lr: 3.34e-02, grad_scale: 32.0 +2024-01-15 13:54:37,513 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=5.31 vs. limit=6.0 +2024-01-15 13:54:42,515 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass_mid.scale_min, batch_count=25013.333333333332, ans=0.2 +2024-01-15 13:54:50,088 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=384, metric=22.74 vs. limit=22.5 +2024-01-15 13:54:50,484 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.978e+02 2.521e+02 2.840e+02 3.310e+02 7.138e+02, threshold=5.680e+02, percent-clipped=2.0 +2024-01-15 13:54:52,281 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=12.08 vs. limit=15.0 +2024-01-15 13:55:14,036 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.ff3_skip_rate, batch_count=25080.0, ans=0.005417391304347826 +2024-01-15 13:55:19,505 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=25113.333333333332, ans=0.1 +2024-01-15 13:55:29,293 INFO [train.py:994] (1/2) Epoch 9, batch 800, loss[loss=0.2192, simple_loss=0.2832, pruned_loss=0.07763, over 24314.00 frames. ], tot_loss[loss=0.2232, simple_loss=0.2878, pruned_loss=0.0793, over 4717376.19 frames. 
], batch size: 147, lr: 3.34e-02, grad_scale: 32.0 +2024-01-15 13:55:35,226 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 13:55:58,622 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=25213.333333333332, ans=0.125 +2024-01-15 13:56:08,969 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=3.30 vs. limit=10.0 +2024-01-15 13:56:41,990 INFO [train.py:994] (1/2) Epoch 10, batch 0, loss[loss=0.2371, simple_loss=0.2972, pruned_loss=0.08853, over 24569.00 frames. ], tot_loss[loss=0.2371, simple_loss=0.2972, pruned_loss=0.08853, over 24569.00 frames. ], batch size: 176, lr: 3.25e-02, grad_scale: 32.0 +2024-01-15 13:56:41,990 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 13:57:01,735 INFO [train.py:1026] (1/2) Epoch 10, validation: loss=0.1857, simple_loss=0.2729, pruned_loss=0.04922, over 1622729.00 frames. +2024-01-15 13:57:01,736 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 13:57:07,457 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer2.prob, batch_count=25290.0, ans=0.125 +2024-01-15 13:57:23,380 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass_mid.scale_min, batch_count=25323.333333333332, ans=0.2 +2024-01-15 13:57:32,562 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=5.68 vs. limit=6.0 +2024-01-15 13:57:33,273 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.980e+02 2.286e+02 2.610e+02 2.905e+02 4.355e+02, threshold=5.219e+02, percent-clipped=0.0 +2024-01-15 13:57:37,234 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.skip_rate, batch_count=25356.666666666668, ans=0.07 +2024-01-15 13:57:53,881 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.scale_min, batch_count=25423.333333333332, ans=0.2 +2024-01-15 13:58:05,149 INFO [train.py:994] (1/2) Epoch 10, batch 50, loss[loss=0.2244, simple_loss=0.2863, pruned_loss=0.08121, over 24397.00 frames. ], tot_loss[loss=0.22, simple_loss=0.2855, pruned_loss=0.07724, over 1080362.23 frames. ], batch size: 159, lr: 3.25e-02, grad_scale: 32.0 +2024-01-15 13:58:24,578 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_skip_rate, batch_count=25490.0, ans=0.0 +2024-01-15 13:58:29,120 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=25523.333333333332, ans=0.1 +2024-01-15 13:58:45,835 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.skip_rate, batch_count=25556.666666666668, ans=0.09899494936611666 +2024-01-15 13:58:49,678 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=384, metric=2.45 vs. limit=15.0 +2024-01-15 13:59:02,314 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=25590.0, ans=0.0 +2024-01-15 13:59:07,241 INFO [train.py:994] (1/2) Epoch 10, batch 100, loss[loss=0.2183, simple_loss=0.2882, pruned_loss=0.07417, over 24501.00 frames. 
], tot_loss[loss=0.2182, simple_loss=0.2834, pruned_loss=0.07657, over 1897769.82 frames. ], batch size: 187, lr: 3.24e-02, grad_scale: 32.0 +2024-01-15 13:59:12,834 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.skip_rate, batch_count=25623.333333333332, ans=0.07 +2024-01-15 13:59:37,399 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.922e+02 2.364e+02 2.776e+02 3.219e+02 6.201e+02, threshold=5.552e+02, percent-clipped=1.0 +2024-01-15 14:00:09,784 INFO [train.py:994] (1/2) Epoch 10, batch 150, loss[loss=0.2096, simple_loss=0.2765, pruned_loss=0.07135, over 24501.00 frames. ], tot_loss[loss=0.2192, simple_loss=0.2844, pruned_loss=0.07696, over 2547276.45 frames. ], batch size: 204, lr: 3.24e-02, grad_scale: 32.0 +2024-01-15 14:00:23,753 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=256, metric=19.61 vs. limit=22.5 +2024-01-15 14:00:52,769 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff2_skip_rate, batch_count=25890.0, ans=0.005241304347826087 +2024-01-15 14:01:11,346 INFO [train.py:994] (1/2) Epoch 10, batch 200, loss[loss=0.2214, simple_loss=0.289, pruned_loss=0.07693, over 24549.00 frames. ], tot_loss[loss=0.2196, simple_loss=0.2853, pruned_loss=0.07702, over 3062254.11 frames. ], batch size: 176, lr: 3.23e-02, grad_scale: 32.0 +2024-01-15 14:01:12,778 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=25956.666666666668, ans=0.1 +2024-01-15 14:01:23,996 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.whiten1.whitening_limit, batch_count=25990.0, ans=10.0 +2024-01-15 14:01:29,686 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=25990.0, ans=0.1 +2024-01-15 14:01:33,584 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=9.74 vs. limit=15.0 +2024-01-15 14:01:34,543 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 14:01:39,298 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=26023.333333333332, ans=0.125 +2024-01-15 14:01:40,383 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=26023.333333333332, ans=0.125 +2024-01-15 14:01:41,183 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.782e+02 2.154e+02 2.425e+02 2.905e+02 5.459e+02, threshold=4.850e+02, percent-clipped=0.0 +2024-01-15 14:01:43,397 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer1.prob, batch_count=26023.333333333332, ans=0.125 +2024-01-15 14:01:44,659 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.prob, batch_count=26023.333333333332, ans=0.125 +2024-01-15 14:02:02,653 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.prob, batch_count=26090.0, ans=0.125 +2024-01-15 14:02:13,354 INFO [train.py:994] (1/2) Epoch 10, batch 250, loss[loss=0.2092, simple_loss=0.273, pruned_loss=0.07267, over 24417.00 frames. 
], tot_loss[loss=0.219, simple_loss=0.2848, pruned_loss=0.07655, over 3456075.67 frames. ], batch size: 159, lr: 3.23e-02, grad_scale: 16.0 +2024-01-15 14:02:18,295 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=26123.333333333332, ans=0.1 +2024-01-15 14:02:23,091 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=26123.333333333332, ans=0.0 +2024-01-15 14:02:25,463 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=26156.666666666668, ans=0.125 +2024-01-15 14:02:35,562 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.scale_min, batch_count=26156.666666666668, ans=0.2 +2024-01-15 14:02:50,457 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=26223.333333333332, ans=0.0 +2024-01-15 14:03:06,848 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff2_skip_rate, batch_count=26256.666666666668, ans=0.005161594202898551 +2024-01-15 14:03:06,894 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.balancer2.prob, batch_count=26256.666666666668, ans=0.125 +2024-01-15 14:03:14,809 INFO [train.py:994] (1/2) Epoch 10, batch 300, loss[loss=0.2137, simple_loss=0.2814, pruned_loss=0.07299, over 24314.00 frames. ], tot_loss[loss=0.2185, simple_loss=0.2844, pruned_loss=0.07625, over 3755654.11 frames. ], batch size: 147, lr: 3.22e-02, grad_scale: 16.0 +2024-01-15 14:03:21,092 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=11.98 vs. limit=15.0 +2024-01-15 14:03:44,468 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=26356.666666666668, ans=0.0 +2024-01-15 14:03:45,319 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.872e+02 2.343e+02 2.638e+02 3.055e+02 5.073e+02, threshold=5.275e+02, percent-clipped=1.0 +2024-01-15 14:03:57,123 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.hidden_balancer.prob, batch_count=26390.0, ans=0.125 +2024-01-15 14:04:00,101 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.whiten, num_groups=1, num_channels=384, metric=6.15 vs. limit=12.0 +2024-01-15 14:04:16,076 INFO [train.py:994] (1/2) Epoch 10, batch 350, loss[loss=0.2174, simple_loss=0.2838, pruned_loss=0.07551, over 24591.00 frames. ], tot_loss[loss=0.2186, simple_loss=0.2845, pruned_loss=0.07638, over 3994292.76 frames. ], batch size: 199, lr: 3.22e-02, grad_scale: 16.0 +2024-01-15 14:04:20,082 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.balancer.prob, batch_count=26456.666666666668, ans=0.125 +2024-01-15 14:04:38,005 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff3_skip_rate, batch_count=26490.0, ans=0.005110869565217392 +2024-01-15 14:04:57,599 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=22.48 vs. limit=22.5 +2024-01-15 14:05:19,576 INFO [train.py:994] (1/2) Epoch 10, batch 400, loss[loss=0.2211, simple_loss=0.2883, pruned_loss=0.077, over 24568.00 frames. 
], tot_loss[loss=0.2189, simple_loss=0.2846, pruned_loss=0.07658, over 4177806.48 frames. ], batch size: 176, lr: 3.21e-02, grad_scale: 32.0 +2024-01-15 14:05:31,878 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer1.prob, batch_count=26656.666666666668, ans=0.125 +2024-01-15 14:05:42,037 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.prob, batch_count=26656.666666666668, ans=0.125 +2024-01-15 14:05:46,693 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.scale_min, batch_count=26690.0, ans=0.2 +2024-01-15 14:05:48,969 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.scale_min, batch_count=26690.0, ans=0.2 +2024-01-15 14:05:48,988 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.skip_rate, batch_count=26690.0, ans=0.07 +2024-01-15 14:05:53,442 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.894e+02 2.342e+02 2.602e+02 2.965e+02 5.895e+02, threshold=5.203e+02, percent-clipped=1.0 +2024-01-15 14:06:13,070 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer1.max_abs, batch_count=26756.666666666668, ans=10.0 +2024-01-15 14:06:16,495 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=26756.666666666668, ans=0.125 +2024-01-15 14:06:23,702 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 14:06:24,554 INFO [train.py:994] (1/2) Epoch 10, batch 450, loss[loss=0.232, simple_loss=0.2967, pruned_loss=0.08366, over 24535.00 frames. ], tot_loss[loss=0.2181, simple_loss=0.2837, pruned_loss=0.07628, over 4308568.27 frames. ], batch size: 193, lr: 3.20e-02, grad_scale: 32.0 +2024-01-15 14:06:30,872 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer1.prob, batch_count=26790.0, ans=0.125 +2024-01-15 14:06:35,635 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=26790.0, ans=0.125 +2024-01-15 14:06:43,350 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=19.72 vs. limit=22.5 +2024-01-15 14:07:01,788 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer2.prob, batch_count=26890.0, ans=0.125 +2024-01-15 14:07:08,786 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.skip_rate, batch_count=26890.0, ans=0.035 +2024-01-15 14:07:08,879 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module2.balancer1.prob, batch_count=26890.0, ans=0.125 +2024-01-15 14:07:19,057 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass_mid.scale_min, batch_count=26923.333333333332, ans=0.2 +2024-01-15 14:07:27,544 INFO [train.py:994] (1/2) Epoch 10, batch 500, loss[loss=0.2261, simple_loss=0.2925, pruned_loss=0.07992, over 24444.00 frames. ], tot_loss[loss=0.218, simple_loss=0.2839, pruned_loss=0.07604, over 4426340.51 frames. 
], batch size: 170, lr: 3.20e-02, grad_scale: 16.0 +2024-01-15 14:07:31,725 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=5.07 vs. limit=6.0 +2024-01-15 14:07:46,232 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff2_skip_rate, batch_count=26990.0, ans=0.005002173913043478 +2024-01-15 14:07:47,282 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=26990.0, ans=0.125 +2024-01-15 14:07:49,658 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=26990.0, ans=0.0 +2024-01-15 14:07:58,749 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.904e+02 2.338e+02 2.675e+02 3.125e+02 5.410e+02, threshold=5.349e+02, percent-clipped=1.0 +2024-01-15 14:08:06,031 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.8.prob, batch_count=27056.666666666668, ans=0.125 +2024-01-15 14:08:08,487 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=27056.666666666668, ans=0.1 +2024-01-15 14:08:16,359 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass_mid.scale_min, batch_count=27090.0, ans=0.2 +2024-01-15 14:08:28,012 INFO [train.py:994] (1/2) Epoch 10, batch 550, loss[loss=0.2129, simple_loss=0.2791, pruned_loss=0.07335, over 24226.00 frames. ], tot_loss[loss=0.2175, simple_loss=0.2834, pruned_loss=0.07582, over 4513531.15 frames. ], batch size: 311, lr: 3.19e-02, grad_scale: 16.0 +2024-01-15 14:08:32,963 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=27123.333333333332, ans=0.0 +2024-01-15 14:08:36,175 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=27123.333333333332, ans=0.125 +2024-01-15 14:08:37,217 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.skip_rate, batch_count=27123.333333333332, ans=0.07 +2024-01-15 14:08:49,178 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.skip_rate, batch_count=27156.666666666668, ans=0.04949747468305833 +2024-01-15 14:09:00,371 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=27190.0, ans=0.1 +2024-01-15 14:09:09,267 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=27223.333333333332, ans=0.125 +2024-01-15 14:09:27,386 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer2.prob, batch_count=27256.666666666668, ans=0.125 +2024-01-15 14:09:27,760 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.whiten, num_groups=1, num_channels=512, metric=3.98 vs. limit=12.0 +2024-01-15 14:09:30,634 INFO [train.py:994] (1/2) Epoch 10, batch 600, loss[loss=0.2335, simple_loss=0.2902, pruned_loss=0.08837, over 24598.00 frames. ], tot_loss[loss=0.2182, simple_loss=0.2845, pruned_loss=0.07592, over 4590092.55 frames. 
], batch size: 176, lr: 3.19e-02, grad_scale: 16.0 +2024-01-15 14:09:40,365 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.attention_skip_rate, batch_count=27290.0, ans=0.0 +2024-01-15 14:09:44,089 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer2.prob, batch_count=27323.333333333332, ans=0.125 +2024-01-15 14:09:47,956 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.skip_rate, batch_count=27323.333333333332, ans=0.035 +2024-01-15 14:09:51,420 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=384, metric=18.63 vs. limit=22.5 +2024-01-15 14:10:03,145 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.869e+02 2.330e+02 2.553e+02 2.956e+02 5.056e+02, threshold=5.106e+02, percent-clipped=0.0 +2024-01-15 14:10:15,914 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=27390.0, ans=0.1 +2024-01-15 14:10:32,632 INFO [train.py:994] (1/2) Epoch 10, batch 650, loss[loss=0.2293, simple_loss=0.2989, pruned_loss=0.07987, over 23870.00 frames. ], tot_loss[loss=0.2172, simple_loss=0.2838, pruned_loss=0.07529, over 4640933.86 frames. ], batch size: 328, lr: 3.18e-02, grad_scale: 16.0 +2024-01-15 14:10:33,351 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=512, metric=22.42 vs. limit=22.5 +2024-01-15 14:10:42,068 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.skip_rate, batch_count=27456.666666666668, ans=0.09899494936611666 +2024-01-15 14:10:43,247 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_skip_rate, batch_count=27456.666666666668, ans=0.0 +2024-01-15 14:10:58,172 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_skip_rate, batch_count=27523.333333333332, ans=0.0 +2024-01-15 14:11:11,958 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer1.prob, batch_count=27556.666666666668, ans=0.125 +2024-01-15 14:11:18,665 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.skip_rate, batch_count=27556.666666666668, ans=0.04949747468305833 +2024-01-15 14:11:29,487 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=6.95 vs. limit=15.0 +2024-01-15 14:11:35,420 INFO [train.py:994] (1/2) Epoch 10, batch 700, loss[loss=0.2247, simple_loss=0.2898, pruned_loss=0.07979, over 24504.00 frames. ], tot_loss[loss=0.2166, simple_loss=0.2832, pruned_loss=0.07502, over 4674500.88 frames. 
], batch size: 181, lr: 3.18e-02, grad_scale: 16.0 +2024-01-15 14:11:49,486 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=27656.666666666668, ans=0.1 +2024-01-15 14:12:04,409 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer2.prob, batch_count=27690.0, ans=0.125 +2024-01-15 14:12:08,169 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.856e+02 2.316e+02 2.694e+02 3.351e+02 5.251e+02, threshold=5.388e+02, percent-clipped=2.0 +2024-01-15 14:12:23,840 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.scale_min, batch_count=27756.666666666668, ans=0.2 +2024-01-15 14:12:34,690 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.attention_skip_rate, batch_count=27756.666666666668, ans=0.0 +2024-01-15 14:12:35,913 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=10.32 vs. limit=15.0 +2024-01-15 14:12:37,758 INFO [train.py:994] (1/2) Epoch 10, batch 750, loss[loss=0.2239, simple_loss=0.2952, pruned_loss=0.07631, over 24322.00 frames. ], tot_loss[loss=0.217, simple_loss=0.2835, pruned_loss=0.07527, over 4702048.15 frames. ], batch size: 285, lr: 3.17e-02, grad_scale: 16.0 +2024-01-15 14:12:40,455 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer_ff3.min_abs, batch_count=27790.0, ans=0.2 +2024-01-15 14:12:42,784 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.ff2_skip_rate, batch_count=27790.0, ans=0.004828260869565218 +2024-01-15 14:12:57,558 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff2_skip_rate, batch_count=27823.333333333332, ans=0.004821014492753624 +2024-01-15 14:13:37,418 INFO [train.py:994] (1/2) Epoch 10, batch 800, loss[loss=0.2306, simple_loss=0.2996, pruned_loss=0.08081, over 24484.00 frames. ], tot_loss[loss=0.2168, simple_loss=0.2833, pruned_loss=0.07511, over 4727375.20 frames. ], batch size: 229, lr: 3.17e-02, grad_scale: 32.0 +2024-01-15 14:14:04,997 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=384, metric=22.84 vs. limit=22.5 +2024-01-15 14:14:07,569 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.048e+02 2.323e+02 2.666e+02 3.032e+02 5.624e+02, threshold=5.332e+02, percent-clipped=1.0 +2024-01-15 14:14:51,275 INFO [train.py:994] (1/2) Epoch 11, batch 0, loss[loss=0.2293, simple_loss=0.2946, pruned_loss=0.08197, over 23874.00 frames. ], tot_loss[loss=0.2293, simple_loss=0.2946, pruned_loss=0.08197, over 23874.00 frames. ], batch size: 328, lr: 3.08e-02, grad_scale: 32.0 +2024-01-15 14:14:51,275 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 14:15:04,587 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.0406, 2.9866, 3.1680, 2.7271], device='cuda:1') +2024-01-15 14:15:12,002 INFO [train.py:1026] (1/2) Epoch 11, validation: loss=0.182, simple_loss=0.2695, pruned_loss=0.04721, over 1622729.00 frames. 
+2024-01-15 14:15:12,002 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 14:15:15,995 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.balancer1.prob, batch_count=28100.0, ans=0.125 +2024-01-15 14:15:38,810 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass_mid.scale_min, batch_count=28166.666666666668, ans=0.2 +2024-01-15 14:15:45,930 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=28166.666666666668, ans=0.1 +2024-01-15 14:15:47,074 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass_mid.scale_min, batch_count=28166.666666666668, ans=0.2 +2024-01-15 14:15:49,113 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer1.max_abs, batch_count=28200.0, ans=10.0 +2024-01-15 14:16:02,754 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=28233.333333333332, ans=0.125 +2024-01-15 14:16:14,605 INFO [train.py:994] (1/2) Epoch 11, batch 50, loss[loss=0.218, simple_loss=0.2809, pruned_loss=0.07757, over 24454.00 frames. ], tot_loss[loss=0.2116, simple_loss=0.2783, pruned_loss=0.07243, over 1092511.04 frames. ], batch size: 170, lr: 3.08e-02, grad_scale: 32.0 +2024-01-15 14:16:19,353 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=28266.666666666668, ans=0.125 +2024-01-15 14:16:21,694 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass_mid.scale_min, batch_count=28266.666666666668, ans=0.2 +2024-01-15 14:16:22,854 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.attention_skip_rate, batch_count=28266.666666666668, ans=0.0 +2024-01-15 14:16:26,583 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=28300.0, ans=0.0 +2024-01-15 14:16:54,953 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.836e+02 2.193e+02 2.497e+02 2.820e+02 5.572e+02, threshold=4.995e+02, percent-clipped=1.0 +2024-01-15 14:17:04,300 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=28400.0, ans=0.1 +2024-01-15 14:17:05,832 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=9.52 vs. limit=15.0 +2024-01-15 14:17:16,547 INFO [train.py:994] (1/2) Epoch 11, batch 100, loss[loss=0.2208, simple_loss=0.2816, pruned_loss=0.08003, over 24526.00 frames. ], tot_loss[loss=0.2114, simple_loss=0.2779, pruned_loss=0.07244, over 1911019.72 frames. ], batch size: 243, lr: 3.07e-02, grad_scale: 32.0 +2024-01-15 14:18:00,641 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=28533.333333333332, ans=0.1 +2024-01-15 14:18:03,052 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_skip_rate, batch_count=28533.333333333332, ans=0.0 +2024-01-15 14:18:18,822 INFO [train.py:994] (1/2) Epoch 11, batch 150, loss[loss=0.214, simple_loss=0.2834, pruned_loss=0.0723, over 24531.00 frames. ], tot_loss[loss=0.2118, simple_loss=0.2789, pruned_loss=0.07229, over 2559490.39 frames. 
], batch size: 236, lr: 3.07e-02, grad_scale: 32.0 +2024-01-15 14:18:21,432 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=28600.0, ans=0.0 +2024-01-15 14:18:39,265 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=28633.333333333332, ans=0.0 +2024-01-15 14:18:42,232 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=5.25 vs. limit=6.0 +2024-01-15 14:19:00,354 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.849e+02 2.381e+02 2.888e+02 3.320e+02 5.016e+02, threshold=5.776e+02, percent-clipped=3.0 +2024-01-15 14:19:03,109 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.attention_skip_rate, batch_count=28700.0, ans=0.0 +2024-01-15 14:19:03,650 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.65 vs. limit=6.0 +2024-01-15 14:19:12,182 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer_ff3.min_abs, batch_count=28733.333333333332, ans=0.2 +2024-01-15 14:19:15,605 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 14:19:16,732 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.min_abs, batch_count=28733.333333333332, ans=0.5 +2024-01-15 14:19:19,947 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=28733.333333333332, ans=0.1 +2024-01-15 14:19:22,146 INFO [train.py:994] (1/2) Epoch 11, batch 200, loss[loss=0.2069, simple_loss=0.2769, pruned_loss=0.06841, over 24487.00 frames. ], tot_loss[loss=0.2113, simple_loss=0.2784, pruned_loss=0.07206, over 3056314.22 frames. ], batch size: 216, lr: 3.06e-02, grad_scale: 32.0 +2024-01-15 14:19:41,161 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 14:19:55,461 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.min_positive, batch_count=28833.333333333332, ans=0.05 +2024-01-15 14:20:01,458 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer2.prob, batch_count=28866.666666666668, ans=0.125 +2024-01-15 14:20:04,016 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.scale_min, batch_count=28866.666666666668, ans=0.2 +2024-01-15 14:20:13,911 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=28900.0, ans=0.0 +2024-01-15 14:20:24,838 INFO [train.py:994] (1/2) Epoch 11, batch 250, loss[loss=0.2161, simple_loss=0.2838, pruned_loss=0.07426, over 24521.00 frames. ], tot_loss[loss=0.2115, simple_loss=0.2792, pruned_loss=0.07188, over 3443762.47 frames. 
], batch size: 236, lr: 3.06e-02, grad_scale: 32.0 +2024-01-15 14:20:38,024 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.skip_rate, batch_count=28966.666666666668, ans=0.09899494936611666 +2024-01-15 14:20:49,689 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=14.48 vs. limit=15.0 +2024-01-15 14:20:53,051 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer1.prob, batch_count=29000.0, ans=0.125 +2024-01-15 14:20:56,574 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.out_combiner.scale_min, batch_count=29000.0, ans=0.2 +2024-01-15 14:21:06,188 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.905e+02 2.235e+02 2.512e+02 2.878e+02 4.566e+02, threshold=5.025e+02, percent-clipped=0.0 +2024-01-15 14:21:07,028 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=20.87 vs. limit=22.5 +2024-01-15 14:21:10,134 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer2.prob, batch_count=29033.333333333332, ans=0.125 +2024-01-15 14:21:10,236 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_skip_rate, batch_count=29033.333333333332, ans=0.0 +2024-01-15 14:21:17,458 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=192, metric=11.05 vs. limit=15.0 +2024-01-15 14:21:27,743 INFO [train.py:994] (1/2) Epoch 11, batch 300, loss[loss=0.228, simple_loss=0.3001, pruned_loss=0.07795, over 23832.00 frames. ], tot_loss[loss=0.2117, simple_loss=0.2792, pruned_loss=0.07212, over 3746046.29 frames. ], batch size: 328, lr: 3.05e-02, grad_scale: 32.0 +2024-01-15 14:21:27,944 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer1.prob, batch_count=29100.0, ans=0.125 +2024-01-15 14:21:33,834 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer1.prob, batch_count=29100.0, ans=0.125 +2024-01-15 14:21:35,662 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=29100.0, ans=0.0 +2024-01-15 14:22:29,677 INFO [train.py:994] (1/2) Epoch 11, batch 350, loss[loss=0.2151, simple_loss=0.2757, pruned_loss=0.07721, over 24326.00 frames. ], tot_loss[loss=0.2116, simple_loss=0.2792, pruned_loss=0.07201, over 3977850.45 frames. ], batch size: 285, lr: 3.05e-02, grad_scale: 32.0 +2024-01-15 14:22:58,785 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer2.prob, batch_count=29333.333333333332, ans=0.125 +2024-01-15 14:23:07,226 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.hidden_balancer.prob, batch_count=29366.666666666668, ans=0.125 +2024-01-15 14:23:11,100 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.002e+02 2.304e+02 2.500e+02 2.961e+02 4.886e+02, threshold=5.000e+02, percent-clipped=0.0 +2024-01-15 14:23:11,809 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=5.35 vs. 
limit=15.0 +2024-01-15 14:23:13,177 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=10.18 vs. limit=15.0 +2024-01-15 14:23:13,815 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer1.prob, batch_count=29366.666666666668, ans=0.125 +2024-01-15 14:23:19,257 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=29400.0, ans=0.1 +2024-01-15 14:23:31,984 INFO [train.py:994] (1/2) Epoch 11, batch 400, loss[loss=0.2083, simple_loss=0.283, pruned_loss=0.06678, over 24502.00 frames. ], tot_loss[loss=0.2115, simple_loss=0.2787, pruned_loss=0.07212, over 4156399.56 frames. ], batch size: 181, lr: 3.04e-02, grad_scale: 32.0 +2024-01-15 14:23:42,688 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=4.35 vs. limit=15.0 +2024-01-15 14:23:47,287 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=10.55 vs. limit=15.0 +2024-01-15 14:24:07,343 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer_ff3.min_abs, batch_count=29500.0, ans=0.2 +2024-01-15 14:24:19,830 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=29533.333333333332, ans=0.0 +2024-01-15 14:24:34,443 INFO [train.py:994] (1/2) Epoch 11, batch 450, loss[loss=0.2267, simple_loss=0.2973, pruned_loss=0.07804, over 23870.00 frames. ], tot_loss[loss=0.211, simple_loss=0.2784, pruned_loss=0.07179, over 4297594.51 frames. ], batch size: 328, lr: 3.04e-02, grad_scale: 32.0 +2024-01-15 14:24:34,587 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.out_balancer.prob, batch_count=29600.0, ans=0.125 +2024-01-15 14:25:04,945 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.scale_min, batch_count=29666.666666666668, ans=0.2 +2024-01-15 14:25:15,314 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.891e+02 2.230e+02 2.451e+02 2.754e+02 3.898e+02, threshold=4.902e+02, percent-clipped=0.0 +2024-01-15 14:25:15,637 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward2.hidden_balancer.prob, batch_count=29700.0, ans=0.125 +2024-01-15 14:25:30,099 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.conv_module2.whiten, num_groups=1, num_channels=512, metric=8.14 vs. limit=15.0 +2024-01-15 14:25:36,652 INFO [train.py:994] (1/2) Epoch 11, batch 500, loss[loss=0.1921, simple_loss=0.2518, pruned_loss=0.06622, over 23674.00 frames. ], tot_loss[loss=0.211, simple_loss=0.2785, pruned_loss=0.07178, over 4413100.17 frames. ], batch size: 119, lr: 3.03e-02, grad_scale: 32.0 +2024-01-15 14:25:43,934 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer1.prob, batch_count=29766.666666666668, ans=0.125 +2024-01-15 14:25:56,114 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=29800.0, ans=0.1 +2024-01-15 14:26:19,912 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=21.57 vs. 
limit=22.5 +2024-01-15 14:26:21,987 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer2.prob, batch_count=29866.666666666668, ans=0.125 +2024-01-15 14:26:38,824 INFO [train.py:994] (1/2) Epoch 11, batch 550, loss[loss=0.2096, simple_loss=0.278, pruned_loss=0.07058, over 24488.00 frames. ], tot_loss[loss=0.2104, simple_loss=0.2781, pruned_loss=0.07135, over 4495552.73 frames. ], batch size: 216, lr: 3.03e-02, grad_scale: 32.0 +2024-01-15 14:27:00,880 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.prob, batch_count=29966.666666666668, ans=0.125 +2024-01-15 14:27:04,704 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass_mid.scale_min, batch_count=30000.0, ans=0.2 +2024-01-15 14:27:17,621 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.attention_skip_rate, batch_count=30033.333333333332, ans=0.0 +2024-01-15 14:27:18,936 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=30033.333333333332, ans=0.125 +2024-01-15 14:27:19,730 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.915e+02 2.557e+02 2.916e+02 3.459e+02 5.419e+02, threshold=5.831e+02, percent-clipped=1.0 +2024-01-15 14:27:22,294 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer1.prob, batch_count=30033.333333333332, ans=0.125 +2024-01-15 14:27:23,641 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=30033.333333333332, ans=0.1 +2024-01-15 14:27:28,302 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer2.prob, batch_count=30066.666666666668, ans=0.125 +2024-01-15 14:27:41,194 INFO [train.py:994] (1/2) Epoch 11, batch 600, loss[loss=0.2259, simple_loss=0.2913, pruned_loss=0.08023, over 24522.00 frames. ], tot_loss[loss=0.2105, simple_loss=0.278, pruned_loss=0.07151, over 4545033.93 frames. ], batch size: 165, lr: 3.02e-02, grad_scale: 32.0 +2024-01-15 14:28:04,489 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.whiten.whitening_limit, batch_count=30133.333333333332, ans=15.0 +2024-01-15 14:28:41,945 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.min_positive, batch_count=30266.666666666668, ans=0.025 +2024-01-15 14:28:42,823 INFO [train.py:994] (1/2) Epoch 11, batch 650, loss[loss=0.2193, simple_loss=0.287, pruned_loss=0.07581, over 24473.00 frames. ], tot_loss[loss=0.2099, simple_loss=0.2778, pruned_loss=0.07101, over 4610370.03 frames. ], batch size: 216, lr: 3.02e-02, grad_scale: 32.0 +2024-01-15 14:28:47,946 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.attention_skip_rate, batch_count=30266.666666666668, ans=0.0 +2024-01-15 14:28:48,489 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=18.76 vs. limit=15.0 +2024-01-15 14:29:02,140 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=5.50 vs. 
limit=6.0 +2024-01-15 14:29:06,314 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer2.prob, batch_count=30300.0, ans=0.125 +2024-01-15 14:29:07,474 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass.skip_rate, batch_count=30333.333333333332, ans=0.04949747468305833 +2024-01-15 14:29:24,992 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.788e+02 2.341e+02 2.706e+02 3.080e+02 4.003e+02, threshold=5.412e+02, percent-clipped=0.0 +2024-01-15 14:29:31,335 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.self_attn1.whiten.whitening_limit, batch_count=30366.666666666668, ans=22.5 +2024-01-15 14:29:33,533 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer1.max_abs, batch_count=30400.0, ans=10.0 +2024-01-15 14:29:33,545 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=30400.0, ans=0.125 +2024-01-15 14:29:34,067 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=13.15 vs. limit=15.0 +2024-01-15 14:29:46,606 INFO [train.py:994] (1/2) Epoch 11, batch 700, loss[loss=0.2209, simple_loss=0.2896, pruned_loss=0.07606, over 24487.00 frames. ], tot_loss[loss=0.2099, simple_loss=0.278, pruned_loss=0.07094, over 4658422.58 frames. ], batch size: 181, lr: 3.01e-02, grad_scale: 32.0 +2024-01-15 14:29:52,348 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=192, metric=6.63 vs. limit=15.0 +2024-01-15 14:29:56,827 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=3.34 vs. limit=10.0 +2024-01-15 14:29:59,004 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=30466.666666666668, ans=0.125 +2024-01-15 14:30:32,557 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=8.36 vs. limit=10.0 +2024-01-15 14:30:37,578 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer2.min_abs, batch_count=30566.666666666668, ans=0.5 +2024-01-15 14:30:42,298 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward2.hidden_balancer.prob, batch_count=30566.666666666668, ans=0.125 +2024-01-15 14:30:48,033 INFO [train.py:994] (1/2) Epoch 11, batch 750, loss[loss=0.2056, simple_loss=0.2777, pruned_loss=0.06676, over 24516.00 frames. ], tot_loss[loss=0.2096, simple_loss=0.2778, pruned_loss=0.07067, over 4699081.17 frames. 
], batch size: 236, lr: 3.01e-02, grad_scale: 32.0 +2024-01-15 14:30:52,791 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.5.prob, batch_count=30600.0, ans=0.125 +2024-01-15 14:31:06,209 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=30633.333333333332, ans=0.0 +2024-01-15 14:31:28,165 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.837e+02 2.345e+02 2.684e+02 3.156e+02 5.957e+02, threshold=5.367e+02, percent-clipped=1.0 +2024-01-15 14:31:43,225 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer2.prob, batch_count=30733.333333333332, ans=0.125 +2024-01-15 14:31:47,490 INFO [train.py:994] (1/2) Epoch 11, batch 800, loss[loss=0.2085, simple_loss=0.2788, pruned_loss=0.0691, over 24519.00 frames. ], tot_loss[loss=0.2086, simple_loss=0.277, pruned_loss=0.07011, over 4722759.69 frames. ], batch size: 243, lr: 3.00e-02, grad_scale: 32.0 +2024-01-15 14:31:54,812 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=12.09 vs. limit=15.0 +2024-01-15 14:31:56,750 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer2.prob, batch_count=30766.666666666668, ans=0.125 +2024-01-15 14:32:03,526 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=30800.0, ans=0.125 +2024-01-15 14:32:05,893 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer2.min_positive, batch_count=30800.0, ans=0.05 +2024-01-15 14:32:20,299 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer2.prob, batch_count=30833.333333333332, ans=0.125 +2024-01-15 14:32:22,566 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.scale_min, batch_count=30866.666666666668, ans=0.2 +2024-01-15 14:32:27,096 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=30866.666666666668, ans=0.1 +2024-01-15 14:32:31,765 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=512, metric=2.24 vs. limit=15.0 +2024-01-15 14:32:33,522 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer2.prob, batch_count=30900.0, ans=0.125 +2024-01-15 14:33:00,628 INFO [train.py:994] (1/2) Epoch 12, batch 0, loss[loss=0.1761, simple_loss=0.2427, pruned_loss=0.05477, over 23408.00 frames. ], tot_loss[loss=0.1761, simple_loss=0.2427, pruned_loss=0.05477, over 23408.00 frames. ], batch size: 119, lr: 2.93e-02, grad_scale: 32.0 +2024-01-15 14:33:00,629 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 14:33:14,077 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.3.encoder.layers.3.self_attn_weights, attn_weights_entropy = tensor([1.5269, 1.6770, 2.1494, 2.3337, 2.1700, 2.3997, 2.2407, 2.3384], + device='cuda:1') +2024-01-15 14:33:20,522 INFO [train.py:1026] (1/2) Epoch 12, validation: loss=0.1784, simple_loss=0.2662, pruned_loss=0.04527, over 1622729.00 frames. 
+2024-01-15 14:33:20,523 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 14:33:36,494 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff3_skip_rate, batch_count=30943.333333333332, ans=0.004142753623188406 +2024-01-15 14:33:42,437 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.5.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 14:34:07,596 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=512, metric=21.76 vs. limit=22.5 +2024-01-15 14:34:10,508 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.934e+02 2.453e+02 2.794e+02 3.264e+02 4.680e+02, threshold=5.589e+02, percent-clipped=0.0 +2024-01-15 14:34:23,794 INFO [train.py:994] (1/2) Epoch 12, batch 50, loss[loss=0.2057, simple_loss=0.2766, pruned_loss=0.06734, over 24506.00 frames. ], tot_loss[loss=0.205, simple_loss=0.2738, pruned_loss=0.0681, over 1088725.22 frames. ], batch size: 187, lr: 2.92e-02, grad_scale: 32.0 +2024-01-15 14:34:24,105 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.prob, batch_count=31076.666666666668, ans=0.125 +2024-01-15 14:34:30,128 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.scale_min, batch_count=31076.666666666668, ans=0.2 +2024-01-15 14:34:32,429 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass_mid.scale_min, batch_count=31076.666666666668, ans=0.2 +2024-01-15 14:34:35,802 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer1.max_abs, batch_count=31110.0, ans=10.0 +2024-01-15 14:35:00,705 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer2.prob, batch_count=31176.666666666668, ans=0.125 +2024-01-15 14:35:02,597 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_skip_rate, batch_count=31176.666666666668, ans=0.0 +2024-01-15 14:35:22,156 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer2.min_positive, batch_count=31210.0, ans=0.05 +2024-01-15 14:35:25,477 INFO [train.py:994] (1/2) Epoch 12, batch 100, loss[loss=0.2115, simple_loss=0.2824, pruned_loss=0.07026, over 24475.00 frames. ], tot_loss[loss=0.2062, simple_loss=0.2749, pruned_loss=0.06875, over 1924898.91 frames. ], batch size: 181, lr: 2.92e-02, grad_scale: 32.0 +2024-01-15 14:35:26,038 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.16 vs. 
limit=6.0 +2024-01-15 14:35:31,187 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer2.min_abs, batch_count=31243.333333333332, ans=0.5 +2024-01-15 14:35:39,035 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.bypass.skip_rate, batch_count=31276.666666666668, ans=0.07 +2024-01-15 14:35:49,788 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff2_skip_rate, batch_count=31310.0, ans=0.004063043478260869 +2024-01-15 14:36:15,780 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.841e+02 2.234e+02 2.534e+02 2.829e+02 3.709e+02, threshold=5.069e+02, percent-clipped=0.0 +2024-01-15 14:36:19,409 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=4.33 vs. limit=10.0 +2024-01-15 14:36:21,272 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=31376.666666666668, ans=0.0 +2024-01-15 14:36:28,119 INFO [train.py:994] (1/2) Epoch 12, batch 150, loss[loss=0.2072, simple_loss=0.2793, pruned_loss=0.06754, over 24433.00 frames. ], tot_loss[loss=0.2063, simple_loss=0.2753, pruned_loss=0.0686, over 2570682.37 frames. ], batch size: 159, lr: 2.91e-02, grad_scale: 16.0 +2024-01-15 14:36:42,349 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=512, metric=21.96 vs. limit=22.5 +2024-01-15 14:37:13,368 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=31510.0, ans=0.0 +2024-01-15 14:37:30,126 INFO [train.py:994] (1/2) Epoch 12, batch 200, loss[loss=0.2137, simple_loss=0.2786, pruned_loss=0.07441, over 24402.00 frames. ], tot_loss[loss=0.2064, simple_loss=0.2754, pruned_loss=0.06866, over 3066292.53 frames. ], batch size: 159, lr: 2.91e-02, grad_scale: 16.0 +2024-01-15 14:37:49,267 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=18.01 vs. limit=15.0 +2024-01-15 14:37:50,951 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.attention_skip_rate, batch_count=31610.0, ans=0.0 +2024-01-15 14:37:54,789 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=11.71 vs. limit=15.0 +2024-01-15 14:38:21,334 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.969e+02 2.375e+02 2.614e+02 3.067e+02 4.543e+02, threshold=5.228e+02, percent-clipped=0.0 +2024-01-15 14:38:32,480 INFO [train.py:994] (1/2) Epoch 12, batch 250, loss[loss=0.2027, simple_loss=0.2722, pruned_loss=0.0666, over 24347.00 frames. ], tot_loss[loss=0.2062, simple_loss=0.2748, pruned_loss=0.06881, over 3441507.59 frames. 
], batch size: 153, lr: 2.90e-02, grad_scale: 16.0 +2024-01-15 14:38:41,755 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=31743.333333333332, ans=0.125 +2024-01-15 14:38:41,843 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module2.balancer1.prob, batch_count=31743.333333333332, ans=0.125 +2024-01-15 14:39:14,961 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer2.min_positive, batch_count=31843.333333333332, ans=0.05 +2024-01-15 14:39:27,844 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=5.56 vs. limit=6.0 +2024-01-15 14:39:33,734 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.3.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 14:39:34,607 INFO [train.py:994] (1/2) Epoch 12, batch 300, loss[loss=0.2022, simple_loss=0.2733, pruned_loss=0.06556, over 24526.00 frames. ], tot_loss[loss=0.206, simple_loss=0.2749, pruned_loss=0.06852, over 3753852.33 frames. ], batch size: 165, lr: 2.90e-02, grad_scale: 16.0 +2024-01-15 14:39:39,632 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=31910.0, ans=0.1 +2024-01-15 14:39:54,366 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer2.prob, batch_count=31943.333333333332, ans=0.125 +2024-01-15 14:40:25,017 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.837e+02 2.325e+02 2.638e+02 3.044e+02 6.193e+02, threshold=5.276e+02, percent-clipped=1.0 +2024-01-15 14:40:35,492 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=32076.666666666668, ans=0.125 +2024-01-15 14:40:36,427 INFO [train.py:994] (1/2) Epoch 12, batch 350, loss[loss=0.2011, simple_loss=0.2759, pruned_loss=0.06314, over 24424.00 frames. ], tot_loss[loss=0.2058, simple_loss=0.2746, pruned_loss=0.06851, over 3984225.79 frames. ], batch size: 250, lr: 2.89e-02, grad_scale: 16.0 +2024-01-15 14:40:49,611 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn2.whiten, num_groups=1, num_channels=192, metric=16.18 vs. limit=22.5 +2024-01-15 14:40:53,760 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=384, metric=4.21 vs. limit=15.0 +2024-01-15 14:41:08,526 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=5.50 vs. limit=6.0 +2024-01-15 14:41:38,211 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=32243.333333333332, ans=0.125 +2024-01-15 14:41:38,671 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=384, metric=20.88 vs. limit=22.5 +2024-01-15 14:41:39,094 INFO [train.py:994] (1/2) Epoch 12, batch 400, loss[loss=0.2141, simple_loss=0.2873, pruned_loss=0.07047, over 24313.00 frames. ], tot_loss[loss=0.2052, simple_loss=0.2744, pruned_loss=0.06802, over 4182591.15 frames. 
], batch size: 298, lr: 2.89e-02, grad_scale: 32.0 +2024-01-15 14:42:30,261 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.805e+02 2.236e+02 2.460e+02 2.707e+02 3.564e+02, threshold=4.920e+02, percent-clipped=0.0 +2024-01-15 14:42:41,825 INFO [train.py:994] (1/2) Epoch 12, batch 450, loss[loss=0.1893, simple_loss=0.2379, pruned_loss=0.07035, over 18952.00 frames. ], tot_loss[loss=0.2048, simple_loss=0.2738, pruned_loss=0.06785, over 4320372.87 frames. ], batch size: 81, lr: 2.88e-02, grad_scale: 32.0 +2024-01-15 14:42:43,326 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=32410.0, ans=0.1 +2024-01-15 14:43:02,741 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=3.61 vs. limit=15.0 +2024-01-15 14:43:08,442 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.whiten, num_groups=1, num_channels=384, metric=6.33 vs. limit=12.0 +2024-01-15 14:43:19,636 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.78 vs. limit=6.0 +2024-01-15 14:43:21,602 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.hidden_balancer.prob, batch_count=32510.0, ans=0.125 +2024-01-15 14:43:43,899 INFO [train.py:994] (1/2) Epoch 12, batch 500, loss[loss=0.2159, simple_loss=0.2855, pruned_loss=0.07316, over 24468.00 frames. ], tot_loss[loss=0.2049, simple_loss=0.2742, pruned_loss=0.06776, over 4434186.43 frames. ], batch size: 222, lr: 2.88e-02, grad_scale: 32.0 +2024-01-15 14:44:10,165 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.attention_skip_rate, batch_count=32643.333333333332, ans=0.0 +2024-01-15 14:44:11,275 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer2.prob, batch_count=32643.333333333332, ans=0.125 +2024-01-15 14:44:34,706 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.859e+02 2.314e+02 2.670e+02 3.200e+02 5.574e+02, threshold=5.340e+02, percent-clipped=2.0 +2024-01-15 14:44:45,425 INFO [train.py:994] (1/2) Epoch 12, batch 550, loss[loss=0.1983, simple_loss=0.2649, pruned_loss=0.06579, over 24411.00 frames. ], tot_loss[loss=0.2052, simple_loss=0.2746, pruned_loss=0.06794, over 4529417.24 frames. ], batch size: 258, lr: 2.88e-02, grad_scale: 32.0 +2024-01-15 14:45:05,791 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=32776.666666666664, ans=0.0 +2024-01-15 14:45:14,110 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=32810.0, ans=0.125 +2024-01-15 14:45:24,846 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=32843.333333333336, ans=0.125 +2024-01-15 14:45:30,894 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer1.prob, batch_count=32843.333333333336, ans=0.125 +2024-01-15 14:45:48,557 INFO [train.py:994] (1/2) Epoch 12, batch 600, loss[loss=0.2216, simple_loss=0.2872, pruned_loss=0.078, over 24520.00 frames. ], tot_loss[loss=0.2054, simple_loss=0.2744, pruned_loss=0.06815, over 4575645.75 frames. 
], batch size: 187, lr: 2.87e-02, grad_scale: 32.0 +2024-01-15 14:45:55,992 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=11.71 vs. limit=15.0 +2024-01-15 14:46:20,557 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=32976.666666666664, ans=0.1 +2024-01-15 14:46:20,645 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.scale_min, batch_count=32976.666666666664, ans=0.2 +2024-01-15 14:46:40,524 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.867e+02 2.274e+02 2.530e+02 3.113e+02 5.260e+02, threshold=5.061e+02, percent-clipped=0.0 +2024-01-15 14:46:45,682 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=33043.333333333336, ans=0.125 +2024-01-15 14:46:50,233 INFO [train.py:994] (1/2) Epoch 12, batch 650, loss[loss=0.2191, simple_loss=0.2865, pruned_loss=0.07582, over 24216.00 frames. ], tot_loss[loss=0.2047, simple_loss=0.274, pruned_loss=0.06765, over 4641473.22 frames. ], batch size: 311, lr: 2.87e-02, grad_scale: 16.0 +2024-01-15 14:47:03,523 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=10.42 vs. limit=15.0 +2024-01-15 14:47:10,251 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer1.prob, batch_count=33110.0, ans=0.125 +2024-01-15 14:47:18,563 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer2.prob, batch_count=33143.333333333336, ans=0.125 +2024-01-15 14:47:25,135 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=33143.333333333336, ans=0.125 +2024-01-15 14:47:29,939 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer1.prob, batch_count=33176.666666666664, ans=0.125 +2024-01-15 14:47:32,258 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.balancer.min_positive, batch_count=33176.666666666664, ans=0.05 +2024-01-15 14:47:37,030 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff3_skip_rate, batch_count=33176.666666666664, ans=0.0036572463768115944 +2024-01-15 14:47:53,444 INFO [train.py:994] (1/2) Epoch 12, batch 700, loss[loss=0.2167, simple_loss=0.2853, pruned_loss=0.07404, over 24457.00 frames. ], tot_loss[loss=0.2048, simple_loss=0.2744, pruned_loss=0.06757, over 4680955.73 frames. 
], batch size: 216, lr: 2.86e-02, grad_scale: 16.0 +2024-01-15 14:47:59,725 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.ff2_skip_rate, batch_count=33243.333333333336, ans=0.0036427536231884057 +2024-01-15 14:48:07,845 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward3.hidden_balancer.prob, batch_count=33276.666666666664, ans=0.125 +2024-01-15 14:48:16,663 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.skip_rate, batch_count=33310.0, ans=0.07 +2024-01-15 14:48:25,787 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.attention_skip_rate, batch_count=33310.0, ans=0.0 +2024-01-15 14:48:29,596 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.skip_rate, batch_count=33343.333333333336, ans=0.09899494936611666 +2024-01-15 14:48:31,900 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=33343.333333333336, ans=0.125 +2024-01-15 14:48:45,948 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.833e+02 2.288e+02 2.551e+02 2.862e+02 5.398e+02, threshold=5.102e+02, percent-clipped=1.0 +2024-01-15 14:48:50,935 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward3.hidden_balancer.prob, batch_count=33376.666666666664, ans=0.125 +2024-01-15 14:48:55,408 INFO [train.py:994] (1/2) Epoch 12, batch 750, loss[loss=0.2085, simple_loss=0.2765, pruned_loss=0.07026, over 24469.00 frames. ], tot_loss[loss=0.2044, simple_loss=0.2741, pruned_loss=0.0674, over 4711247.79 frames. ], batch size: 170, lr: 2.86e-02, grad_scale: 16.0 +2024-01-15 14:48:58,039 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.prob, batch_count=33410.0, ans=0.125 +2024-01-15 14:49:09,970 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.skip_rate, batch_count=33443.333333333336, ans=0.035 +2024-01-15 14:49:30,434 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=5.09 vs. limit=15.0 +2024-01-15 14:49:30,996 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=33510.0, ans=0.0 +2024-01-15 14:49:35,008 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer2.min_abs, batch_count=33510.0, ans=0.5 +2024-01-15 14:49:55,657 INFO [train.py:994] (1/2) Epoch 12, batch 800, loss[loss=0.2183, simple_loss=0.2874, pruned_loss=0.07466, over 24492.00 frames. ], tot_loss[loss=0.2041, simple_loss=0.2734, pruned_loss=0.06737, over 4728667.80 frames. 
], batch size: 210, lr: 2.85e-02, grad_scale: 32.0 +2024-01-15 14:50:00,363 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=33576.666666666664, ans=0.125 +2024-01-15 14:50:06,168 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=33610.0, ans=0.1 +2024-01-15 14:50:08,410 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.out_proj.dropout_p, batch_count=33610.0, ans=0.1 +2024-01-15 14:50:15,545 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=33610.0, ans=0.125 +2024-01-15 14:50:43,707 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.987e+02 2.226e+02 2.599e+02 3.021e+02 8.149e+02, threshold=5.199e+02, percent-clipped=1.0 +2024-01-15 14:51:08,434 INFO [train.py:994] (1/2) Epoch 13, batch 0, loss[loss=0.1948, simple_loss=0.2631, pruned_loss=0.06331, over 24436.00 frames. ], tot_loss[loss=0.1948, simple_loss=0.2631, pruned_loss=0.06331, over 24436.00 frames. ], batch size: 250, lr: 2.78e-02, grad_scale: 32.0 +2024-01-15 14:51:08,435 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 14:51:28,554 INFO [train.py:1026] (1/2) Epoch 13, validation: loss=0.1772, simple_loss=0.2649, pruned_loss=0.04477, over 1622729.00 frames. +2024-01-15 14:51:28,555 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 14:51:28,866 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer2.min_positive, batch_count=33720.0, ans=0.05 +2024-01-15 14:51:32,417 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=33720.0, ans=0.125 +2024-01-15 14:51:35,818 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.whiten, num_groups=1, num_channels=384, metric=2.96 vs. limit=12.0 +2024-01-15 14:51:42,884 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=33753.333333333336, ans=0.125 +2024-01-15 14:51:51,456 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer2.prob, batch_count=33753.333333333336, ans=0.125 +2024-01-15 14:52:13,514 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer2.prob, batch_count=33820.0, ans=0.125 +2024-01-15 14:52:19,330 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.64 vs. limit=6.0 +2024-01-15 14:52:25,509 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=14.16 vs. limit=15.0 +2024-01-15 14:52:32,579 INFO [train.py:994] (1/2) Epoch 13, batch 50, loss[loss=0.1891, simple_loss=0.2593, pruned_loss=0.05943, over 24475.00 frames. ], tot_loss[loss=0.1995, simple_loss=0.2684, pruned_loss=0.06527, over 1081551.92 frames. 
], batch size: 222, lr: 2.77e-02, grad_scale: 32.0 +2024-01-15 14:52:35,188 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff3_skip_rate, batch_count=33886.666666666664, ans=0.0035028985507246377 +2024-01-15 14:53:01,902 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=384, metric=17.46 vs. limit=22.5 +2024-01-15 14:53:04,323 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.conv_module1.whiten, num_groups=1, num_channels=512, metric=5.86 vs. limit=15.0 +2024-01-15 14:53:24,960 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.whiten, num_groups=1, num_channels=192, metric=4.29 vs. limit=12.0 +2024-01-15 14:53:25,592 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff2_skip_rate, batch_count=34020.0, ans=0.0034739130434782613 +2024-01-15 14:53:31,417 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=34020.0, ans=0.1 +2024-01-15 14:53:34,166 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.857e+02 2.225e+02 2.592e+02 3.041e+02 4.021e+02, threshold=5.184e+02, percent-clipped=0.0 +2024-01-15 14:53:35,377 INFO [train.py:994] (1/2) Epoch 13, batch 100, loss[loss=0.1906, simple_loss=0.2655, pruned_loss=0.05782, over 24378.00 frames. ], tot_loss[loss=0.2003, simple_loss=0.2696, pruned_loss=0.06551, over 1900127.49 frames. ], batch size: 275, lr: 2.77e-02, grad_scale: 32.0 +2024-01-15 14:53:39,198 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.balancer.max_positive, batch_count=34053.333333333336, ans=0.95 +2024-01-15 14:53:47,552 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_skip_rate, batch_count=34086.666666666664, ans=0.0 +2024-01-15 14:54:00,907 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=34120.0, ans=0.125 +2024-01-15 14:54:15,597 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=34153.333333333336, ans=0.0 +2024-01-15 14:54:21,488 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=34153.333333333336, ans=0.1 +2024-01-15 14:54:22,702 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.skip_rate, batch_count=34153.333333333336, ans=0.09899494936611666 +2024-01-15 14:54:33,028 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer2.prob, batch_count=34186.666666666664, ans=0.125 +2024-01-15 14:54:36,429 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer2.prob, batch_count=34220.0, ans=0.125 +2024-01-15 14:54:37,351 INFO [train.py:994] (1/2) Epoch 13, batch 150, loss[loss=0.1973, simple_loss=0.2675, pruned_loss=0.06352, over 24506.00 frames. ], tot_loss[loss=0.2001, simple_loss=0.2695, pruned_loss=0.06528, over 2546215.69 frames. 
], batch size: 187, lr: 2.77e-02, grad_scale: 32.0 +2024-01-15 14:54:37,715 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_skip_rate, batch_count=34220.0, ans=0.0 +2024-01-15 14:54:40,814 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=384, metric=2.23 vs. limit=15.0 +2024-01-15 14:54:41,756 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.skip_rate, batch_count=34220.0, ans=0.07 +2024-01-15 14:54:44,158 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=34220.0, ans=0.125 +2024-01-15 14:54:50,438 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=11.99 vs. limit=15.0 +2024-01-15 14:55:14,785 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_whiten.whitening_limit, batch_count=34320.0, ans=15.0 +2024-01-15 14:55:39,356 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.689e+02 2.355e+02 2.666e+02 3.154e+02 5.036e+02, threshold=5.331e+02, percent-clipped=0.0 +2024-01-15 14:55:40,576 INFO [train.py:994] (1/2) Epoch 13, batch 200, loss[loss=0.198, simple_loss=0.2726, pruned_loss=0.06173, over 24398.00 frames. ], tot_loss[loss=0.2004, simple_loss=0.2703, pruned_loss=0.06523, over 3044211.43 frames. ], batch size: 159, lr: 2.76e-02, grad_scale: 32.0 +2024-01-15 14:55:55,572 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.attention_skip_rate, batch_count=34420.0, ans=0.0 +2024-01-15 14:55:56,890 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass_mid.scale_min, batch_count=34420.0, ans=0.2 +2024-01-15 14:56:03,347 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=34420.0, ans=0.125 +2024-01-15 14:56:12,083 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=34453.333333333336, ans=0.1 +2024-01-15 14:56:13,337 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer2.prob, batch_count=34453.333333333336, ans=0.125 +2024-01-15 14:56:15,725 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.0.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 14:56:43,203 INFO [train.py:994] (1/2) Epoch 13, batch 250, loss[loss=0.2096, simple_loss=0.2758, pruned_loss=0.07175, over 24356.00 frames. ], tot_loss[loss=0.1996, simple_loss=0.2692, pruned_loss=0.06498, over 3423643.47 frames. ], batch size: 153, lr: 2.76e-02, grad_scale: 32.0 +2024-01-15 14:57:02,297 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_skip_rate, batch_count=34586.666666666664, ans=0.0 +2024-01-15 14:57:03,533 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=34586.666666666664, ans=0.125 +2024-01-15 14:57:22,338 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=384, metric=3.69 vs. 
limit=15.0 +2024-01-15 14:57:31,199 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.attention_skip_rate, batch_count=34653.333333333336, ans=0.0 +2024-01-15 14:57:32,783 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.self_attn2.whiten.whitening_limit, batch_count=34686.666666666664, ans=22.5 +2024-01-15 14:57:35,904 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=34686.666666666664, ans=0.125 +2024-01-15 14:57:35,960 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.balancer.max_positive, batch_count=34686.666666666664, ans=0.95 +2024-01-15 14:57:42,046 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=19.06 vs. limit=22.5 +2024-01-15 14:57:43,022 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=34686.666666666664, ans=0.125 +2024-01-15 14:57:44,588 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.872e+02 2.193e+02 2.493e+02 2.832e+02 4.514e+02, threshold=4.985e+02, percent-clipped=0.0 +2024-01-15 14:57:45,788 INFO [train.py:994] (1/2) Epoch 13, batch 300, loss[loss=0.1937, simple_loss=0.263, pruned_loss=0.06222, over 24193.00 frames. ], tot_loss[loss=0.2, simple_loss=0.2699, pruned_loss=0.0651, over 3736669.38 frames. ], batch size: 140, lr: 2.75e-02, grad_scale: 32.0 +2024-01-15 14:58:20,173 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_skip_rate, batch_count=34786.666666666664, ans=0.0 +2024-01-15 14:58:47,857 INFO [train.py:994] (1/2) Epoch 13, batch 350, loss[loss=0.2065, simple_loss=0.2799, pruned_loss=0.06652, over 23857.00 frames. ], tot_loss[loss=0.2001, simple_loss=0.2701, pruned_loss=0.06507, over 3963924.86 frames. ], batch size: 328, lr: 2.75e-02, grad_scale: 32.0 +2024-01-15 14:58:56,512 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.balancer_ff2.min_abs, batch_count=34886.666666666664, ans=0.1 +2024-01-15 14:59:15,875 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer2.prob, batch_count=34953.333333333336, ans=0.125 +2024-01-15 14:59:44,409 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass_mid.scale_min, batch_count=35020.0, ans=0.2 +2024-01-15 14:59:48,953 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.819e+02 2.352e+02 2.687e+02 3.129e+02 5.863e+02, threshold=5.374e+02, percent-clipped=1.0 +2024-01-15 14:59:50,221 INFO [train.py:994] (1/2) Epoch 13, batch 400, loss[loss=0.2053, simple_loss=0.2715, pruned_loss=0.06953, over 24466.00 frames. ], tot_loss[loss=0.1994, simple_loss=0.2693, pruned_loss=0.06474, over 4149454.86 frames. ], batch size: 250, lr: 2.74e-02, grad_scale: 32.0 +2024-01-15 15:00:12,169 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=10.30 vs. 
limit=15.0 +2024-01-15 15:00:23,977 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer1.prob, batch_count=35120.0, ans=0.125 +2024-01-15 15:00:25,189 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer1.max_abs, batch_count=35120.0, ans=10.0 +2024-01-15 15:00:27,487 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=35153.333333333336, ans=0.1 +2024-01-15 15:00:50,190 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=35186.666666666664, ans=0.1 +2024-01-15 15:00:53,428 INFO [train.py:994] (1/2) Epoch 13, batch 450, loss[loss=0.1921, simple_loss=0.2696, pruned_loss=0.05728, over 24313.00 frames. ], tot_loss[loss=0.1995, simple_loss=0.2699, pruned_loss=0.06456, over 4302300.50 frames. ], batch size: 285, lr: 2.74e-02, grad_scale: 32.0 +2024-01-15 15:00:58,442 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_skip_rate, batch_count=35220.0, ans=0.0 +2024-01-15 15:01:09,196 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer1.prob, batch_count=35253.333333333336, ans=0.125 +2024-01-15 15:01:09,215 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=35253.333333333336, ans=0.125 +2024-01-15 15:01:32,766 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer2.prob, batch_count=35320.0, ans=0.125 +2024-01-15 15:01:36,101 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=35320.0, ans=0.0 +2024-01-15 15:01:41,657 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=11.93 vs. limit=15.0 +2024-01-15 15:01:46,158 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=35353.333333333336, ans=0.0 +2024-01-15 15:01:54,116 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.682e+02 2.235e+02 2.583e+02 3.206e+02 4.584e+02, threshold=5.166e+02, percent-clipped=0.0 +2024-01-15 15:01:55,313 INFO [train.py:994] (1/2) Epoch 13, batch 500, loss[loss=0.2047, simple_loss=0.2744, pruned_loss=0.06753, over 24523.00 frames. ], tot_loss[loss=0.1986, simple_loss=0.269, pruned_loss=0.06404, over 4404296.56 frames. ], batch size: 243, lr: 2.73e-02, grad_scale: 32.0 +2024-01-15 15:02:04,512 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=35386.666666666664, ans=0.1 +2024-01-15 15:02:38,236 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer1.prob, batch_count=35486.666666666664, ans=0.125 +2024-01-15 15:02:57,426 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer1.prob, batch_count=35553.333333333336, ans=0.125 +2024-01-15 15:02:58,363 INFO [train.py:994] (1/2) Epoch 13, batch 550, loss[loss=0.1692, simple_loss=0.2427, pruned_loss=0.04782, over 24311.00 frames. ], tot_loss[loss=0.1993, simple_loss=0.2698, pruned_loss=0.06439, over 4502574.39 frames. 
], batch size: 147, lr: 2.73e-02, grad_scale: 16.0 +2024-01-15 15:03:05,858 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=35553.333333333336, ans=0.1 +2024-01-15 15:03:11,805 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=35586.666666666664, ans=0.1 +2024-01-15 15:03:13,562 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=8.37 vs. limit=15.0 +2024-01-15 15:03:15,448 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.prob, batch_count=35586.666666666664, ans=0.125 +2024-01-15 15:03:45,828 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.scale_min, batch_count=35653.333333333336, ans=0.2 +2024-01-15 15:03:48,170 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff2_skip_rate, batch_count=35686.666666666664, ans=0.003111594202898551 +2024-01-15 15:03:58,323 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=17.19 vs. limit=22.5 +2024-01-15 15:03:58,441 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=384, metric=2.49 vs. limit=15.0 +2024-01-15 15:04:00,209 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.854e+02 2.330e+02 2.832e+02 3.202e+02 4.606e+02, threshold=5.663e+02, percent-clipped=0.0 +2024-01-15 15:04:00,237 INFO [train.py:994] (1/2) Epoch 13, batch 600, loss[loss=0.1679, simple_loss=0.2226, pruned_loss=0.05662, over 18720.00 frames. ], tot_loss[loss=0.1988, simple_loss=0.2692, pruned_loss=0.06414, over 4558209.31 frames. 
], batch size: 81, lr: 2.73e-02, grad_scale: 16.0 +2024-01-15 15:04:04,216 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.ff2_skip_rate, batch_count=35720.0, ans=0.003104347826086957 +2024-01-15 15:04:04,245 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.scale_min, batch_count=35720.0, ans=0.2 +2024-01-15 15:04:25,996 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.out_combiner.scale_min, batch_count=35786.666666666664, ans=0.2 +2024-01-15 15:04:27,278 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=35786.666666666664, ans=0.0 +2024-01-15 15:04:39,681 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer2.min_abs, batch_count=35820.0, ans=0.5 +2024-01-15 15:04:41,995 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer2.prob, batch_count=35820.0, ans=0.125 +2024-01-15 15:04:49,760 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.balancer_na.min_abs, batch_count=35853.333333333336, ans=0.02 +2024-01-15 15:04:53,343 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.attention_skip_rate, batch_count=35853.333333333336, ans=0.0 +2024-01-15 15:04:59,267 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_skip_rate, batch_count=35853.333333333336, ans=0.0 +2024-01-15 15:05:01,301 INFO [train.py:994] (1/2) Epoch 13, batch 650, loss[loss=0.1922, simple_loss=0.2699, pruned_loss=0.0572, over 24464.00 frames. ], tot_loss[loss=0.1985, simple_loss=0.269, pruned_loss=0.06397, over 4613519.25 frames. ], batch size: 267, lr: 2.72e-02, grad_scale: 16.0 +2024-01-15 15:05:13,508 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.attention_skip_rate, batch_count=35920.0, ans=0.0 +2024-01-15 15:05:14,675 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.attention_skip_rate, batch_count=35920.0, ans=0.0 +2024-01-15 15:05:27,856 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.skip_rate, batch_count=35953.333333333336, ans=0.07 +2024-01-15 15:06:03,556 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer2.prob, batch_count=36053.333333333336, ans=0.125 +2024-01-15 15:06:04,284 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.839e+02 2.223e+02 2.545e+02 2.957e+02 4.129e+02, threshold=5.090e+02, percent-clipped=0.0 +2024-01-15 15:06:04,314 INFO [train.py:994] (1/2) Epoch 13, batch 700, loss[loss=0.1955, simple_loss=0.2714, pruned_loss=0.05976, over 24394.00 frames. ], tot_loss[loss=0.1979, simple_loss=0.2686, pruned_loss=0.06364, over 4648475.31 frames. ], batch size: 258, lr: 2.72e-02, grad_scale: 16.0 +2024-01-15 15:06:44,224 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=14.93 vs. limit=15.0 +2024-01-15 15:07:02,869 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass.scale_min, batch_count=36186.666666666664, ans=0.2 +2024-01-15 15:07:06,163 INFO [train.py:994] (1/2) Epoch 13, batch 750, loss[loss=0.1953, simple_loss=0.2721, pruned_loss=0.0592, over 24466.00 frames. 
], tot_loss[loss=0.1984, simple_loss=0.2691, pruned_loss=0.06381, over 4686985.37 frames. ], batch size: 250, lr: 2.71e-02, grad_scale: 16.0 +2024-01-15 15:07:08,308 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=7.79 vs. limit=15.0 +2024-01-15 15:07:08,872 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=36220.0, ans=0.1 +2024-01-15 15:07:24,138 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=36253.333333333336, ans=0.125 +2024-01-15 15:07:39,163 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=13.94 vs. limit=15.0 +2024-01-15 15:08:04,646 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.whiten, num_groups=1, num_channels=384, metric=3.82 vs. limit=12.0 +2024-01-15 15:08:06,423 INFO [train.py:994] (1/2) Epoch 13, batch 800, loss[loss=0.1977, simple_loss=0.2744, pruned_loss=0.06054, over 24469.00 frames. ], tot_loss[loss=0.1983, simple_loss=0.2688, pruned_loss=0.06385, over 4715498.28 frames. ], batch size: 267, lr: 2.71e-02, grad_scale: 16.0 +2024-01-15 15:08:07,599 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.913e+02 2.262e+02 2.496e+02 3.133e+02 5.825e+02, threshold=4.992e+02, percent-clipped=1.0 +2024-01-15 15:08:12,112 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.2.prob, batch_count=36386.666666666664, ans=0.125 +2024-01-15 15:08:23,782 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.scale_min, batch_count=36420.0, ans=0.2 +2024-01-15 15:08:27,153 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.ff2_skip_rate, batch_count=36420.0, ans=0.002952173913043479 +2024-01-15 15:09:18,724 INFO [train.py:994] (1/2) Epoch 14, batch 0, loss[loss=0.205, simple_loss=0.2732, pruned_loss=0.06847, over 24538.00 frames. ], tot_loss[loss=0.205, simple_loss=0.2732, pruned_loss=0.06847, over 24538.00 frames. ], batch size: 236, lr: 2.64e-02, grad_scale: 32.0 +2024-01-15 15:09:18,725 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 15:09:38,585 INFO [train.py:1026] (1/2) Epoch 14, validation: loss=0.1766, simple_loss=0.2638, pruned_loss=0.04469, over 1622729.00 frames. +2024-01-15 15:09:38,585 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 15:10:01,057 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.whiten, num_groups=1, num_channels=512, metric=4.50 vs. limit=12.0 +2024-01-15 15:10:42,549 INFO [train.py:994] (1/2) Epoch 14, batch 50, loss[loss=0.205, simple_loss=0.2807, pruned_loss=0.06466, over 23888.00 frames. ], tot_loss[loss=0.1942, simple_loss=0.2649, pruned_loss=0.06173, over 1081567.28 frames. ], batch size: 328, lr: 2.64e-02, grad_scale: 32.0 +2024-01-15 15:10:44,533 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=11.40 vs. 
limit=15.0 +2024-01-15 15:10:49,416 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer1.prob, batch_count=36696.666666666664, ans=0.125 +2024-01-15 15:10:52,665 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.716e+02 2.278e+02 2.632e+02 3.085e+02 4.590e+02, threshold=5.264e+02, percent-clipped=0.0 +2024-01-15 15:11:00,264 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 15:11:06,754 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff3_skip_rate, batch_count=36763.333333333336, ans=0.002877536231884057 +2024-01-15 15:11:45,595 INFO [train.py:994] (1/2) Epoch 14, batch 100, loss[loss=0.1955, simple_loss=0.271, pruned_loss=0.05995, over 24495.00 frames. ], tot_loss[loss=0.1948, simple_loss=0.267, pruned_loss=0.06136, over 1921346.38 frames. ], batch size: 210, lr: 2.63e-02, grad_scale: 32.0 +2024-01-15 15:11:56,688 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=36896.666666666664, ans=0.0 +2024-01-15 15:12:06,324 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=36896.666666666664, ans=0.125 +2024-01-15 15:12:36,088 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=36996.666666666664, ans=0.0 +2024-01-15 15:12:42,090 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer2.prob, batch_count=36996.666666666664, ans=0.125 +2024-01-15 15:12:48,247 INFO [train.py:994] (1/2) Epoch 14, batch 150, loss[loss=0.2011, simple_loss=0.2696, pruned_loss=0.06632, over 24379.00 frames. ], tot_loss[loss=0.1952, simple_loss=0.2669, pruned_loss=0.06178, over 2570526.93 frames. ], batch size: 275, lr: 2.63e-02, grad_scale: 32.0 +2024-01-15 15:12:55,155 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass.skip_rate, batch_count=37030.0, ans=0.04949747468305833 +2024-01-15 15:12:58,304 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.946e+02 2.387e+02 2.863e+02 3.406e+02 4.847e+02, threshold=5.726e+02, percent-clipped=0.0 +2024-01-15 15:13:01,270 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.conv_module2.whiten, num_groups=1, num_channels=512, metric=7.40 vs. limit=15.0 +2024-01-15 15:13:43,280 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass_mid.scale_min, batch_count=37163.333333333336, ans=0.2 +2024-01-15 15:13:46,754 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.bypass.skip_rate, batch_count=37163.333333333336, ans=0.07 +2024-01-15 15:13:51,288 INFO [train.py:994] (1/2) Epoch 14, batch 200, loss[loss=0.191, simple_loss=0.2635, pruned_loss=0.0593, over 24341.00 frames. ], tot_loss[loss=0.195, simple_loss=0.2667, pruned_loss=0.06167, over 3067987.74 frames. ], batch size: 285, lr: 2.62e-02, grad_scale: 16.0 +2024-01-15 15:13:52,740 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=37196.666666666664, ans=0.0 +2024-01-15 15:14:09,854 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=384, metric=17.71 vs. 
limit=22.5 +2024-01-15 15:14:11,039 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.51 vs. limit=6.0 +2024-01-15 15:14:49,911 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=256, metric=18.59 vs. limit=22.5 +2024-01-15 15:14:51,760 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=37330.0, ans=0.1 +2024-01-15 15:14:53,852 INFO [train.py:994] (1/2) Epoch 14, batch 250, loss[loss=0.1911, simple_loss=0.2613, pruned_loss=0.0605, over 24391.00 frames. ], tot_loss[loss=0.1951, simple_loss=0.267, pruned_loss=0.06161, over 3460332.81 frames. ], batch size: 159, lr: 2.62e-02, grad_scale: 16.0 +2024-01-15 15:15:02,558 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=37363.333333333336, ans=0.1 +2024-01-15 15:15:05,906 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.918e+02 2.380e+02 2.749e+02 3.167e+02 5.253e+02, threshold=5.498e+02, percent-clipped=0.0 +2024-01-15 15:15:36,797 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.conv_module2.whiten, num_groups=1, num_channels=192, metric=3.12 vs. limit=15.0 +2024-01-15 15:15:41,071 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=37463.333333333336, ans=0.125 +2024-01-15 15:15:48,756 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.attention_skip_rate, batch_count=37496.666666666664, ans=0.0 +2024-01-15 15:15:55,566 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=37496.666666666664, ans=0.1 +2024-01-15 15:15:57,562 INFO [train.py:994] (1/2) Epoch 14, batch 300, loss[loss=0.2005, simple_loss=0.269, pruned_loss=0.06596, over 24178.00 frames. ], tot_loss[loss=0.1962, simple_loss=0.2679, pruned_loss=0.06224, over 3764910.35 frames. ], batch size: 140, lr: 2.62e-02, grad_scale: 16.0 +2024-01-15 15:16:02,568 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass_mid.scale_min, batch_count=37530.0, ans=0.2 +2024-01-15 15:16:06,730 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=14.15 vs. limit=22.5 +2024-01-15 15:16:08,571 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff3_skip_rate, batch_count=37563.333333333336, ans=0.0027036231884057967 +2024-01-15 15:16:33,960 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward2.hidden_balancer.prob, batch_count=37630.0, ans=0.125 +2024-01-15 15:16:50,711 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=37663.333333333336, ans=0.125 +2024-01-15 15:17:00,062 INFO [train.py:994] (1/2) Epoch 14, batch 350, loss[loss=0.1873, simple_loss=0.2642, pruned_loss=0.05523, over 24513.00 frames. ], tot_loss[loss=0.1957, simple_loss=0.2673, pruned_loss=0.06204, over 3995404.52 frames. 
], batch size: 229, lr: 2.61e-02, grad_scale: 16.0 +2024-01-15 15:17:00,346 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.balancer1.prob, batch_count=37696.666666666664, ans=0.125 +2024-01-15 15:17:12,194 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.842e+02 2.300e+02 2.540e+02 2.971e+02 4.745e+02, threshold=5.081e+02, percent-clipped=0.0 +2024-01-15 15:17:17,687 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.whiten, num_groups=1, num_channels=384, metric=3.31 vs. limit=12.0 +2024-01-15 15:17:18,592 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.balancer.max_positive, batch_count=37730.0, ans=0.95 +2024-01-15 15:17:24,650 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.whiten, num_groups=1, num_channels=384, metric=2.79 vs. limit=12.0 +2024-01-15 15:17:46,151 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=37796.666666666664, ans=0.1 +2024-01-15 15:17:49,782 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.scale_min, batch_count=37830.0, ans=0.2 +2024-01-15 15:17:50,917 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff2_skip_rate, batch_count=37830.0, ans=0.0026456521739130438 +2024-01-15 15:18:03,272 INFO [train.py:994] (1/2) Epoch 14, batch 400, loss[loss=0.1945, simple_loss=0.2666, pruned_loss=0.0612, over 24475.00 frames. ], tot_loss[loss=0.1951, simple_loss=0.2669, pruned_loss=0.06163, over 4186215.48 frames. ], batch size: 210, lr: 2.61e-02, grad_scale: 32.0 +2024-01-15 15:18:36,761 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=37930.0, ans=0.1 +2024-01-15 15:18:38,467 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=13.38 vs. limit=15.0 +2024-01-15 15:18:47,387 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=37963.333333333336, ans=0.1 +2024-01-15 15:18:49,610 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.out_balancer.prob, batch_count=37963.333333333336, ans=0.125 +2024-01-15 15:19:07,463 INFO [train.py:994] (1/2) Epoch 14, batch 450, loss[loss=0.1862, simple_loss=0.2675, pruned_loss=0.05239, over 24455.00 frames. ], tot_loss[loss=0.195, simple_loss=0.2669, pruned_loss=0.06157, over 4335435.46 frames. 
], batch size: 250, lr: 2.60e-02, grad_scale: 32.0 +2024-01-15 15:19:07,818 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.prob, batch_count=38030.0, ans=0.125 +2024-01-15 15:19:17,942 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module2.balancer1.prob, batch_count=38030.0, ans=0.125 +2024-01-15 15:19:18,667 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.836e+02 2.289e+02 2.540e+02 2.904e+02 4.197e+02, threshold=5.081e+02, percent-clipped=0.0 +2024-01-15 15:19:18,996 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=38063.333333333336, ans=0.125 +2024-01-15 15:19:23,942 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 15:19:30,441 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.prob, batch_count=38063.333333333336, ans=0.125 +2024-01-15 15:20:10,167 INFO [train.py:994] (1/2) Epoch 14, batch 500, loss[loss=0.2089, simple_loss=0.279, pruned_loss=0.0694, over 24485.00 frames. ], tot_loss[loss=0.1948, simple_loss=0.2666, pruned_loss=0.06152, over 4433631.35 frames. ], batch size: 181, lr: 2.60e-02, grad_scale: 32.0 +2024-01-15 15:20:12,849 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.hidden_balancer.prob, batch_count=38196.666666666664, ans=0.125 +2024-01-15 15:20:17,338 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.layerdrop_rate, batch_count=38196.666666666664, ans=0.015 +2024-01-15 15:20:20,328 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer2.prob, batch_count=38196.666666666664, ans=0.125 +2024-01-15 15:20:38,833 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.balancer2.prob, batch_count=38263.333333333336, ans=0.125 +2024-01-15 15:21:05,159 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.ff3_skip_rate, batch_count=38330.0, ans=0.00253695652173913 +2024-01-15 15:21:11,954 INFO [train.py:994] (1/2) Epoch 14, batch 550, loss[loss=0.2086, simple_loss=0.2799, pruned_loss=0.06863, over 24433.00 frames. ], tot_loss[loss=0.1955, simple_loss=0.2674, pruned_loss=0.06187, over 4519809.39 frames. ], batch size: 250, lr: 2.60e-02, grad_scale: 32.0 +2024-01-15 15:21:16,381 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=38363.333333333336, ans=0.125 +2024-01-15 15:21:20,723 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward2.hidden_balancer.prob, batch_count=38363.333333333336, ans=0.125 +2024-01-15 15:21:23,923 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.859e+02 2.229e+02 2.560e+02 3.146e+02 5.169e+02, threshold=5.120e+02, percent-clipped=1.0 +2024-01-15 15:21:24,467 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=11.67 vs. 
limit=15.0 +2024-01-15 15:21:46,822 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.min_positive, batch_count=38430.0, ans=0.025 +2024-01-15 15:21:46,862 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer_ff2.min_abs, batch_count=38430.0, ans=0.1 +2024-01-15 15:22:03,731 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=13.50 vs. limit=15.0 +2024-01-15 15:22:14,817 INFO [train.py:994] (1/2) Epoch 14, batch 600, loss[loss=0.164, simple_loss=0.2377, pruned_loss=0.04518, over 23979.00 frames. ], tot_loss[loss=0.1948, simple_loss=0.2667, pruned_loss=0.06146, over 4598231.39 frames. ], batch size: 131, lr: 2.59e-02, grad_scale: 32.0 +2024-01-15 15:22:24,762 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=38530.0, ans=0.125 +2024-01-15 15:22:37,934 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.86 vs. limit=6.0 +2024-01-15 15:22:43,884 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=38596.666666666664, ans=0.125 +2024-01-15 15:22:53,510 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_skip_rate, batch_count=38630.0, ans=0.0 +2024-01-15 15:23:05,536 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass_mid.scale_min, batch_count=38663.333333333336, ans=0.2 +2024-01-15 15:23:17,298 INFO [train.py:994] (1/2) Epoch 14, batch 650, loss[loss=0.2056, simple_loss=0.2748, pruned_loss=0.0682, over 24321.00 frames. ], tot_loss[loss=0.1938, simple_loss=0.2655, pruned_loss=0.0611, over 4634474.75 frames. ], batch size: 298, lr: 2.59e-02, grad_scale: 32.0 +2024-01-15 15:23:24,440 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=384, metric=16.07 vs. limit=22.5 +2024-01-15 15:23:27,923 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer2.prob, batch_count=38696.666666666664, ans=0.125 +2024-01-15 15:23:28,763 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.895e+02 2.204e+02 2.618e+02 3.440e+02 5.205e+02, threshold=5.237e+02, percent-clipped=1.0 +2024-01-15 15:23:58,492 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.balancer2.prob, batch_count=38796.666666666664, ans=0.125 +2024-01-15 15:24:07,839 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=38830.0, ans=0.125 +2024-01-15 15:24:20,104 INFO [train.py:994] (1/2) Epoch 14, batch 700, loss[loss=0.1948, simple_loss=0.2689, pruned_loss=0.06036, over 24360.00 frames. ], tot_loss[loss=0.1939, simple_loss=0.2655, pruned_loss=0.06118, over 4668461.61 frames. 
], batch size: 275, lr: 2.58e-02, grad_scale: 32.0 +2024-01-15 15:24:25,909 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer1.prob, batch_count=38863.333333333336, ans=0.125 +2024-01-15 15:24:30,518 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=38863.333333333336, ans=0.0 +2024-01-15 15:24:33,080 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass_mid.scale_min, batch_count=38896.666666666664, ans=0.2 +2024-01-15 15:24:35,361 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff2_skip_rate, batch_count=38896.666666666664, ans=0.0024137681159420296 +2024-01-15 15:24:35,397 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=38896.666666666664, ans=0.0 +2024-01-15 15:24:46,961 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=38930.0, ans=0.1 +2024-01-15 15:24:48,201 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass.scale_min, batch_count=38930.0, ans=0.2 +2024-01-15 15:24:52,421 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=38930.0, ans=0.125 +2024-01-15 15:25:13,222 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer2.prob, batch_count=38996.666666666664, ans=0.125 +2024-01-15 15:25:16,225 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer2.prob, batch_count=38996.666666666664, ans=0.125 +2024-01-15 15:25:21,226 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=384, metric=15.83 vs. limit=22.5 +2024-01-15 15:25:23,097 INFO [train.py:994] (1/2) Epoch 14, batch 750, loss[loss=0.2138, simple_loss=0.2873, pruned_loss=0.0701, over 24334.00 frames. ], tot_loss[loss=0.1935, simple_loss=0.2652, pruned_loss=0.06089, over 4689991.68 frames. ], batch size: 298, lr: 2.58e-02, grad_scale: 32.0 +2024-01-15 15:25:29,360 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer1.prob, batch_count=39030.0, ans=0.125 +2024-01-15 15:25:31,573 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.ff2_skip_rate, batch_count=39030.0, ans=0.0023847826086956523 +2024-01-15 15:25:33,694 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.808e+02 2.296e+02 2.739e+02 3.023e+02 4.357e+02, threshold=5.479e+02, percent-clipped=0.0 +2024-01-15 15:25:36,671 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=12.48 vs. 
limit=15.0 +2024-01-15 15:25:45,988 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=39063.333333333336, ans=0.1 +2024-01-15 15:25:57,569 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer1.prob, batch_count=39096.666666666664, ans=0.125 +2024-01-15 15:26:18,266 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=39163.333333333336, ans=0.1 +2024-01-15 15:26:20,383 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.2.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 15:26:23,575 INFO [train.py:994] (1/2) Epoch 14, batch 800, loss[loss=0.2042, simple_loss=0.2786, pruned_loss=0.06486, over 24367.00 frames. ], tot_loss[loss=0.1924, simple_loss=0.2642, pruned_loss=0.06033, over 4704105.87 frames. ], batch size: 275, lr: 2.58e-02, grad_scale: 32.0 +2024-01-15 15:26:39,348 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff3_skip_rate, batch_count=39230.0, ans=0.002341304347826087 +2024-01-15 15:26:39,611 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=384, metric=2.48 vs. limit=15.0 +2024-01-15 15:26:52,002 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=5.45 vs. limit=10.0 +2024-01-15 15:27:37,313 INFO [train.py:994] (1/2) Epoch 15, batch 0, loss[loss=0.2021, simple_loss=0.273, pruned_loss=0.06559, over 24575.00 frames. ], tot_loss[loss=0.2021, simple_loss=0.273, pruned_loss=0.06559, over 24575.00 frames. ], batch size: 176, lr: 2.51e-02, grad_scale: 32.0 +2024-01-15 15:27:37,314 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 15:27:57,300 INFO [train.py:1026] (1/2) Epoch 15, validation: loss=0.1734, simple_loss=0.2606, pruned_loss=0.04308, over 1622729.00 frames. +2024-01-15 15:27:57,301 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 15:27:57,553 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer1.prob, batch_count=39340.0, ans=0.125 +2024-01-15 15:28:17,322 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.773e+02 2.162e+02 2.431e+02 2.731e+02 3.877e+02, threshold=4.862e+02, percent-clipped=0.0 +2024-01-15 15:28:29,590 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=39406.666666666664, ans=0.0 +2024-01-15 15:28:32,356 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=15.40 vs. 
limit=15.0 +2024-01-15 15:28:41,591 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=39440.0, ans=0.0 +2024-01-15 15:28:47,511 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=39473.333333333336, ans=0.125 +2024-01-15 15:28:54,677 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.balancer2.prob, batch_count=39473.333333333336, ans=0.125 +2024-01-15 15:28:55,837 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer1.prob, batch_count=39473.333333333336, ans=0.125 +2024-01-15 15:29:00,171 INFO [train.py:994] (1/2) Epoch 15, batch 50, loss[loss=0.1663, simple_loss=0.2315, pruned_loss=0.05049, over 23551.00 frames. ], tot_loss[loss=0.1907, simple_loss=0.2623, pruned_loss=0.0596, over 1093803.92 frames. ], batch size: 119, lr: 2.51e-02, grad_scale: 32.0 +2024-01-15 15:29:00,436 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=39506.666666666664, ans=0.1 +2024-01-15 15:29:05,325 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=39506.666666666664, ans=0.125 +2024-01-15 15:29:10,065 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff3_skip_rate, batch_count=39506.666666666664, ans=0.0022811594202898555 +2024-01-15 15:29:41,873 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass_mid.scale_min, batch_count=39606.666666666664, ans=0.2 +2024-01-15 15:29:47,288 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.attention_skip_rate, batch_count=39606.666666666664, ans=0.0 +2024-01-15 15:29:48,411 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer2.prob, batch_count=39640.0, ans=0.125 +2024-01-15 15:30:00,926 INFO [train.py:994] (1/2) Epoch 15, batch 100, loss[loss=0.199, simple_loss=0.2765, pruned_loss=0.06077, over 22526.00 frames. ], tot_loss[loss=0.1906, simple_loss=0.263, pruned_loss=0.0591, over 1928491.78 frames. ], batch size: 357, lr: 2.50e-02, grad_scale: 32.0 +2024-01-15 15:30:09,697 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=39673.333333333336, ans=0.125 +2024-01-15 15:30:10,944 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module2.balancer2.prob, batch_count=39673.333333333336, ans=0.125 +2024-01-15 15:30:20,449 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=39706.666666666664, ans=0.1 +2024-01-15 15:30:20,512 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.out_combiner.scale_min, batch_count=39706.666666666664, ans=0.2 +2024-01-15 15:30:21,383 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.911e+02 2.236e+02 2.514e+02 3.021e+02 4.488e+02, threshold=5.028e+02, percent-clipped=0.0 +2024-01-15 15:31:02,882 INFO [train.py:994] (1/2) Epoch 15, batch 150, loss[loss=0.1876, simple_loss=0.2615, pruned_loss=0.05682, over 24510.00 frames. ], tot_loss[loss=0.1894, simple_loss=0.2617, pruned_loss=0.0586, over 2565976.36 frames. 
], batch size: 243, lr: 2.50e-02, grad_scale: 32.0 +2024-01-15 15:31:08,228 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=24.04 vs. limit=22.5 +2024-01-15 15:31:12,593 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.balancer1.prob, batch_count=39840.0, ans=0.125 +2024-01-15 15:31:28,141 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.scale_min, batch_count=39906.666666666664, ans=0.2 +2024-01-15 15:31:39,136 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.ff3_skip_rate, batch_count=39940.0, ans=0.0021869565217391296 +2024-01-15 15:31:42,450 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=39940.0, ans=0.125 +2024-01-15 15:31:46,233 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer2.prob, batch_count=39940.0, ans=0.125 +2024-01-15 15:32:07,629 INFO [train.py:994] (1/2) Epoch 15, batch 200, loss[loss=0.1957, simple_loss=0.2626, pruned_loss=0.06437, over 24546.00 frames. ], tot_loss[loss=0.1896, simple_loss=0.2619, pruned_loss=0.05865, over 3057414.23 frames. ], batch size: 176, lr: 2.50e-02, grad_scale: 32.0 +2024-01-15 15:32:11,614 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.out_proj.dropout_p, batch_count=40006.666666666664, ans=0.1 +2024-01-15 15:32:17,445 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=14.71 vs. limit=15.0 +2024-01-15 15:32:22,933 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=40040.0, ans=0.1 +2024-01-15 15:32:27,424 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.756e+02 2.321e+02 2.884e+02 3.523e+02 5.620e+02, threshold=5.767e+02, percent-clipped=2.0 +2024-01-15 15:32:35,285 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=13.95 vs. limit=15.0 +2024-01-15 15:32:36,190 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=12.23 vs. limit=15.0 +2024-01-15 15:32:39,541 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.scale_min, batch_count=40073.333333333336, ans=0.2 +2024-01-15 15:32:39,871 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=512, metric=2.29 vs. limit=15.0 +2024-01-15 15:33:08,706 INFO [train.py:994] (1/2) Epoch 15, batch 250, loss[loss=0.1895, simple_loss=0.2588, pruned_loss=0.06009, over 24463.00 frames. ], tot_loss[loss=0.1894, simple_loss=0.2618, pruned_loss=0.05854, over 3440980.28 frames. 
], batch size: 267, lr: 2.49e-02, grad_scale: 32.0 +2024-01-15 15:33:52,047 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=40273.333333333336, ans=0.125 +2024-01-15 15:34:05,355 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=40306.666666666664, ans=0.1 +2024-01-15 15:34:07,719 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=40306.666666666664, ans=0.125 +2024-01-15 15:34:10,758 INFO [train.py:994] (1/2) Epoch 15, batch 300, loss[loss=0.212, simple_loss=0.2857, pruned_loss=0.06919, over 22413.00 frames. ], tot_loss[loss=0.1895, simple_loss=0.2616, pruned_loss=0.05872, over 3717633.77 frames. ], batch size: 357, lr: 2.49e-02, grad_scale: 32.0 +2024-01-15 15:34:19,400 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff2_skip_rate, batch_count=40340.0, ans=0.0020999999999999994 +2024-01-15 15:34:20,687 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward3.hidden_balancer.prob, batch_count=40340.0, ans=0.125 +2024-01-15 15:34:31,006 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.955e+02 2.273e+02 2.449e+02 2.995e+02 5.556e+02, threshold=4.897e+02, percent-clipped=0.0 +2024-01-15 15:34:31,239 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.nonlin_attention.balancer.prob, batch_count=40373.333333333336, ans=0.125 +2024-01-15 15:34:39,534 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.8.prob, batch_count=40406.666666666664, ans=0.125 +2024-01-15 15:35:12,606 INFO [train.py:994] (1/2) Epoch 15, batch 350, loss[loss=0.2054, simple_loss=0.275, pruned_loss=0.06784, over 24363.00 frames. ], tot_loss[loss=0.1894, simple_loss=0.2615, pruned_loss=0.05865, over 3953394.66 frames. ], batch size: 275, lr: 2.49e-02, grad_scale: 32.0 +2024-01-15 15:35:34,339 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer1.prob, batch_count=40540.0, ans=0.125 +2024-01-15 15:35:47,682 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=40573.333333333336, ans=0.1 +2024-01-15 15:35:54,995 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.max_positive, batch_count=40606.666666666664, ans=0.95 +2024-01-15 15:36:03,442 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=40640.0, ans=0.0 +2024-01-15 15:36:04,783 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.balancer2.prob, batch_count=40640.0, ans=0.125 +2024-01-15 15:36:07,347 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=40640.0, ans=0.125 +2024-01-15 15:36:17,094 INFO [train.py:994] (1/2) Epoch 15, batch 400, loss[loss=0.1513, simple_loss=0.2231, pruned_loss=0.03977, over 23479.00 frames. ], tot_loss[loss=0.1894, simple_loss=0.2617, pruned_loss=0.05854, over 4150919.26 frames. 
], batch size: 119, lr: 2.48e-02, grad_scale: 32.0 +2024-01-15 15:36:27,011 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer2.prob, batch_count=40673.333333333336, ans=0.125 +2024-01-15 15:36:33,838 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.skip_rate, batch_count=40706.666666666664, ans=0.07 +2024-01-15 15:36:37,119 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.804e+02 2.267e+02 2.564e+02 2.924e+02 4.452e+02, threshold=5.127e+02, percent-clipped=0.0 +2024-01-15 15:36:38,621 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer1.prob, batch_count=40706.666666666664, ans=0.125 +2024-01-15 15:36:46,286 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=40740.0, ans=0.1 +2024-01-15 15:37:02,862 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=17.89 vs. limit=15.0 +2024-01-15 15:37:08,941 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.skip_rate, batch_count=40806.666666666664, ans=0.09899494936611666 +2024-01-15 15:37:19,336 INFO [train.py:994] (1/2) Epoch 15, batch 450, loss[loss=0.195, simple_loss=0.2723, pruned_loss=0.05881, over 24382.00 frames. ], tot_loss[loss=0.1892, simple_loss=0.2617, pruned_loss=0.05835, over 4298711.57 frames. ], batch size: 275, lr: 2.48e-02, grad_scale: 32.0 +2024-01-15 15:37:36,602 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=4.64 vs. limit=15.0 +2024-01-15 15:37:44,618 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.min_positive, batch_count=40906.666666666664, ans=0.05 +2024-01-15 15:38:21,911 INFO [train.py:994] (1/2) Epoch 15, batch 500, loss[loss=0.1849, simple_loss=0.2545, pruned_loss=0.05764, over 24372.00 frames. ], tot_loss[loss=0.189, simple_loss=0.2616, pruned_loss=0.05824, over 4415062.59 frames. ], batch size: 153, lr: 2.48e-02, grad_scale: 32.0 +2024-01-15 15:38:22,298 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.5.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 15:38:42,157 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.855e+02 2.156e+02 2.340e+02 2.628e+02 4.759e+02, threshold=4.679e+02, percent-clipped=0.0 +2024-01-15 15:38:47,155 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=41073.333333333336, ans=0.1 +2024-01-15 15:38:54,766 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer1.prob, batch_count=41073.333333333336, ans=0.125 +2024-01-15 15:39:03,839 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn1.whiten, num_groups=1, num_channels=192, metric=16.75 vs. limit=22.5 +2024-01-15 15:39:08,491 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.ff2_skip_rate, batch_count=41106.666666666664, ans=0.0019333333333333338 +2024-01-15 15:39:09,027 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=256, metric=19.53 vs. 
limit=22.5 +2024-01-15 15:39:18,174 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=256, metric=15.65 vs. limit=22.5 +2024-01-15 15:39:24,329 INFO [train.py:994] (1/2) Epoch 15, batch 550, loss[loss=0.1864, simple_loss=0.2629, pruned_loss=0.055, over 24515.00 frames. ], tot_loss[loss=0.1886, simple_loss=0.2612, pruned_loss=0.058, over 4496762.58 frames. ], batch size: 181, lr: 2.47e-02, grad_scale: 32.0 +2024-01-15 15:39:24,685 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer2.prob, batch_count=41173.333333333336, ans=0.125 +2024-01-15 15:39:28,208 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer2.min_positive, batch_count=41173.333333333336, ans=0.05 +2024-01-15 15:39:33,890 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=41173.333333333336, ans=0.125 +2024-01-15 15:39:41,005 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.balancer.prob, batch_count=41206.666666666664, ans=0.125 +2024-01-15 15:39:56,061 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 15:40:00,787 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=9.49 vs. limit=15.0 +2024-01-15 15:40:18,982 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.hidden_balancer.prob, batch_count=41306.666666666664, ans=0.125 +2024-01-15 15:40:25,239 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=41306.666666666664, ans=0.125 +2024-01-15 15:40:28,003 INFO [train.py:994] (1/2) Epoch 15, batch 600, loss[loss=0.1705, simple_loss=0.2477, pruned_loss=0.04665, over 24330.00 frames. ], tot_loss[loss=0.1887, simple_loss=0.2613, pruned_loss=0.05798, over 4568787.76 frames. 
], batch size: 147, lr: 2.47e-02, grad_scale: 32.0 +2024-01-15 15:40:45,659 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=41373.333333333336, ans=0.125 +2024-01-15 15:40:47,817 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.812e+02 2.181e+02 2.447e+02 2.810e+02 7.048e+02, threshold=4.893e+02, percent-clipped=1.0 +2024-01-15 15:40:57,062 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.max_abs, batch_count=41406.666666666664, ans=10.0 +2024-01-15 15:41:10,930 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=41440.0, ans=0.0 +2024-01-15 15:41:17,723 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff2_skip_rate, batch_count=41473.333333333336, ans=0.0018536231884057966 +2024-01-15 15:41:20,111 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass.skip_rate, batch_count=41473.333333333336, ans=0.09899494936611666 +2024-01-15 15:41:21,332 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.skip_rate, batch_count=41473.333333333336, ans=0.09899494936611666 +2024-01-15 15:41:31,179 INFO [train.py:994] (1/2) Epoch 15, batch 650, loss[loss=0.2005, simple_loss=0.2645, pruned_loss=0.06826, over 24392.00 frames. ], tot_loss[loss=0.189, simple_loss=0.2619, pruned_loss=0.05801, over 4622156.64 frames. ], batch size: 153, lr: 2.46e-02, grad_scale: 32.0 +2024-01-15 15:41:36,862 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=6.03 vs. limit=10.0 +2024-01-15 15:41:57,537 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.skip_rate, batch_count=41573.333333333336, ans=0.04949747468305833 +2024-01-15 15:42:08,658 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.nonlin_attention.balancer.prob, batch_count=41606.666666666664, ans=0.125 +2024-01-15 15:42:12,419 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.scale_min, batch_count=41606.666666666664, ans=0.2 +2024-01-15 15:42:31,791 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=41640.0, ans=0.0 +2024-01-15 15:42:33,815 INFO [train.py:994] (1/2) Epoch 15, batch 700, loss[loss=0.1702, simple_loss=0.2384, pruned_loss=0.05096, over 23555.00 frames. ], tot_loss[loss=0.1888, simple_loss=0.2616, pruned_loss=0.05794, over 4643548.97 frames. 
], batch size: 119, lr: 2.46e-02, grad_scale: 32.0 +2024-01-15 15:42:53,562 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.804e+02 2.413e+02 2.693e+02 3.109e+02 4.635e+02, threshold=5.386e+02, percent-clipped=0.0 +2024-01-15 15:43:09,759 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer1.prob, batch_count=41773.333333333336, ans=0.125 +2024-01-15 15:43:12,651 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=41773.333333333336, ans=0.0 +2024-01-15 15:43:18,628 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=41773.333333333336, ans=0.1 +2024-01-15 15:43:27,403 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.attention_skip_rate, batch_count=41806.666666666664, ans=0.0 +2024-01-15 15:43:29,091 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn1.whiten, num_groups=1, num_channels=192, metric=14.42 vs. limit=22.5 +2024-01-15 15:43:33,421 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=41806.666666666664, ans=0.1 +2024-01-15 15:43:35,334 INFO [train.py:994] (1/2) Epoch 15, batch 750, loss[loss=0.184, simple_loss=0.2546, pruned_loss=0.05669, over 24234.00 frames. ], tot_loss[loss=0.1886, simple_loss=0.2616, pruned_loss=0.05776, over 4681061.95 frames. ], batch size: 140, lr: 2.46e-02, grad_scale: 32.0 +2024-01-15 15:43:38,208 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=41840.0, ans=0.1 +2024-01-15 15:44:03,280 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.attention_skip_rate, batch_count=41906.666666666664, ans=0.0 +2024-01-15 15:44:03,693 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=384, metric=18.12 vs. limit=22.5 +2024-01-15 15:44:21,340 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer2.prob, batch_count=41940.0, ans=0.125 +2024-01-15 15:44:25,389 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward2.out_whiten.whitening_limit, batch_count=41973.333333333336, ans=15.0 +2024-01-15 15:44:36,110 INFO [train.py:994] (1/2) Epoch 15, batch 800, loss[loss=0.192, simple_loss=0.2675, pruned_loss=0.05821, over 24406.00 frames. ], tot_loss[loss=0.1885, simple_loss=0.2614, pruned_loss=0.05773, over 4698324.45 frames. 
], batch size: 258, lr: 2.45e-02, grad_scale: 32.0 +2024-01-15 15:44:38,515 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=42006.666666666664, ans=0.0 +2024-01-15 15:44:40,796 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=42006.666666666664, ans=0.0 +2024-01-15 15:44:54,406 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.842e+02 2.395e+02 2.798e+02 3.335e+02 4.587e+02, threshold=5.596e+02, percent-clipped=0.0 +2024-01-15 15:45:06,622 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.ff2_skip_rate, batch_count=42073.333333333336, ans=0.0017231884057971013 +2024-01-15 15:45:49,556 INFO [train.py:994] (1/2) Epoch 16, batch 0, loss[loss=0.1969, simple_loss=0.2736, pruned_loss=0.06009, over 22599.00 frames. ], tot_loss[loss=0.1969, simple_loss=0.2736, pruned_loss=0.06009, over 22599.00 frames. ], batch size: 357, lr: 2.39e-02, grad_scale: 32.0 +2024-01-15 15:45:49,557 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 15:46:09,918 INFO [train.py:1026] (1/2) Epoch 16, validation: loss=0.1735, simple_loss=0.26, pruned_loss=0.04348, over 1622729.00 frames. +2024-01-15 15:46:09,919 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 15:46:18,185 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=42150.0, ans=0.1 +2024-01-15 15:46:22,854 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=42183.333333333336, ans=0.125 +2024-01-15 15:46:32,838 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=4.57 vs. limit=15.0 +2024-01-15 15:46:38,623 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn2.whiten, num_groups=1, num_channels=192, metric=12.58 vs. limit=22.5 +2024-01-15 15:46:54,246 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.scale_min, batch_count=42250.0, ans=0.2 +2024-01-15 15:46:57,154 INFO [scaling.py:1022] (1/2) Whitening: name=encoder_embed.convnext.out_whiten, num_groups=1, num_channels=128, metric=4.69 vs. limit=5.0 +2024-01-15 15:47:08,430 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.skip_rate, batch_count=42283.333333333336, ans=0.07 +2024-01-15 15:47:12,894 INFO [train.py:994] (1/2) Epoch 16, batch 50, loss[loss=0.1575, simple_loss=0.226, pruned_loss=0.04452, over 23558.00 frames. ], tot_loss[loss=0.1855, simple_loss=0.2581, pruned_loss=0.05641, over 1081986.31 frames. 
], batch size: 119, lr: 2.39e-02, grad_scale: 32.0 +2024-01-15 15:47:16,812 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=42316.666666666664, ans=0.0 +2024-01-15 15:47:41,132 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.795e+02 2.203e+02 2.392e+02 2.946e+02 6.436e+02, threshold=4.783e+02, percent-clipped=1.0 +2024-01-15 15:47:46,164 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.prob, batch_count=42383.333333333336, ans=0.125 +2024-01-15 15:48:04,075 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.attention_skip_rate, batch_count=42450.0, ans=0.0 +2024-01-15 15:48:14,453 INFO [train.py:994] (1/2) Epoch 16, batch 100, loss[loss=0.1949, simple_loss=0.2675, pruned_loss=0.06117, over 24479.00 frames. ], tot_loss[loss=0.1849, simple_loss=0.2574, pruned_loss=0.05624, over 1909069.17 frames. ], batch size: 181, lr: 2.39e-02, grad_scale: 32.0 +2024-01-15 15:48:21,588 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=384, metric=19.38 vs. limit=22.5 +2024-01-15 15:48:38,527 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.self_attn1.whiten.whitening_limit, batch_count=42550.0, ans=22.5 +2024-01-15 15:48:48,500 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=42550.0, ans=0.125 +2024-01-15 15:49:00,076 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=512, metric=19.43 vs. limit=22.5 +2024-01-15 15:49:17,572 INFO [train.py:994] (1/2) Epoch 16, batch 150, loss[loss=0.1959, simple_loss=0.2626, pruned_loss=0.06459, over 24565.00 frames. ], tot_loss[loss=0.1858, simple_loss=0.2588, pruned_loss=0.05636, over 2559603.81 frames. ], batch size: 176, lr: 2.38e-02, grad_scale: 32.0 +2024-01-15 15:49:17,887 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer1.prob, batch_count=42650.0, ans=0.125 +2024-01-15 15:49:32,519 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass.skip_rate, batch_count=42683.333333333336, ans=0.035 +2024-01-15 15:49:38,572 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.scale_min, batch_count=42683.333333333336, ans=0.2 +2024-01-15 15:49:43,004 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.attention_skip_rate, batch_count=42716.666666666664, ans=0.0 +2024-01-15 15:49:46,977 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.824e+02 2.275e+02 2.640e+02 3.236e+02 4.692e+02, threshold=5.280e+02, percent-clipped=0.0 +2024-01-15 15:49:55,615 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer1.prob, batch_count=42750.0, ans=0.125 +2024-01-15 15:49:56,237 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.conv_module1.whiten, num_groups=1, num_channels=192, metric=10.81 vs. 
limit=15.0 +2024-01-15 15:50:19,983 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer2.prob, batch_count=42816.666666666664, ans=0.125 +2024-01-15 15:50:20,919 INFO [train.py:994] (1/2) Epoch 16, batch 200, loss[loss=0.1809, simple_loss=0.2592, pruned_loss=0.05133, over 24504.00 frames. ], tot_loss[loss=0.1853, simple_loss=0.2584, pruned_loss=0.05614, over 3045677.72 frames. ], batch size: 193, lr: 2.38e-02, grad_scale: 32.0 +2024-01-15 15:50:29,837 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.ff2_skip_rate, batch_count=42816.666666666664, ans=0.0015615942028985516 +2024-01-15 15:50:35,651 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.0.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 15:50:38,219 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.hidden_balancer.prob, batch_count=42850.0, ans=0.125 +2024-01-15 15:50:44,902 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=9.54 vs. limit=10.0 +2024-01-15 15:50:55,414 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=10.50 vs. limit=15.0 +2024-01-15 15:50:59,196 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer1.prob, batch_count=42916.666666666664, ans=0.125 +2024-01-15 15:51:06,267 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=384, metric=5.70 vs. limit=15.0 +2024-01-15 15:51:08,286 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer1.prob, batch_count=42916.666666666664, ans=0.125 +2024-01-15 15:51:17,816 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer1.prob, batch_count=42950.0, ans=0.125 +2024-01-15 15:51:23,096 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer1.prob, batch_count=42983.333333333336, ans=0.125 +2024-01-15 15:51:23,946 INFO [train.py:994] (1/2) Epoch 16, batch 250, loss[loss=0.1791, simple_loss=0.2585, pruned_loss=0.04992, over 24367.00 frames. ], tot_loss[loss=0.1848, simple_loss=0.2583, pruned_loss=0.05567, over 3446029.67 frames. ], batch size: 298, lr: 2.38e-02, grad_scale: 32.0 +2024-01-15 15:51:28,401 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=42983.333333333336, ans=0.125 +2024-01-15 15:51:31,014 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=3.47 vs. 
limit=15.0 +2024-01-15 15:51:37,977 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer2.prob, batch_count=43016.666666666664, ans=0.125 +2024-01-15 15:51:41,582 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=43016.666666666664, ans=0.125 +2024-01-15 15:51:52,662 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.712e+02 2.217e+02 2.657e+02 3.438e+02 6.026e+02, threshold=5.313e+02, percent-clipped=3.0 +2024-01-15 15:51:56,546 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff3_skip_rate, batch_count=43050.0, ans=0.0015108695652173917 +2024-01-15 15:51:58,944 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer_na.min_abs, batch_count=43050.0, ans=0.02 +2024-01-15 15:52:06,121 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.ff3_skip_rate, batch_count=43083.333333333336, ans=0.001503623188405797 +2024-01-15 15:52:17,545 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.hidden_balancer.prob, batch_count=43116.666666666664, ans=0.125 +2024-01-15 15:52:22,428 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=43116.666666666664, ans=0.125 +2024-01-15 15:52:25,571 INFO [train.py:994] (1/2) Epoch 16, batch 300, loss[loss=0.1775, simple_loss=0.2501, pruned_loss=0.05244, over 24421.00 frames. ], tot_loss[loss=0.1852, simple_loss=0.2589, pruned_loss=0.0558, over 3745955.17 frames. ], batch size: 258, lr: 2.37e-02, grad_scale: 16.0 +2024-01-15 15:52:27,136 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_skip_rate, batch_count=43150.0, ans=0.0 +2024-01-15 15:52:57,516 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=384, metric=2.39 vs. limit=15.0 +2024-01-15 15:53:22,748 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=43283.333333333336, ans=0.0 +2024-01-15 15:53:27,087 INFO [train.py:994] (1/2) Epoch 16, batch 350, loss[loss=0.1902, simple_loss=0.2698, pruned_loss=0.05524, over 24509.00 frames. ], tot_loss[loss=0.1849, simple_loss=0.2585, pruned_loss=0.05564, over 3966057.63 frames. ], batch size: 216, lr: 2.37e-02, grad_scale: 16.0 +2024-01-15 15:53:49,156 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=512, metric=3.88 vs. limit=15.0 +2024-01-15 15:53:56,847 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.814e+02 2.130e+02 2.322e+02 2.639e+02 6.815e+02, threshold=4.644e+02, percent-clipped=1.0 +2024-01-15 15:54:05,284 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.whiten, num_groups=1, num_channels=512, metric=3.62 vs. limit=12.0 +2024-01-15 15:54:30,239 INFO [train.py:994] (1/2) Epoch 16, batch 400, loss[loss=0.1848, simple_loss=0.2545, pruned_loss=0.05751, over 24400.00 frames. ], tot_loss[loss=0.1849, simple_loss=0.2583, pruned_loss=0.0557, over 4152445.08 frames. 
], batch size: 153, lr: 2.37e-02, grad_scale: 32.0 +2024-01-15 15:55:13,524 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.attention_skip_rate, batch_count=43583.333333333336, ans=0.0 +2024-01-15 15:55:17,158 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=43583.333333333336, ans=0.125 +2024-01-15 15:55:21,135 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.hidden_balancer.prob, batch_count=43616.666666666664, ans=0.125 +2024-01-15 15:55:24,234 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=192, metric=12.02 vs. limit=15.0 +2024-01-15 15:55:32,854 INFO [train.py:994] (1/2) Epoch 16, batch 450, loss[loss=0.1919, simple_loss=0.2612, pruned_loss=0.06126, over 24497.00 frames. ], tot_loss[loss=0.184, simple_loss=0.2577, pruned_loss=0.05512, over 4300596.21 frames. ], batch size: 181, lr: 2.36e-02, grad_scale: 16.0 +2024-01-15 15:55:36,594 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.attention_skip_rate, batch_count=43650.0, ans=0.0 +2024-01-15 15:55:36,617 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.ff2_skip_rate, batch_count=43650.0, ans=0.0013804347826086964 +2024-01-15 15:56:03,977 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.746e+02 2.141e+02 2.320e+02 2.673e+02 3.975e+02, threshold=4.639e+02, percent-clipped=0.0 +2024-01-15 15:56:23,394 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward2.hidden_balancer.prob, batch_count=43783.333333333336, ans=0.125 +2024-01-15 15:56:35,839 INFO [train.py:994] (1/2) Epoch 16, batch 500, loss[loss=0.1915, simple_loss=0.2596, pruned_loss=0.06167, over 24548.00 frames. ], tot_loss[loss=0.1839, simple_loss=0.2578, pruned_loss=0.05505, over 4421475.30 frames. ], batch size: 204, lr: 2.36e-02, grad_scale: 16.0 +2024-01-15 15:56:38,317 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=17.17 vs. limit=22.5 +2024-01-15 15:56:41,728 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.balancer1.prob, batch_count=43816.666666666664, ans=0.125 +2024-01-15 15:57:38,738 INFO [train.py:994] (1/2) Epoch 16, batch 550, loss[loss=0.157, simple_loss=0.2294, pruned_loss=0.04233, over 24231.00 frames. ], tot_loss[loss=0.1842, simple_loss=0.2581, pruned_loss=0.05516, over 4514008.35 frames. ], batch size: 140, lr: 2.36e-02, grad_scale: 16.0 +2024-01-15 15:57:41,830 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=7.18 vs. 
limit=10.0 +2024-01-15 15:57:49,420 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=43983.333333333336, ans=0.0 +2024-01-15 15:57:49,423 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.prob, batch_count=43983.333333333336, ans=0.125 +2024-01-15 15:58:09,544 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff2_skip_rate, batch_count=44050.0, ans=0.0012934782608695662 +2024-01-15 15:58:10,239 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.806e+02 2.090e+02 2.410e+02 2.866e+02 4.101e+02, threshold=4.819e+02, percent-clipped=0.0 +2024-01-15 15:58:24,571 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer2.prob, batch_count=44083.333333333336, ans=0.125 +2024-01-15 15:58:33,127 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff2_skip_rate, batch_count=44116.666666666664, ans=0.0012789855072463784 +2024-01-15 15:58:35,152 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=11.74 vs. limit=15.0 +2024-01-15 15:58:37,227 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer2.prob, batch_count=44116.666666666664, ans=0.125 +2024-01-15 15:58:41,789 INFO [train.py:994] (1/2) Epoch 16, batch 600, loss[loss=0.1882, simple_loss=0.2612, pruned_loss=0.05756, over 24494.00 frames. ], tot_loss[loss=0.1838, simple_loss=0.2578, pruned_loss=0.05491, over 4582266.60 frames. ], batch size: 267, lr: 2.35e-02, grad_scale: 16.0 +2024-01-15 15:58:44,547 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=44150.0, ans=0.1 +2024-01-15 15:58:47,517 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=44150.0, ans=0.125 +2024-01-15 15:58:57,245 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer1.prob, batch_count=44183.333333333336, ans=0.125 +2024-01-15 15:59:00,855 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.balancer1.prob, batch_count=44183.333333333336, ans=0.125 +2024-01-15 15:59:31,391 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward2.hidden_balancer.prob, batch_count=44283.333333333336, ans=0.125 +2024-01-15 15:59:44,809 INFO [train.py:994] (1/2) Epoch 16, batch 650, loss[loss=0.1634, simple_loss=0.2328, pruned_loss=0.04696, over 23448.00 frames. ], tot_loss[loss=0.1835, simple_loss=0.2574, pruned_loss=0.0548, over 4625800.73 frames. 
], batch size: 119, lr: 2.35e-02, grad_scale: 16.0 +2024-01-15 15:59:58,967 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=44350.0, ans=0.1 +2024-01-15 16:00:00,138 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=44350.0, ans=0.1 +2024-01-15 16:00:03,598 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer1.prob, batch_count=44350.0, ans=0.125 +2024-01-15 16:00:05,916 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff2_skip_rate, batch_count=44350.0, ans=0.0012282608695652168 +2024-01-15 16:00:15,762 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.747e+02 2.324e+02 2.634e+02 2.982e+02 4.280e+02, threshold=5.267e+02, percent-clipped=0.0 +2024-01-15 16:00:17,762 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=9.00 vs. limit=15.0 +2024-01-15 16:00:22,675 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff2_skip_rate, batch_count=44416.666666666664, ans=0.0012137681159420308 +2024-01-15 16:00:35,964 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=44450.0, ans=0.0 +2024-01-15 16:00:47,144 INFO [train.py:994] (1/2) Epoch 16, batch 700, loss[loss=0.2115, simple_loss=0.2854, pruned_loss=0.06879, over 22450.00 frames. ], tot_loss[loss=0.1841, simple_loss=0.2583, pruned_loss=0.05501, over 4673683.75 frames. ], batch size: 357, lr: 2.35e-02, grad_scale: 16.0 +2024-01-15 16:00:54,452 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.scale_min, batch_count=44483.333333333336, ans=0.2 +2024-01-15 16:01:04,641 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff3_skip_rate, batch_count=44516.666666666664, ans=0.0011920289855072482 +2024-01-15 16:01:17,236 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer1.prob, batch_count=44550.0, ans=0.125 +2024-01-15 16:01:48,814 INFO [train.py:994] (1/2) Epoch 16, batch 750, loss[loss=0.1954, simple_loss=0.2704, pruned_loss=0.06015, over 24507.00 frames. ], tot_loss[loss=0.1838, simple_loss=0.2577, pruned_loss=0.05494, over 4699005.18 frames. ], batch size: 210, lr: 2.34e-02, grad_scale: 16.0 +2024-01-15 16:01:50,630 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=512, metric=18.99 vs. limit=22.5 +2024-01-15 16:01:56,416 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=5.47 vs. 
limit=6.0 +2024-01-15 16:02:13,082 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.out_combiner.scale_min, batch_count=44716.666666666664, ans=0.2 +2024-01-15 16:02:15,457 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.hidden_balancer.prob, batch_count=44716.666666666664, ans=0.125 +2024-01-15 16:02:20,415 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.844e+02 2.335e+02 2.612e+02 3.145e+02 7.520e+02, threshold=5.223e+02, percent-clipped=1.0 +2024-01-15 16:02:25,222 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=44750.0, ans=0.1 +2024-01-15 16:02:44,602 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=44783.333333333336, ans=0.125 +2024-01-15 16:02:48,947 INFO [train.py:994] (1/2) Epoch 16, batch 800, loss[loss=0.1887, simple_loss=0.2587, pruned_loss=0.0594, over 24426.00 frames. ], tot_loss[loss=0.1828, simple_loss=0.2565, pruned_loss=0.05452, over 4719398.87 frames. ], batch size: 159, lr: 2.34e-02, grad_scale: 32.0 +2024-01-15 16:03:00,486 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass.scale_min, batch_count=44850.0, ans=0.2 +2024-01-15 16:03:20,673 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=44883.333333333336, ans=0.1 +2024-01-15 16:03:31,890 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.bypass.skip_rate, batch_count=44916.666666666664, ans=0.07 +2024-01-15 16:04:02,758 INFO [train.py:994] (1/2) Epoch 17, batch 0, loss[loss=0.1997, simple_loss=0.2744, pruned_loss=0.06252, over 24606.00 frames. ], tot_loss[loss=0.1997, simple_loss=0.2744, pruned_loss=0.06252, over 24606.00 frames. ], batch size: 199, lr: 2.28e-02, grad_scale: 32.0 +2024-01-15 16:04:02,758 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 16:04:10,346 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.5.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.7798, 3.2947, 3.9728, 2.5904], device='cuda:1') +2024-01-15 16:04:22,814 INFO [train.py:1026] (1/2) Epoch 17, validation: loss=0.1728, simple_loss=0.2592, pruned_loss=0.04315, over 1622729.00 frames. +2024-01-15 16:04:22,815 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 16:04:37,300 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.min_positive, batch_count=44993.333333333336, ans=0.025 +2024-01-15 16:04:46,231 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer1.prob, batch_count=45026.666666666664, ans=0.125 +2024-01-15 16:04:47,327 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.prob, batch_count=45026.666666666664, ans=0.125 +2024-01-15 16:04:49,972 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=192, metric=9.15 vs. 
limit=15.0 +2024-01-15 16:05:02,228 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.790e+02 2.297e+02 2.683e+02 3.319e+02 6.335e+02, threshold=5.366e+02, percent-clipped=3.0 +2024-01-15 16:05:25,074 INFO [train.py:994] (1/2) Epoch 17, batch 50, loss[loss=0.1641, simple_loss=0.2426, pruned_loss=0.04273, over 24515.00 frames. ], tot_loss[loss=0.1811, simple_loss=0.2547, pruned_loss=0.0538, over 1086357.35 frames. ], batch size: 243, lr: 2.28e-02, grad_scale: 32.0 +2024-01-15 16:05:27,712 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass_mid.scale_min, batch_count=45126.666666666664, ans=0.2 +2024-01-15 16:05:40,733 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.out_combiner.scale_min, batch_count=45160.0, ans=0.2 +2024-01-15 16:05:41,169 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=384, metric=4.50 vs. limit=15.0 +2024-01-15 16:05:44,293 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.72 vs. limit=6.0 +2024-01-15 16:05:52,222 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=45193.333333333336, ans=0.125 +2024-01-15 16:05:58,013 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.hidden_balancer.prob, batch_count=45193.333333333336, ans=0.125 +2024-01-15 16:06:10,094 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=45226.666666666664, ans=0.125 +2024-01-15 16:06:26,381 INFO [train.py:994] (1/2) Epoch 17, batch 100, loss[loss=0.1743, simple_loss=0.2485, pruned_loss=0.05005, over 24538.00 frames. ], tot_loss[loss=0.1809, simple_loss=0.2547, pruned_loss=0.05359, over 1910311.88 frames. ], batch size: 236, lr: 2.28e-02, grad_scale: 32.0 +2024-01-15 16:06:40,690 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.layerdrop_rate, batch_count=45326.666666666664, ans=0.015 +2024-01-15 16:06:50,986 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=384, metric=15.01 vs. limit=22.5 +2024-01-15 16:07:07,393 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.680e+02 2.239e+02 2.511e+02 2.975e+02 4.256e+02, threshold=5.022e+02, percent-clipped=0.0 +2024-01-15 16:07:17,628 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=45426.666666666664, ans=0.125 +2024-01-15 16:07:30,237 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=45460.0, ans=0.125 +2024-01-15 16:07:31,163 INFO [train.py:994] (1/2) Epoch 17, batch 150, loss[loss=0.2036, simple_loss=0.2746, pruned_loss=0.06632, over 24482.00 frames. ], tot_loss[loss=0.1809, simple_loss=0.255, pruned_loss=0.05342, over 2553162.93 frames. 
], batch size: 210, lr: 2.27e-02, grad_scale: 32.0 +2024-01-15 16:07:38,554 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.prob, batch_count=45460.0, ans=0.125 +2024-01-15 16:07:59,248 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=45526.666666666664, ans=0.1 +2024-01-15 16:08:18,660 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass_mid.scale_min, batch_count=45560.0, ans=0.2 +2024-01-15 16:08:24,272 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff2_skip_rate, batch_count=45593.333333333336, ans=0.0009579710144927527 +2024-01-15 16:08:27,810 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=45593.333333333336, ans=0.125 +2024-01-15 16:08:33,424 INFO [train.py:994] (1/2) Epoch 17, batch 200, loss[loss=0.1785, simple_loss=0.2515, pruned_loss=0.05275, over 24475.00 frames. ], tot_loss[loss=0.1812, simple_loss=0.2555, pruned_loss=0.05343, over 3056112.16 frames. ], batch size: 216, lr: 2.27e-02, grad_scale: 32.0 +2024-01-15 16:08:45,686 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=45660.0, ans=0.125 +2024-01-15 16:08:50,298 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.min_positive, batch_count=45660.0, ans=0.025 +2024-01-15 16:08:53,165 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=512, metric=20.31 vs. limit=22.5 +2024-01-15 16:09:14,144 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.829e+02 2.236e+02 2.656e+02 3.265e+02 4.776e+02, threshold=5.312e+02, percent-clipped=0.0 +2024-01-15 16:09:18,058 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=45726.666666666664, ans=0.125 +2024-01-15 16:09:35,446 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff2_skip_rate, batch_count=45793.333333333336, ans=0.0009144927536231875 +2024-01-15 16:09:36,312 INFO [train.py:994] (1/2) Epoch 17, batch 250, loss[loss=0.1751, simple_loss=0.254, pruned_loss=0.0481, over 24211.00 frames. ], tot_loss[loss=0.1802, simple_loss=0.2543, pruned_loss=0.05303, over 3429829.05 frames. ], batch size: 311, lr: 2.27e-02, grad_scale: 32.0 +2024-01-15 16:10:24,694 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_skip_rate, batch_count=45893.333333333336, ans=0.0 +2024-01-15 16:10:25,857 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.balancer.min_positive, batch_count=45926.666666666664, ans=0.05 +2024-01-15 16:10:39,413 INFO [train.py:994] (1/2) Epoch 17, batch 300, loss[loss=0.1654, simple_loss=0.2414, pruned_loss=0.04468, over 24333.00 frames. ], tot_loss[loss=0.18, simple_loss=0.2541, pruned_loss=0.05293, over 3728219.83 frames. 
], batch size: 147, lr: 2.26e-02, grad_scale: 32.0 +2024-01-15 16:10:45,632 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_skip_rate, batch_count=45960.0, ans=0.0 +2024-01-15 16:10:45,659 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer2.min_abs, batch_count=45960.0, ans=0.5 +2024-01-15 16:11:02,374 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=45993.333333333336, ans=0.125 +2024-01-15 16:11:18,327 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer1.prob, batch_count=46060.0, ans=0.125 +2024-01-15 16:11:19,288 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.788e+02 2.220e+02 2.502e+02 2.982e+02 4.452e+02, threshold=5.005e+02, percent-clipped=0.0 +2024-01-15 16:11:29,734 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer_na.min_abs, batch_count=46093.333333333336, ans=0.02 +2024-01-15 16:11:42,058 INFO [train.py:994] (1/2) Epoch 17, batch 350, loss[loss=0.1422, simple_loss=0.203, pruned_loss=0.04074, over 16449.00 frames. ], tot_loss[loss=0.1802, simple_loss=0.2542, pruned_loss=0.05311, over 3963891.93 frames. ], batch size: 70, lr: 2.26e-02, grad_scale: 32.0 +2024-01-15 16:11:42,327 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer2.prob, batch_count=46126.666666666664, ans=0.125 +2024-01-15 16:11:52,625 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=46126.666666666664, ans=0.1 +2024-01-15 16:12:11,252 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=46193.333333333336, ans=0.1 +2024-01-15 16:12:31,314 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff3_skip_rate, batch_count=46260.0, ans=0.0008130434782608695 +2024-01-15 16:12:40,265 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer2.prob, batch_count=46260.0, ans=0.125 +2024-01-15 16:12:45,165 INFO [train.py:994] (1/2) Epoch 17, batch 400, loss[loss=0.1809, simple_loss=0.2553, pruned_loss=0.05329, over 24430.00 frames. ], tot_loss[loss=0.18, simple_loss=0.2543, pruned_loss=0.05292, over 4153126.75 frames. 
], batch size: 250, lr: 2.26e-02, grad_scale: 32.0 +2024-01-15 16:12:49,005 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.0.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 16:12:59,177 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=46326.666666666664, ans=0.1 +2024-01-15 16:12:59,219 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer2.min_positive, batch_count=46326.666666666664, ans=0.05 +2024-01-15 16:13:04,174 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=46326.666666666664, ans=0.0 +2024-01-15 16:13:25,828 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.732e+02 2.173e+02 2.430e+02 2.709e+02 4.420e+02, threshold=4.860e+02, percent-clipped=0.0 +2024-01-15 16:13:26,486 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.whiten, num_groups=1, num_channels=512, metric=7.41 vs. limit=12.0 +2024-01-15 16:13:29,937 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=384, metric=17.58 vs. limit=22.5 +2024-01-15 16:13:37,408 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer2.prob, batch_count=46426.666666666664, ans=0.125 +2024-01-15 16:13:47,997 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=8.75 vs. limit=15.0 +2024-01-15 16:13:48,337 INFO [train.py:994] (1/2) Epoch 17, batch 450, loss[loss=0.1589, simple_loss=0.2319, pruned_loss=0.04299, over 23983.00 frames. ], tot_loss[loss=0.1798, simple_loss=0.2542, pruned_loss=0.05275, over 4289048.22 frames. ], batch size: 131, lr: 2.26e-02, grad_scale: 32.0 +2024-01-15 16:13:49,015 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=384, metric=3.96 vs. limit=15.0 +2024-01-15 16:14:18,762 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=12.87 vs. limit=15.0 +2024-01-15 16:14:24,726 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=4.49 vs. limit=10.0 +2024-01-15 16:14:26,988 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=15.54 vs. limit=15.0 +2024-01-15 16:14:50,456 INFO [train.py:994] (1/2) Epoch 17, batch 500, loss[loss=0.1743, simple_loss=0.2484, pruned_loss=0.05007, over 24603.00 frames. ], tot_loss[loss=0.1801, simple_loss=0.2544, pruned_loss=0.05292, over 4404148.44 frames. 
], batch size: 199, lr: 2.25e-02, grad_scale: 32.0 +2024-01-15 16:14:53,052 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=46626.666666666664, ans=0.125 +2024-01-15 16:14:55,909 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer1.min_positive, batch_count=46626.666666666664, ans=0.025 +2024-01-15 16:15:00,724 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 16:15:03,749 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer2.prob, batch_count=46660.0, ans=0.125 +2024-01-15 16:15:09,842 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff2_skip_rate, batch_count=46660.0, ans=0.0007260869565217393 +2024-01-15 16:15:18,647 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=4.08 vs. limit=12.0 +2024-01-15 16:15:31,126 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.769e+02 2.099e+02 2.328e+02 2.632e+02 3.698e+02, threshold=4.655e+02, percent-clipped=0.0 +2024-01-15 16:15:31,501 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module2.balancer1.prob, batch_count=46726.666666666664, ans=0.125 +2024-01-15 16:15:42,402 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.scale_min, batch_count=46760.0, ans=0.2 +2024-01-15 16:15:43,952 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=384, metric=22.29 vs. limit=22.5 +2024-01-15 16:15:48,940 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer1.prob, batch_count=46760.0, ans=0.125 +2024-01-15 16:15:53,389 INFO [train.py:994] (1/2) Epoch 17, batch 550, loss[loss=0.1781, simple_loss=0.2491, pruned_loss=0.05351, over 24466.00 frames. ], tot_loss[loss=0.1802, simple_loss=0.2547, pruned_loss=0.05287, over 4499954.11 frames. ], batch size: 181, lr: 2.25e-02, grad_scale: 32.0 +2024-01-15 16:16:01,030 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn1.whiten, num_groups=1, num_channels=192, metric=12.91 vs. limit=22.5 +2024-01-15 16:16:09,722 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer1.max_abs, batch_count=46826.666666666664, ans=10.0 +2024-01-15 16:16:21,081 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.out_balancer.prob, batch_count=46860.0, ans=0.125 +2024-01-15 16:16:28,287 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.whiten, num_groups=1, num_channels=192, metric=3.96 vs. limit=12.0 +2024-01-15 16:16:39,620 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=46893.333333333336, ans=0.1 +2024-01-15 16:16:56,190 INFO [train.py:994] (1/2) Epoch 17, batch 600, loss[loss=0.1508, simple_loss=0.2163, pruned_loss=0.04265, over 23540.00 frames. ], tot_loss[loss=0.1798, simple_loss=0.2542, pruned_loss=0.05268, over 4566791.59 frames. 
], batch size: 119, lr: 2.25e-02, grad_scale: 32.0 +2024-01-15 16:17:23,925 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=47026.666666666664, ans=0.125 +2024-01-15 16:17:26,198 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.ff2_skip_rate, batch_count=47026.666666666664, ans=0.0006463768115942039 +2024-01-15 16:17:36,248 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.786e+02 2.170e+02 2.416e+02 2.985e+02 4.371e+02, threshold=4.832e+02, percent-clipped=0.0 +2024-01-15 16:17:40,064 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=15.27 vs. limit=15.0 +2024-01-15 16:17:45,965 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=8.20 vs. limit=15.0 +2024-01-15 16:17:56,318 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=47093.333333333336, ans=0.1 +2024-01-15 16:17:58,328 INFO [train.py:994] (1/2) Epoch 17, batch 650, loss[loss=0.1586, simple_loss=0.2333, pruned_loss=0.042, over 23948.00 frames. ], tot_loss[loss=0.1796, simple_loss=0.2543, pruned_loss=0.05249, over 4619060.51 frames. ], batch size: 131, lr: 2.24e-02, grad_scale: 32.0 +2024-01-15 16:18:55,499 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.prob, batch_count=47260.0, ans=0.125 +2024-01-15 16:18:58,731 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=384, metric=19.27 vs. limit=22.5 +2024-01-15 16:19:01,743 INFO [train.py:994] (1/2) Epoch 17, batch 700, loss[loss=0.1857, simple_loss=0.2567, pruned_loss=0.05735, over 24526.00 frames. ], tot_loss[loss=0.1796, simple_loss=0.254, pruned_loss=0.05258, over 4646565.62 frames. ], batch size: 165, lr: 2.24e-02, grad_scale: 32.0 +2024-01-15 16:19:20,168 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.balancer2.prob, batch_count=47326.666666666664, ans=0.125 +2024-01-15 16:19:41,213 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.873e+02 2.097e+02 2.365e+02 2.867e+02 5.331e+02, threshold=4.730e+02, percent-clipped=3.0 +2024-01-15 16:19:51,375 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass.scale_min, batch_count=47426.666666666664, ans=0.2 +2024-01-15 16:20:04,060 INFO [train.py:994] (1/2) Epoch 17, batch 750, loss[loss=0.1865, simple_loss=0.2608, pruned_loss=0.05609, over 24482.00 frames. ], tot_loss[loss=0.1804, simple_loss=0.2547, pruned_loss=0.053, over 4684560.37 frames. 
], batch size: 187, lr: 2.24e-02, grad_scale: 32.0 +2024-01-15 16:20:05,525 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer2.prob, batch_count=47460.0, ans=0.125 +2024-01-15 16:20:18,845 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=47493.333333333336, ans=0.0 +2024-01-15 16:20:24,823 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass_mid.scale_min, batch_count=47493.333333333336, ans=0.2 +2024-01-15 16:20:28,960 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=13.87 vs. limit=15.0 +2024-01-15 16:20:59,156 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.attention_skip_rate, batch_count=47593.333333333336, ans=0.0 +2024-01-15 16:21:04,613 INFO [train.py:994] (1/2) Epoch 17, batch 800, loss[loss=0.1792, simple_loss=0.2523, pruned_loss=0.05305, over 24490.00 frames. ], tot_loss[loss=0.1796, simple_loss=0.254, pruned_loss=0.05259, over 4706477.28 frames. ], batch size: 222, lr: 2.23e-02, grad_scale: 32.0 +2024-01-15 16:21:39,081 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward2.out_whiten.whitening_limit, batch_count=47726.666666666664, ans=15.0 +2024-01-15 16:21:42,982 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.821e+02 2.266e+02 2.660e+02 3.077e+02 4.335e+02, threshold=5.320e+02, percent-clipped=0.0 +2024-01-15 16:21:43,215 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=47726.666666666664, ans=0.125 +2024-01-15 16:22:17,933 INFO [train.py:994] (1/2) Epoch 18, batch 0, loss[loss=0.1759, simple_loss=0.2549, pruned_loss=0.04846, over 24472.00 frames. ], tot_loss[loss=0.1759, simple_loss=0.2549, pruned_loss=0.04846, over 24472.00 frames. ], batch size: 216, lr: 2.18e-02, grad_scale: 32.0 +2024-01-15 16:22:17,933 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 16:22:38,461 INFO [train.py:1026] (1/2) Epoch 18, validation: loss=0.1722, simple_loss=0.258, pruned_loss=0.04322, over 1622729.00 frames. +2024-01-15 16:22:38,461 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 16:22:42,248 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.hidden_balancer.prob, batch_count=47770.0, ans=0.125 +2024-01-15 16:22:50,674 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer1.prob, batch_count=47803.333333333336, ans=0.125 +2024-01-15 16:23:08,824 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=384, metric=2.87 vs. limit=15.0 +2024-01-15 16:23:22,962 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=384, metric=22.49 vs. limit=22.5 +2024-01-15 16:23:36,825 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer1.prob, batch_count=47903.333333333336, ans=0.125 +2024-01-15 16:23:40,135 INFO [train.py:994] (1/2) Epoch 18, batch 50, loss[loss=0.1798, simple_loss=0.2554, pruned_loss=0.05211, over 24461.00 frames. ], tot_loss[loss=0.1799, simple_loss=0.2538, pruned_loss=0.05296, over 1085096.20 frames. 
], batch size: 267, lr: 2.18e-02, grad_scale: 32.0 +2024-01-15 16:24:04,818 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=48003.333333333336, ans=0.1 +2024-01-15 16:24:11,995 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=48003.333333333336, ans=0.125 +2024-01-15 16:24:19,590 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=48036.666666666664, ans=0.0 +2024-01-15 16:24:30,477 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.809e+02 2.223e+02 2.437e+02 2.877e+02 4.793e+02, threshold=4.874e+02, percent-clipped=0.0 +2024-01-15 16:24:42,523 INFO [train.py:994] (1/2) Epoch 18, batch 100, loss[loss=0.1816, simple_loss=0.26, pruned_loss=0.05157, over 24542.00 frames. ], tot_loss[loss=0.1776, simple_loss=0.252, pruned_loss=0.05153, over 1918902.40 frames. ], batch size: 187, lr: 2.18e-02, grad_scale: 32.0 +2024-01-15 16:24:46,218 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=384, metric=2.36 vs. limit=15.0 +2024-01-15 16:24:46,218 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.whiten.whitening_limit, batch_count=48103.333333333336, ans=15.0 +2024-01-15 16:25:04,734 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.prob, batch_count=48136.666666666664, ans=0.125 +2024-01-15 16:25:12,582 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=48170.0, ans=0.0 +2024-01-15 16:25:13,098 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=384, metric=6.09 vs. limit=15.0 +2024-01-15 16:25:15,709 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer2.prob, batch_count=48170.0, ans=0.125 +2024-01-15 16:25:46,401 INFO [train.py:994] (1/2) Epoch 18, batch 150, loss[loss=0.1647, simple_loss=0.2405, pruned_loss=0.04442, over 24499.00 frames. ], tot_loss[loss=0.1768, simple_loss=0.2512, pruned_loss=0.05117, over 2556267.05 frames. ], batch size: 229, lr: 2.17e-02, grad_scale: 32.0 +2024-01-15 16:25:47,964 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=48270.0, ans=0.0 +2024-01-15 16:25:49,163 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.ff2_skip_rate, batch_count=48270.0, ans=0.0003760869565217397 +2024-01-15 16:25:50,819 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=6.64 vs. 
limit=15.0 +2024-01-15 16:26:11,816 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer2.prob, batch_count=48336.666666666664, ans=0.125 +2024-01-15 16:26:22,342 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.ff3_skip_rate, batch_count=48370.0, ans=0.00035434782608695717 +2024-01-15 16:26:22,348 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=48370.0, ans=0.125 +2024-01-15 16:26:31,839 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer2.min_abs, batch_count=48370.0, ans=0.5 +2024-01-15 16:26:36,204 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.829e+02 2.101e+02 2.311e+02 2.686e+02 3.880e+02, threshold=4.622e+02, percent-clipped=0.0 +2024-01-15 16:26:48,288 INFO [train.py:994] (1/2) Epoch 18, batch 200, loss[loss=0.1812, simple_loss=0.2578, pruned_loss=0.05232, over 24540.00 frames. ], tot_loss[loss=0.1766, simple_loss=0.251, pruned_loss=0.05106, over 3054782.11 frames. ], batch size: 243, lr: 2.17e-02, grad_scale: 32.0 +2024-01-15 16:27:28,443 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=3.66 vs. limit=10.0 +2024-01-15 16:27:30,465 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=48536.666666666664, ans=0.1 +2024-01-15 16:27:33,388 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=26.99 vs. limit=22.5 +2024-01-15 16:27:41,243 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.attention_skip_rate, batch_count=48570.0, ans=0.0 +2024-01-15 16:27:41,335 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff3_skip_rate, batch_count=48570.0, ans=0.00031086956521739206 +2024-01-15 16:27:53,065 INFO [train.py:994] (1/2) Epoch 18, batch 250, loss[loss=0.1931, simple_loss=0.2703, pruned_loss=0.05798, over 24492.00 frames. ], tot_loss[loss=0.1765, simple_loss=0.2512, pruned_loss=0.05089, over 3430923.37 frames. ], batch size: 187, lr: 2.17e-02, grad_scale: 32.0 +2024-01-15 16:27:55,788 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 16:28:10,106 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=48636.666666666664, ans=0.1 +2024-01-15 16:28:10,162 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 16:28:13,685 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff2_skip_rate, batch_count=48636.666666666664, ans=0.00029637681159420254 +2024-01-15 16:28:31,835 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=192, metric=9.47 vs. limit=15.0 +2024-01-15 16:28:35,529 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn1.whiten, num_groups=1, num_channels=192, metric=14.72 vs. 
limit=22.5 +2024-01-15 16:28:44,347 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.879e+02 2.119e+02 2.331e+02 2.735e+02 4.692e+02, threshold=4.662e+02, percent-clipped=2.0 +2024-01-15 16:28:49,476 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer2.prob, batch_count=48736.666666666664, ans=0.125 +2024-01-15 16:28:49,489 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer1.prob, batch_count=48736.666666666664, ans=0.125 +2024-01-15 16:28:56,398 INFO [train.py:994] (1/2) Epoch 18, batch 300, loss[loss=0.1944, simple_loss=0.271, pruned_loss=0.05891, over 22282.00 frames. ], tot_loss[loss=0.177, simple_loss=0.2521, pruned_loss=0.05101, over 3737837.06 frames. ], batch size: 357, lr: 2.16e-02, grad_scale: 32.0 +2024-01-15 16:29:23,413 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=48836.666666666664, ans=0.125 +2024-01-15 16:30:00,009 INFO [train.py:994] (1/2) Epoch 18, batch 350, loss[loss=0.1772, simple_loss=0.2535, pruned_loss=0.05046, over 24595.00 frames. ], tot_loss[loss=0.177, simple_loss=0.2522, pruned_loss=0.05092, over 3983890.15 frames. ], batch size: 199, lr: 2.16e-02, grad_scale: 32.0 +2024-01-15 16:30:07,040 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff2_skip_rate, batch_count=48936.666666666664, ans=0.00023115942028985488 +2024-01-15 16:30:47,900 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.whiten, num_groups=1, num_channels=384, metric=6.04 vs. limit=12.0 +2024-01-15 16:30:50,931 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.733e+02 2.195e+02 2.456e+02 2.912e+02 4.150e+02, threshold=4.913e+02, percent-clipped=0.0 +2024-01-15 16:30:51,667 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.whiten.whitening_limit, batch_count=49070.0, ans=12.0 +2024-01-15 16:30:56,579 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 16:30:59,882 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=49070.0, ans=0.1 +2024-01-15 16:31:03,351 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=49103.333333333336, ans=0.0 +2024-01-15 16:31:04,286 INFO [train.py:994] (1/2) Epoch 18, batch 400, loss[loss=0.17, simple_loss=0.2473, pruned_loss=0.04635, over 24395.00 frames. ], tot_loss[loss=0.1765, simple_loss=0.2514, pruned_loss=0.05077, over 4165763.76 frames. ], batch size: 159, lr: 2.16e-02, grad_scale: 32.0 +2024-01-15 16:31:05,866 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 16:31:32,719 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward2.hidden_balancer.prob, batch_count=49170.0, ans=0.125 +2024-01-15 16:32:07,758 INFO [train.py:994] (1/2) Epoch 18, batch 450, loss[loss=0.1757, simple_loss=0.2469, pruned_loss=0.05218, over 24472.00 frames. ], tot_loss[loss=0.1762, simple_loss=0.2513, pruned_loss=0.05058, over 4314796.77 frames. 
], batch size: 216, lr: 2.16e-02, grad_scale: 32.0 +2024-01-15 16:32:10,481 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=49270.0, ans=0.1 +2024-01-15 16:32:21,509 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=49303.333333333336, ans=0.125 +2024-01-15 16:32:26,455 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer1.prob, batch_count=49303.333333333336, ans=0.125 +2024-01-15 16:32:45,863 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=384, metric=10.33 vs. limit=15.0 +2024-01-15 16:32:46,020 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=192, metric=11.29 vs. limit=15.0 +2024-01-15 16:32:52,753 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer2.prob, batch_count=49370.0, ans=0.125 +2024-01-15 16:32:54,076 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass_mid.scale_min, batch_count=49370.0, ans=0.2 +2024-01-15 16:32:59,685 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.700e+02 2.073e+02 2.302e+02 2.664e+02 3.981e+02, threshold=4.605e+02, percent-clipped=0.0 +2024-01-15 16:33:12,294 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff3_skip_rate, batch_count=49436.666666666664, ans=0.00012246376811594212 +2024-01-15 16:33:13,173 INFO [train.py:994] (1/2) Epoch 18, batch 500, loss[loss=0.1796, simple_loss=0.2515, pruned_loss=0.05387, over 24535.00 frames. ], tot_loss[loss=0.1759, simple_loss=0.2511, pruned_loss=0.05034, over 4414270.75 frames. ], batch size: 236, lr: 2.15e-02, grad_scale: 32.0 +2024-01-15 16:33:34,488 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer1.max_abs, batch_count=49470.0, ans=10.0 +2024-01-15 16:33:41,371 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.5.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 16:33:44,734 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.attention_skip_rate, batch_count=49503.333333333336, ans=0.0 +2024-01-15 16:33:46,493 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.whiten, num_groups=1, num_channels=512, metric=5.17 vs. limit=12.0 +2024-01-15 16:33:48,421 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=49503.333333333336, ans=0.0 +2024-01-15 16:33:54,888 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward2.out_whiten.whitening_limit, batch_count=49536.666666666664, ans=15.0 +2024-01-15 16:33:55,687 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.ff2_skip_rate, batch_count=49536.666666666664, ans=0.00010072463768115957 +2024-01-15 16:34:12,336 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.balancer2.prob, batch_count=49570.0, ans=0.125 +2024-01-15 16:34:15,825 INFO [train.py:994] (1/2) Epoch 18, batch 550, loss[loss=0.1847, simple_loss=0.2592, pruned_loss=0.0551, over 24491.00 frames. ], tot_loss[loss=0.176, simple_loss=0.2514, pruned_loss=0.05027, over 4512683.31 frames. 
], batch size: 216, lr: 2.15e-02, grad_scale: 16.0 +2024-01-15 16:34:26,268 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=49603.333333333336, ans=0.125 +2024-01-15 16:34:39,127 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=3.42 vs. limit=15.0 +2024-01-15 16:34:47,441 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.balancer.min_positive, batch_count=49670.0, ans=0.05 +2024-01-15 16:35:01,224 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer2.prob, batch_count=49703.333333333336, ans=0.125 +2024-01-15 16:35:07,148 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass_mid.scale_min, batch_count=49736.666666666664, ans=0.2 +2024-01-15 16:35:08,017 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.782e+02 2.215e+02 2.472e+02 3.117e+02 4.918e+02, threshold=4.944e+02, percent-clipped=1.0 +2024-01-15 16:35:08,334 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.attention_skip_rate, batch_count=49736.666666666664, ans=0.0 +2024-01-15 16:35:19,118 INFO [train.py:994] (1/2) Epoch 18, batch 600, loss[loss=0.169, simple_loss=0.2337, pruned_loss=0.05213, over 23568.00 frames. ], tot_loss[loss=0.176, simple_loss=0.2512, pruned_loss=0.05038, over 4582954.56 frames. ], batch size: 119, lr: 2.15e-02, grad_scale: 16.0 +2024-01-15 16:35:33,470 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.max_positive, batch_count=49803.333333333336, ans=0.95 +2024-01-15 16:35:45,891 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.prob, batch_count=49836.666666666664, ans=0.125 +2024-01-15 16:35:59,841 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass.skip_rate, batch_count=49870.0, ans=0.035 +2024-01-15 16:36:12,278 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass.skip_rate, batch_count=49903.333333333336, ans=0.04949747468305833 +2024-01-15 16:36:22,527 INFO [train.py:994] (1/2) Epoch 18, batch 650, loss[loss=0.1713, simple_loss=0.2489, pruned_loss=0.04688, over 24493.00 frames. ], tot_loss[loss=0.175, simple_loss=0.2503, pruned_loss=0.04981, over 4631732.68 frames. ], batch size: 243, lr: 2.14e-02, grad_scale: 8.0 +2024-01-15 16:36:25,260 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass_mid.scale_min, batch_count=49936.666666666664, ans=0.2 +2024-01-15 16:36:32,457 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=49936.666666666664, ans=0.0 +2024-01-15 16:36:52,207 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=50003.333333333336, ans=0.1 +2024-01-15 16:36:56,862 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer2.prob, batch_count=50003.333333333336, ans=0.125 +2024-01-15 16:37:09,732 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=384, metric=3.40 vs. 
limit=15.0 +2024-01-15 16:37:10,445 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff2_skip_rate, batch_count=50036.666666666664, ans=0.0 +2024-01-15 16:37:10,507 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer1.prob, batch_count=50036.666666666664, ans=0.125 +2024-01-15 16:37:10,880 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.95 vs. limit=6.0 +2024-01-15 16:37:15,975 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.737e+02 2.026e+02 2.388e+02 2.731e+02 3.877e+02, threshold=4.775e+02, percent-clipped=0.0 +2024-01-15 16:37:24,713 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.hidden_balancer.prob, batch_count=50103.333333333336, ans=0.125 +2024-01-15 16:37:25,636 INFO [train.py:994] (1/2) Epoch 18, batch 700, loss[loss=0.1627, simple_loss=0.2439, pruned_loss=0.04071, over 24310.00 frames. ], tot_loss[loss=0.1751, simple_loss=0.2506, pruned_loss=0.04982, over 4678088.90 frames. ], batch size: 147, lr: 2.14e-02, grad_scale: 8.0 +2024-01-15 16:37:30,122 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.skip_rate, batch_count=50103.333333333336, ans=0.04949747468305833 +2024-01-15 16:37:35,599 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=50103.333333333336, ans=0.125 +2024-01-15 16:37:39,337 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.scale_min, batch_count=50136.666666666664, ans=0.2 +2024-01-15 16:37:58,539 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=50170.0, ans=0.1 +2024-01-15 16:38:02,206 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=50170.0, ans=0.125 +2024-01-15 16:38:06,374 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=13.77 vs. limit=15.0 +2024-01-15 16:38:11,397 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.86 vs. limit=6.0 +2024-01-15 16:38:29,424 INFO [train.py:994] (1/2) Epoch 18, batch 750, loss[loss=0.1563, simple_loss=0.2338, pruned_loss=0.0394, over 24234.00 frames. ], tot_loss[loss=0.1748, simple_loss=0.2506, pruned_loss=0.04954, over 4707389.00 frames. ], batch size: 140, lr: 2.14e-02, grad_scale: 8.0 +2024-01-15 16:38:32,148 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.ff2_skip_rate, batch_count=50270.0, ans=0.0 +2024-01-15 16:38:57,015 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=50336.666666666664, ans=0.0 +2024-01-15 16:39:07,644 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=384, metric=15.86 vs. 
limit=22.5 +2024-01-15 16:39:20,544 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.709e+02 2.077e+02 2.417e+02 3.053e+02 4.815e+02, threshold=4.834e+02, percent-clipped=1.0 +2024-01-15 16:39:25,287 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=50403.333333333336, ans=0.1 +2024-01-15 16:39:25,337 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_skip_rate, batch_count=50403.333333333336, ans=0.0 +2024-01-15 16:39:29,606 INFO [train.py:994] (1/2) Epoch 18, batch 800, loss[loss=0.1598, simple_loss=0.2354, pruned_loss=0.0421, over 24169.00 frames. ], tot_loss[loss=0.1752, simple_loss=0.2507, pruned_loss=0.04984, over 4724038.60 frames. ], batch size: 140, lr: 2.14e-02, grad_scale: 16.0 +2024-01-15 16:39:33,839 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=11.75 vs. limit=15.0 +2024-01-15 16:39:35,271 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=384, metric=17.97 vs. limit=22.5 +2024-01-15 16:39:39,407 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_skip_rate, batch_count=50436.666666666664, ans=0.0 +2024-01-15 16:39:53,785 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.ff3_skip_rate, batch_count=50503.333333333336, ans=0.0 +2024-01-15 16:39:59,977 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff3_skip_rate, batch_count=50503.333333333336, ans=0.0 +2024-01-15 16:40:09,693 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=7.69 vs. limit=15.0 +2024-01-15 16:40:13,079 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_skip_rate, batch_count=50536.666666666664, ans=0.0 +2024-01-15 16:40:42,953 INFO [train.py:994] (1/2) Epoch 19, batch 0, loss[loss=0.1803, simple_loss=0.2541, pruned_loss=0.05325, over 24387.00 frames. ], tot_loss[loss=0.1803, simple_loss=0.2541, pruned_loss=0.05325, over 24387.00 frames. ], batch size: 298, lr: 2.09e-02, grad_scale: 16.0 +2024-01-15 16:40:42,954 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 16:41:03,786 INFO [train.py:1026] (1/2) Epoch 19, validation: loss=0.17, simple_loss=0.2559, pruned_loss=0.0421, over 1622729.00 frames. +2024-01-15 16:41:03,787 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 16:41:14,615 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer2.prob, batch_count=50613.333333333336, ans=0.125 +2024-01-15 16:41:46,110 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=50680.0, ans=0.0 +2024-01-15 16:41:54,557 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.whiten, num_groups=1, num_channels=384, metric=3.08 vs. 
limit=12.0 +2024-01-15 16:42:04,257 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer2.prob, batch_count=50746.666666666664, ans=0.125 +2024-01-15 16:42:05,124 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.800e+02 2.143e+02 2.478e+02 3.058e+02 5.421e+02, threshold=4.957e+02, percent-clipped=1.0 +2024-01-15 16:42:05,152 INFO [train.py:994] (1/2) Epoch 19, batch 50, loss[loss=0.1795, simple_loss=0.258, pruned_loss=0.05047, over 24492.00 frames. ], tot_loss[loss=0.1752, simple_loss=0.2503, pruned_loss=0.05001, over 1078757.24 frames. ], batch size: 181, lr: 2.08e-02, grad_scale: 16.0 +2024-01-15 16:42:13,250 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer1.prob, batch_count=50746.666666666664, ans=0.125 +2024-01-15 16:42:26,919 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass.skip_rate, batch_count=50780.0, ans=0.04949747468305833 +2024-01-15 16:42:26,956 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.attention_skip_rate, batch_count=50780.0, ans=0.0 +2024-01-15 16:42:40,480 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=50813.333333333336, ans=0.125 +2024-01-15 16:42:41,567 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=50846.666666666664, ans=0.1 +2024-01-15 16:42:44,080 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer2.min_positive, batch_count=50846.666666666664, ans=0.05 +2024-01-15 16:42:46,995 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=50846.666666666664, ans=0.125 +2024-01-15 16:42:53,033 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass_mid.scale_min, batch_count=50846.666666666664, ans=0.2 +2024-01-15 16:43:07,602 INFO [train.py:994] (1/2) Epoch 19, batch 100, loss[loss=0.1477, simple_loss=0.207, pruned_loss=0.04415, over 17596.00 frames. ], tot_loss[loss=0.1737, simple_loss=0.2491, pruned_loss=0.04919, over 1904508.41 frames. ], batch size: 76, lr: 2.08e-02, grad_scale: 16.0 +2024-01-15 16:43:09,128 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=50913.333333333336, ans=0.1 +2024-01-15 16:43:22,329 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=50946.666666666664, ans=0.0 +2024-01-15 16:43:33,478 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_skip_rate, batch_count=50980.0, ans=0.0 +2024-01-15 16:43:49,536 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer2.prob, batch_count=51013.333333333336, ans=0.125 +2024-01-15 16:44:09,231 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.706e+02 2.155e+02 2.420e+02 2.780e+02 3.768e+02, threshold=4.840e+02, percent-clipped=0.0 +2024-01-15 16:44:09,259 INFO [train.py:994] (1/2) Epoch 19, batch 150, loss[loss=0.1767, simple_loss=0.2544, pruned_loss=0.04951, over 24615.00 frames. ], tot_loss[loss=0.1747, simple_loss=0.2504, pruned_loss=0.04947, over 2550565.84 frames. 
], batch size: 199, lr: 2.08e-02, grad_scale: 16.0 +2024-01-15 16:44:20,450 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.whiten.whitening_limit, batch_count=51113.333333333336, ans=15.0 +2024-01-15 16:44:36,341 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer1.prob, batch_count=51146.666666666664, ans=0.125 +2024-01-15 16:44:48,379 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.scale_min, batch_count=51180.0, ans=0.2 +2024-01-15 16:44:50,247 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer1.min_positive, batch_count=51180.0, ans=0.025 +2024-01-15 16:45:02,593 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=51213.333333333336, ans=0.125 +2024-01-15 16:45:02,633 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer2.prob, batch_count=51213.333333333336, ans=0.125 +2024-01-15 16:45:03,760 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.prob, batch_count=51213.333333333336, ans=0.125 +2024-01-15 16:45:06,183 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=51213.333333333336, ans=0.0 +2024-01-15 16:45:08,904 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=9.05 vs. limit=15.0 +2024-01-15 16:45:10,497 INFO [train.py:994] (1/2) Epoch 19, batch 200, loss[loss=0.1534, simple_loss=0.2277, pruned_loss=0.03949, over 23978.00 frames. ], tot_loss[loss=0.1741, simple_loss=0.2495, pruned_loss=0.04932, over 3050696.76 frames. ], batch size: 131, lr: 2.08e-02, grad_scale: 16.0 +2024-01-15 16:45:10,779 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=51246.666666666664, ans=0.1 +2024-01-15 16:45:18,617 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=7.20 vs. 
limit=10.0 +2024-01-15 16:45:21,674 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.ff2_skip_rate, batch_count=51246.666666666664, ans=0.0 +2024-01-15 16:45:35,775 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 16:45:41,092 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer2.prob, batch_count=51313.333333333336, ans=0.125 +2024-01-15 16:45:47,844 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=51346.666666666664, ans=0.1 +2024-01-15 16:45:51,221 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass_mid.scale_min, batch_count=51346.666666666664, ans=0.2 +2024-01-15 16:45:57,203 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer2.prob, batch_count=51346.666666666664, ans=0.125 +2024-01-15 16:46:10,500 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_skip_rate, batch_count=51380.0, ans=0.0 +2024-01-15 16:46:13,660 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.675e+02 2.155e+02 2.351e+02 2.778e+02 4.374e+02, threshold=4.701e+02, percent-clipped=0.0 +2024-01-15 16:46:13,688 INFO [train.py:994] (1/2) Epoch 19, batch 250, loss[loss=0.1674, simple_loss=0.2481, pruned_loss=0.0433, over 24348.00 frames. ], tot_loss[loss=0.1743, simple_loss=0.2498, pruned_loss=0.04942, over 3430587.30 frames. ], batch size: 298, lr: 2.07e-02, grad_scale: 16.0 +2024-01-15 16:46:17,484 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=51413.333333333336, ans=0.0 +2024-01-15 16:46:21,001 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.skip_rate, batch_count=51413.333333333336, ans=0.04949747468305833 +2024-01-15 16:46:26,891 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=51446.666666666664, ans=0.0 +2024-01-15 16:46:51,839 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.attention_skip_rate, batch_count=51513.333333333336, ans=0.0 +2024-01-15 16:47:01,271 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=51513.333333333336, ans=0.0 +2024-01-15 16:47:03,757 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=51546.666666666664, ans=0.1 +2024-01-15 16:47:15,126 INFO [train.py:994] (1/2) Epoch 19, batch 300, loss[loss=0.1616, simple_loss=0.24, pruned_loss=0.04164, over 24319.00 frames. ], tot_loss[loss=0.1738, simple_loss=0.2495, pruned_loss=0.04905, over 3746083.24 frames. ], batch size: 147, lr: 2.07e-02, grad_scale: 16.0 +2024-01-15 16:47:20,130 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer1.prob, batch_count=51580.0, ans=0.125 +2024-01-15 16:47:35,176 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.conv_module2.whiten, num_groups=1, num_channels=192, metric=3.30 vs. 
limit=15.0 +2024-01-15 16:47:52,142 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.out_proj.dropout_p, batch_count=51680.0, ans=0.1 +2024-01-15 16:48:02,137 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.86 vs. limit=6.0 +2024-01-15 16:48:05,159 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=51713.333333333336, ans=0.1 +2024-01-15 16:48:05,167 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.prob, batch_count=51713.333333333336, ans=0.125 +2024-01-15 16:48:06,353 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.hidden_balancer.prob, batch_count=51713.333333333336, ans=0.125 +2024-01-15 16:48:12,587 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=51713.333333333336, ans=0.0 +2024-01-15 16:48:17,479 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.797e+02 2.144e+02 2.394e+02 2.843e+02 3.932e+02, threshold=4.787e+02, percent-clipped=0.0 +2024-01-15 16:48:17,509 INFO [train.py:994] (1/2) Epoch 19, batch 350, loss[loss=0.1729, simple_loss=0.2526, pruned_loss=0.04661, over 24306.00 frames. ], tot_loss[loss=0.1734, simple_loss=0.2492, pruned_loss=0.04884, over 3971562.68 frames. ], batch size: 285, lr: 2.07e-02, grad_scale: 16.0 +2024-01-15 16:48:22,545 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_skip_rate, batch_count=51746.666666666664, ans=0.0 +2024-01-15 16:48:49,904 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=51813.333333333336, ans=0.0 +2024-01-15 16:48:55,813 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff2_skip_rate, batch_count=51846.666666666664, ans=0.0 +2024-01-15 16:49:07,543 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff3_skip_rate, batch_count=51880.0, ans=0.0 +2024-01-15 16:49:19,257 INFO [train.py:994] (1/2) Epoch 19, batch 400, loss[loss=0.1849, simple_loss=0.2589, pruned_loss=0.05549, over 24612.00 frames. ], tot_loss[loss=0.1735, simple_loss=0.2492, pruned_loss=0.04886, over 4148785.76 frames. 
], batch size: 199, lr: 2.07e-02, grad_scale: 32.0 +2024-01-15 16:49:20,822 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer2.prob, batch_count=51913.333333333336, ans=0.125 +2024-01-15 16:49:22,005 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=51913.333333333336, ans=0.1 +2024-01-15 16:49:29,672 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff3_skip_rate, batch_count=51913.333333333336, ans=0.0 +2024-01-15 16:49:38,663 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass_mid.scale_min, batch_count=51946.666666666664, ans=0.2 +2024-01-15 16:50:06,384 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_skip_rate, batch_count=52013.333333333336, ans=0.0 +2024-01-15 16:50:19,879 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward2.hidden_balancer.prob, batch_count=52046.666666666664, ans=0.125 +2024-01-15 16:50:21,894 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.790e+02 2.221e+02 2.607e+02 3.142e+02 4.730e+02, threshold=5.215e+02, percent-clipped=0.0 +2024-01-15 16:50:21,922 INFO [train.py:994] (1/2) Epoch 19, batch 450, loss[loss=0.1823, simple_loss=0.2598, pruned_loss=0.05244, over 24516.00 frames. ], tot_loss[loss=0.1733, simple_loss=0.2491, pruned_loss=0.04874, over 4294978.81 frames. ], batch size: 204, lr: 2.06e-02, grad_scale: 32.0 +2024-01-15 16:50:27,897 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.whiten, num_groups=1, num_channels=384, metric=2.96 vs. limit=12.0 +2024-01-15 16:50:43,570 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=52113.333333333336, ans=0.0 +2024-01-15 16:50:45,914 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.attention_skip_rate, batch_count=52146.666666666664, ans=0.0 +2024-01-15 16:50:57,140 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=52146.666666666664, ans=0.125 +2024-01-15 16:50:57,151 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.balancer2.prob, batch_count=52146.666666666664, ans=0.125 +2024-01-15 16:51:00,717 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.scale_min, batch_count=52180.0, ans=0.2 +2024-01-15 16:51:06,477 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff2_skip_rate, batch_count=52180.0, ans=0.0 +2024-01-15 16:51:15,627 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=512, metric=2.32 vs. limit=15.0 +2024-01-15 16:51:23,723 INFO [train.py:994] (1/2) Epoch 19, batch 500, loss[loss=0.1681, simple_loss=0.244, pruned_loss=0.04609, over 24506.00 frames. ], tot_loss[loss=0.1728, simple_loss=0.2485, pruned_loss=0.04853, over 4399779.26 frames. ], batch size: 181, lr: 2.06e-02, grad_scale: 32.0 +2024-01-15 16:51:31,605 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=22.88 vs. 
limit=22.5 +2024-01-15 16:51:34,000 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=16.33 vs. limit=22.5 +2024-01-15 16:52:03,762 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=52346.666666666664, ans=0.125 +2024-01-15 16:52:12,935 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.ff2_skip_rate, batch_count=52380.0, ans=0.0 +2024-01-15 16:52:23,507 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=52380.0, ans=0.1 +2024-01-15 16:52:25,452 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.709e+02 2.156e+02 2.463e+02 2.947e+02 4.843e+02, threshold=4.926e+02, percent-clipped=0.0 +2024-01-15 16:52:25,480 INFO [train.py:994] (1/2) Epoch 19, batch 550, loss[loss=0.1946, simple_loss=0.2738, pruned_loss=0.0577, over 23841.00 frames. ], tot_loss[loss=0.1724, simple_loss=0.2484, pruned_loss=0.04826, over 4490243.29 frames. ], batch size: 328, lr: 2.06e-02, grad_scale: 32.0 +2024-01-15 16:52:28,797 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=52413.333333333336, ans=0.1 +2024-01-15 16:52:35,570 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=52413.333333333336, ans=0.0 +2024-01-15 16:52:43,731 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=52446.666666666664, ans=0.125 +2024-01-15 16:53:08,565 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=52513.333333333336, ans=0.0 +2024-01-15 16:53:14,337 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer1.min_positive, batch_count=52546.666666666664, ans=0.025 +2024-01-15 16:53:19,670 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer1.prob, batch_count=52546.666666666664, ans=0.125 +2024-01-15 16:53:28,419 INFO [train.py:994] (1/2) Epoch 19, batch 600, loss[loss=0.18, simple_loss=0.2532, pruned_loss=0.05343, over 24308.00 frames. ], tot_loss[loss=0.1728, simple_loss=0.2489, pruned_loss=0.04837, over 4565679.95 frames. ], batch size: 285, lr: 2.05e-02, grad_scale: 32.0 +2024-01-15 16:53:54,680 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.whiten, num_groups=1, num_channels=512, metric=4.99 vs. 
limit=12.0 +2024-01-15 16:54:02,305 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer1.prob, batch_count=52646.666666666664, ans=0.125 +2024-01-15 16:54:09,984 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=52680.0, ans=0.125 +2024-01-15 16:54:11,086 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=52680.0, ans=0.125 +2024-01-15 16:54:27,215 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_skip_rate, batch_count=52713.333333333336, ans=0.0 +2024-01-15 16:54:30,297 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.813e+02 2.052e+02 2.320e+02 2.671e+02 5.625e+02, threshold=4.639e+02, percent-clipped=1.0 +2024-01-15 16:54:30,325 INFO [train.py:994] (1/2) Epoch 19, batch 650, loss[loss=0.1596, simple_loss=0.2393, pruned_loss=0.03997, over 24392.00 frames. ], tot_loss[loss=0.173, simple_loss=0.2489, pruned_loss=0.04852, over 4620119.03 frames. ], batch size: 258, lr: 2.05e-02, grad_scale: 32.0 +2024-01-15 16:55:20,636 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.out_combiner.scale_min, batch_count=52880.0, ans=0.2 +2024-01-15 16:55:32,700 INFO [train.py:994] (1/2) Epoch 19, batch 700, loss[loss=0.1721, simple_loss=0.2516, pruned_loss=0.04626, over 24485.00 frames. ], tot_loss[loss=0.1727, simple_loss=0.2487, pruned_loss=0.04835, over 4664787.95 frames. ], batch size: 216, lr: 2.05e-02, grad_scale: 32.0 +2024-01-15 16:55:40,955 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_skip_rate, batch_count=52913.333333333336, ans=0.0 +2024-01-15 16:55:44,921 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=9.48 vs. limit=15.0 +2024-01-15 16:55:59,230 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=52980.0, ans=0.1 +2024-01-15 16:56:19,275 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.ff3_skip_rate, batch_count=53013.333333333336, ans=0.0 +2024-01-15 16:56:34,290 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=53080.0, ans=0.125 +2024-01-15 16:56:35,019 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.688e+02 2.044e+02 2.314e+02 2.797e+02 4.143e+02, threshold=4.628e+02, percent-clipped=0.0 +2024-01-15 16:56:35,047 INFO [train.py:994] (1/2) Epoch 19, batch 750, loss[loss=0.1784, simple_loss=0.2534, pruned_loss=0.0517, over 24513.00 frames. ], tot_loss[loss=0.1723, simple_loss=0.2481, pruned_loss=0.04821, over 4689336.31 frames. ], batch size: 236, lr: 2.05e-02, grad_scale: 32.0 +2024-01-15 16:56:43,005 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.attention_skip_rate, batch_count=53080.0, ans=0.0 +2024-01-15 16:56:43,333 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=4.07 vs. 
limit=15.0 +2024-01-15 16:57:12,664 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=53180.0, ans=0.125 +2024-01-15 16:57:19,500 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.prob, batch_count=53180.0, ans=0.125 +2024-01-15 16:57:35,453 INFO [train.py:994] (1/2) Epoch 19, batch 800, loss[loss=0.1633, simple_loss=0.2453, pruned_loss=0.04071, over 24418.00 frames. ], tot_loss[loss=0.1725, simple_loss=0.2484, pruned_loss=0.04828, over 4716733.82 frames. ], batch size: 258, lr: 2.04e-02, grad_scale: 32.0 +2024-01-15 16:57:38,177 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=2.12 vs. limit=15.0 +2024-01-15 16:57:55,299 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=53280.0, ans=0.125 +2024-01-15 16:58:13,055 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward2.hidden_balancer.prob, batch_count=53346.666666666664, ans=0.125 +2024-01-15 16:58:14,124 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=53346.666666666664, ans=0.1 +2024-01-15 16:58:50,926 INFO [train.py:994] (1/2) Epoch 20, batch 0, loss[loss=0.1715, simple_loss=0.2511, pruned_loss=0.04598, over 23883.00 frames. ], tot_loss[loss=0.1715, simple_loss=0.2511, pruned_loss=0.04598, over 23883.00 frames. ], batch size: 328, lr: 2.00e-02, grad_scale: 32.0 +2024-01-15 16:58:50,927 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 16:59:11,889 INFO [train.py:1026] (1/2) Epoch 20, validation: loss=0.1698, simple_loss=0.2555, pruned_loss=0.042, over 1622729.00 frames. +2024-01-15 16:59:11,890 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 16:59:21,977 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.658e+02 2.223e+02 2.554e+02 3.029e+02 4.256e+02, threshold=5.107e+02, percent-clipped=0.0 +2024-01-15 16:59:24,836 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.balancer2.prob, batch_count=53423.333333333336, ans=0.125 +2024-01-15 16:59:27,106 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.prob, batch_count=53423.333333333336, ans=0.125 +2024-01-15 16:59:57,876 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=53490.0, ans=0.1 +2024-01-15 17:00:05,650 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=53523.333333333336, ans=0.125 +2024-01-15 17:00:08,525 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer2.min_positive, batch_count=53523.333333333336, ans=0.05 +2024-01-15 17:00:14,350 INFO [train.py:994] (1/2) Epoch 20, batch 50, loss[loss=0.1652, simple_loss=0.2413, pruned_loss=0.04448, over 24321.00 frames. ], tot_loss[loss=0.1692, simple_loss=0.2458, pruned_loss=0.04633, over 1084716.08 frames. ], batch size: 285, lr: 2.00e-02, grad_scale: 32.0 +2024-01-15 17:00:37,607 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=6.75 vs. 
limit=10.0 +2024-01-15 17:00:49,060 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.out_combiner.scale_min, batch_count=53623.333333333336, ans=0.2 +2024-01-15 17:01:04,647 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer2.prob, batch_count=53690.0, ans=0.125 +2024-01-15 17:01:16,004 INFO [train.py:994] (1/2) Epoch 20, batch 100, loss[loss=0.1553, simple_loss=0.2351, pruned_loss=0.03771, over 24204.00 frames. ], tot_loss[loss=0.1704, simple_loss=0.2466, pruned_loss=0.0471, over 1912176.57 frames. ], batch size: 140, lr: 1.99e-02, grad_scale: 32.0 +2024-01-15 17:01:19,159 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.attention_skip_rate, batch_count=53723.333333333336, ans=0.0 +2024-01-15 17:01:26,642 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.665e+02 1.987e+02 2.287e+02 2.655e+02 4.191e+02, threshold=4.574e+02, percent-clipped=0.0 +2024-01-15 17:01:32,862 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff2_skip_rate, batch_count=53756.666666666664, ans=0.0 +2024-01-15 17:01:38,899 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=53756.666666666664, ans=0.1 +2024-01-15 17:01:43,142 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.skip_rate, batch_count=53790.0, ans=0.07 +2024-01-15 17:01:46,672 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=53790.0, ans=0.125 +2024-01-15 17:01:54,254 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.bypass_mid.scale_min, batch_count=53823.333333333336, ans=0.2 +2024-01-15 17:02:03,515 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.balancer2.prob, batch_count=53823.333333333336, ans=0.125 +2024-01-15 17:02:05,849 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass_mid.scale_min, batch_count=53856.666666666664, ans=0.2 +2024-01-15 17:02:10,260 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.hidden_balancer.prob, batch_count=53856.666666666664, ans=0.125 +2024-01-15 17:02:18,899 INFO [train.py:994] (1/2) Epoch 20, batch 150, loss[loss=0.1731, simple_loss=0.2553, pruned_loss=0.04541, over 24381.00 frames. ], tot_loss[loss=0.1696, simple_loss=0.2458, pruned_loss=0.04671, over 2551514.68 frames. ], batch size: 258, lr: 1.99e-02, grad_scale: 32.0 +2024-01-15 17:02:25,122 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=53890.0, ans=0.125 +2024-01-15 17:02:47,706 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.scale_min, batch_count=53956.666666666664, ans=0.2 +2024-01-15 17:03:12,290 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=54023.333333333336, ans=0.125 +2024-01-15 17:03:21,484 INFO [train.py:994] (1/2) Epoch 20, batch 200, loss[loss=0.1832, simple_loss=0.2607, pruned_loss=0.05286, over 24494.00 frames. ], tot_loss[loss=0.1706, simple_loss=0.2469, pruned_loss=0.04712, over 3050314.10 frames. 
], batch size: 187, lr: 1.99e-02, grad_scale: 32.0 +2024-01-15 17:03:31,625 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.717e+02 2.121e+02 2.376e+02 2.791e+02 4.349e+02, threshold=4.752e+02, percent-clipped=0.0 +2024-01-15 17:04:05,437 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.max_abs, batch_count=54156.666666666664, ans=10.0 +2024-01-15 17:04:12,674 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=54190.0, ans=0.1 +2024-01-15 17:04:16,847 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.attention_skip_rate, batch_count=54190.0, ans=0.0 +2024-01-15 17:04:19,272 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.min_positive, batch_count=54190.0, ans=0.025 +2024-01-15 17:04:23,734 INFO [train.py:994] (1/2) Epoch 20, batch 250, loss[loss=0.1882, simple_loss=0.2626, pruned_loss=0.05693, over 24495.00 frames. ], tot_loss[loss=0.1707, simple_loss=0.247, pruned_loss=0.04723, over 3435820.16 frames. ], batch size: 187, lr: 1.99e-02, grad_scale: 16.0 +2024-01-15 17:04:47,373 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.ff2_skip_rate, batch_count=54256.666666666664, ans=0.0 +2024-01-15 17:05:01,916 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.hidden_balancer.prob, batch_count=54323.333333333336, ans=0.125 +2024-01-15 17:05:12,289 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=54323.333333333336, ans=0.1 +2024-01-15 17:05:27,038 INFO [train.py:994] (1/2) Epoch 20, batch 300, loss[loss=0.1711, simple_loss=0.2466, pruned_loss=0.04785, over 24353.00 frames. ], tot_loss[loss=0.1706, simple_loss=0.2471, pruned_loss=0.04703, over 3738885.55 frames. ], batch size: 275, lr: 1.98e-02, grad_scale: 16.0 +2024-01-15 17:05:38,417 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.684e+02 2.137e+02 2.346e+02 2.815e+02 4.642e+02, threshold=4.692e+02, percent-clipped=0.0 +2024-01-15 17:05:39,441 INFO [scaling.py:1022] (1/2) Whitening: name=encoder_embed.out_whiten, num_groups=1, num_channels=192, metric=7.47 vs. limit=8.0 +2024-01-15 17:05:55,663 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.ff3_skip_rate, batch_count=54456.666666666664, ans=0.0 +2024-01-15 17:06:23,350 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.whiten, num_groups=1, num_channels=384, metric=3.12 vs. limit=12.0 +2024-01-15 17:06:29,564 INFO [train.py:994] (1/2) Epoch 20, batch 350, loss[loss=0.1523, simple_loss=0.2284, pruned_loss=0.03804, over 24042.00 frames. ], tot_loss[loss=0.1697, simple_loss=0.246, pruned_loss=0.04666, over 3967594.20 frames. 
], batch size: 131, lr: 1.98e-02, grad_scale: 8.0 +2024-01-15 17:06:45,945 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=54590.0, ans=0.0 +2024-01-15 17:07:11,198 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=54656.666666666664, ans=0.125 +2024-01-15 17:07:16,801 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer2.prob, batch_count=54656.666666666664, ans=0.125 +2024-01-15 17:07:32,044 INFO [train.py:994] (1/2) Epoch 20, batch 400, loss[loss=0.173, simple_loss=0.2504, pruned_loss=0.04782, over 24619.00 frames. ], tot_loss[loss=0.1696, simple_loss=0.2462, pruned_loss=0.04645, over 4165311.90 frames. ], batch size: 199, lr: 1.98e-02, grad_scale: 16.0 +2024-01-15 17:07:32,312 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 17:07:43,018 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer2.prob, batch_count=54723.333333333336, ans=0.125 +2024-01-15 17:07:45,034 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.720e+02 1.981e+02 2.188e+02 2.566e+02 3.832e+02, threshold=4.376e+02, percent-clipped=0.0 +2024-01-15 17:08:03,540 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward2.hidden_balancer.prob, batch_count=54790.0, ans=0.125 +2024-01-15 17:08:03,542 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer1.max_abs, batch_count=54790.0, ans=10.0 +2024-01-15 17:08:11,663 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=512, metric=19.88 vs. limit=22.5 +2024-01-15 17:08:12,459 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer2.prob, batch_count=54823.333333333336, ans=0.125 +2024-01-15 17:08:25,438 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass.scale_min, batch_count=54856.666666666664, ans=0.2 +2024-01-15 17:08:34,670 INFO [train.py:994] (1/2) Epoch 20, batch 450, loss[loss=0.1757, simple_loss=0.2451, pruned_loss=0.0532, over 24337.00 frames. ], tot_loss[loss=0.1695, simple_loss=0.2462, pruned_loss=0.04637, over 4313374.32 frames. 
], batch size: 153, lr: 1.98e-02, grad_scale: 16.0 +2024-01-15 17:08:48,291 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.out_combiner.scale_min, batch_count=54923.333333333336, ans=0.2 +2024-01-15 17:08:51,782 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=54923.333333333336, ans=0.0 +2024-01-15 17:08:58,306 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass_mid.scale_min, batch_count=54956.666666666664, ans=0.2 +2024-01-15 17:09:03,693 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer2.prob, batch_count=54956.666666666664, ans=0.125 +2024-01-15 17:09:05,953 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff3_skip_rate, batch_count=54956.666666666664, ans=0.0 +2024-01-15 17:09:25,931 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass_mid.scale_min, batch_count=55023.333333333336, ans=0.2 +2024-01-15 17:09:31,943 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 17:09:36,336 INFO [train.py:994] (1/2) Epoch 20, batch 500, loss[loss=0.1666, simple_loss=0.2449, pruned_loss=0.04417, over 24501.00 frames. ], tot_loss[loss=0.1693, simple_loss=0.2457, pruned_loss=0.04644, over 4422019.97 frames. ], batch size: 210, lr: 1.97e-02, grad_scale: 16.0 +2024-01-15 17:09:49,084 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.745e+02 2.194e+02 2.583e+02 3.181e+02 4.072e+02, threshold=5.167e+02, percent-clipped=0.0 +2024-01-15 17:10:19,596 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=6.18 vs. limit=15.0 +2024-01-15 17:10:37,934 INFO [train.py:994] (1/2) Epoch 20, batch 550, loss[loss=0.1678, simple_loss=0.2443, pruned_loss=0.04563, over 24371.00 frames. ], tot_loss[loss=0.1701, simple_loss=0.2465, pruned_loss=0.04684, over 4510914.74 frames. ], batch size: 285, lr: 1.97e-02, grad_scale: 16.0 +2024-01-15 17:10:42,459 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.5.prob, batch_count=55223.333333333336, ans=0.125 +2024-01-15 17:10:58,992 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=6.82 vs. limit=15.0 +2024-01-15 17:11:41,598 INFO [train.py:994] (1/2) Epoch 20, batch 600, loss[loss=0.1649, simple_loss=0.2438, pruned_loss=0.04303, over 24418.00 frames. ], tot_loss[loss=0.1707, simple_loss=0.2472, pruned_loss=0.04709, over 4575484.54 frames. ], batch size: 258, lr: 1.97e-02, grad_scale: 16.0 +2024-01-15 17:11:52,879 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.whiten, num_groups=1, num_channels=384, metric=4.55 vs. limit=12.0 +2024-01-15 17:11:53,437 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.775e+02 2.054e+02 2.216e+02 2.564e+02 3.399e+02, threshold=4.432e+02, percent-clipped=0.0 +2024-01-15 17:12:02,074 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=6.38 vs. 
limit=10.0 +2024-01-15 17:12:14,425 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.self_attn1.whiten.whitening_limit, batch_count=55456.666666666664, ans=22.5 +2024-01-15 17:12:42,380 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=55556.666666666664, ans=0.125 +2024-01-15 17:12:43,223 INFO [train.py:994] (1/2) Epoch 20, batch 650, loss[loss=0.171, simple_loss=0.2534, pruned_loss=0.04434, over 24503.00 frames. ], tot_loss[loss=0.1703, simple_loss=0.2468, pruned_loss=0.04687, over 4632136.99 frames. ], batch size: 204, lr: 1.97e-02, grad_scale: 16.0 +2024-01-15 17:12:55,470 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=55590.0, ans=0.1 +2024-01-15 17:12:59,197 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=55590.0, ans=0.1 +2024-01-15 17:13:00,873 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=5.59 vs. limit=15.0 +2024-01-15 17:13:38,360 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff2_skip_rate, batch_count=55690.0, ans=0.0 +2024-01-15 17:13:46,242 INFO [train.py:994] (1/2) Epoch 20, batch 700, loss[loss=0.1722, simple_loss=0.2533, pruned_loss=0.04555, over 24487.00 frames. ], tot_loss[loss=0.1696, simple_loss=0.2462, pruned_loss=0.04651, over 4664485.25 frames. ], batch size: 216, lr: 1.96e-02, grad_scale: 16.0 +2024-01-15 17:13:47,834 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer1.prob, batch_count=55723.333333333336, ans=0.125 +2024-01-15 17:13:58,089 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.842e+02 2.109e+02 2.299e+02 2.928e+02 4.254e+02, threshold=4.598e+02, percent-clipped=0.0 +2024-01-15 17:14:30,622 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer1.prob, batch_count=55823.333333333336, ans=0.125 +2024-01-15 17:14:37,232 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=55856.666666666664, ans=0.1 +2024-01-15 17:14:39,020 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=192, metric=17.68 vs. limit=15.0 +2024-01-15 17:14:43,104 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 17:14:47,515 INFO [train.py:994] (1/2) Epoch 20, batch 750, loss[loss=0.1682, simple_loss=0.2471, pruned_loss=0.04462, over 24525.00 frames. ], tot_loss[loss=0.1694, simple_loss=0.246, pruned_loss=0.04642, over 4688744.02 frames. 
], batch size: 165, lr: 1.96e-02, grad_scale: 16.0 +2024-01-15 17:14:50,210 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.self_attn_weights.pos_emb_skip_rate, batch_count=55890.0, ans=0.0 +2024-01-15 17:14:57,970 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward2.hidden_balancer.prob, batch_count=55890.0, ans=0.125 +2024-01-15 17:15:08,105 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.hidden_balancer.prob, batch_count=55923.333333333336, ans=0.125 +2024-01-15 17:15:21,309 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=55956.666666666664, ans=0.125 +2024-01-15 17:15:23,298 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=3.80 vs. limit=10.0 +2024-01-15 17:15:40,837 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer2.min_abs, batch_count=56023.333333333336, ans=0.5 +2024-01-15 17:15:48,434 INFO [train.py:994] (1/2) Epoch 20, batch 800, loss[loss=0.155, simple_loss=0.2155, pruned_loss=0.04722, over 18476.00 frames. ], tot_loss[loss=0.1695, simple_loss=0.2461, pruned_loss=0.04648, over 4711111.40 frames. ], batch size: 79, lr: 1.96e-02, grad_scale: 32.0 +2024-01-15 17:15:57,860 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=56056.666666666664, ans=0.1 +2024-01-15 17:15:59,789 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.756e+02 2.072e+02 2.375e+02 2.936e+02 6.285e+02, threshold=4.751e+02, percent-clipped=2.0 +2024-01-15 17:16:20,571 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass.scale_min, batch_count=56123.333333333336, ans=0.2 +2024-01-15 17:16:30,608 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward2.hidden_balancer.prob, batch_count=56156.666666666664, ans=0.125 +2024-01-15 17:17:01,641 INFO [train.py:994] (1/2) Epoch 21, batch 0, loss[loss=0.1743, simple_loss=0.2483, pruned_loss=0.0501, over 24401.00 frames. ], tot_loss[loss=0.1743, simple_loss=0.2483, pruned_loss=0.0501, over 24401.00 frames. ], batch size: 159, lr: 1.92e-02, grad_scale: 32.0 +2024-01-15 17:17:01,642 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 17:17:22,203 INFO [train.py:1026] (1/2) Epoch 21, validation: loss=0.1697, simple_loss=0.2547, pruned_loss=0.04237, over 1622729.00 frames. +2024-01-15 17:17:22,203 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 17:17:24,247 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=21.85 vs. 
limit=22.5 +2024-01-15 17:17:36,029 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=56233.333333333336, ans=0.0 +2024-01-15 17:17:58,648 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=56300.0, ans=0.1 +2024-01-15 17:18:04,390 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.attention_skip_rate, batch_count=56300.0, ans=0.0 +2024-01-15 17:18:15,916 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.skip_rate, batch_count=56333.333333333336, ans=0.04949747468305833 +2024-01-15 17:18:17,014 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.skip_rate, batch_count=56333.333333333336, ans=0.09899494936611666 +2024-01-15 17:18:24,439 INFO [train.py:994] (1/2) Epoch 21, batch 50, loss[loss=0.1756, simple_loss=0.2512, pruned_loss=0.04997, over 24496.00 frames. ], tot_loss[loss=0.1667, simple_loss=0.2429, pruned_loss=0.04528, over 1079775.67 frames. ], batch size: 267, lr: 1.92e-02, grad_scale: 32.0 +2024-01-15 17:18:46,974 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.740e+02 1.978e+02 2.192e+02 2.494e+02 4.583e+02, threshold=4.384e+02, percent-clipped=0.0 +2024-01-15 17:19:03,610 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass_mid.scale_min, batch_count=56466.666666666664, ans=0.2 +2024-01-15 17:19:27,060 INFO [train.py:994] (1/2) Epoch 21, batch 100, loss[loss=0.1823, simple_loss=0.251, pruned_loss=0.05675, over 24520.00 frames. ], tot_loss[loss=0.1662, simple_loss=0.243, pruned_loss=0.04474, over 1902541.78 frames. ], batch size: 187, lr: 1.91e-02, grad_scale: 32.0 +2024-01-15 17:20:08,738 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=56633.333333333336, ans=0.1 +2024-01-15 17:20:11,558 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=384, metric=17.29 vs. limit=22.5 +2024-01-15 17:20:25,775 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=6.44 vs. limit=10.0 +2024-01-15 17:20:28,283 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=56700.0, ans=0.125 +2024-01-15 17:20:29,199 INFO [train.py:994] (1/2) Epoch 21, batch 150, loss[loss=0.1771, simple_loss=0.2503, pruned_loss=0.05199, over 24468.00 frames. ], tot_loss[loss=0.1668, simple_loss=0.2435, pruned_loss=0.04506, over 2556580.72 frames. ], batch size: 170, lr: 1.91e-02, grad_scale: 32.0 +2024-01-15 17:20:30,667 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=56700.0, ans=0.1 +2024-01-15 17:20:42,087 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=384, metric=2.42 vs. 
limit=15.0 +2024-01-15 17:20:50,848 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.737e+02 1.990e+02 2.168e+02 2.546e+02 3.705e+02, threshold=4.335e+02, percent-clipped=0.0 +2024-01-15 17:20:55,343 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=56766.666666666664, ans=0.1 +2024-01-15 17:20:55,379 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer2.prob, batch_count=56766.666666666664, ans=0.125 +2024-01-15 17:20:56,455 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=56766.666666666664, ans=0.1 +2024-01-15 17:21:04,659 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=56800.0, ans=0.125 +2024-01-15 17:21:17,007 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=56833.333333333336, ans=0.0 +2024-01-15 17:21:21,104 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=56833.333333333336, ans=0.125 +2024-01-15 17:21:30,252 INFO [train.py:994] (1/2) Epoch 21, batch 200, loss[loss=0.1568, simple_loss=0.2425, pruned_loss=0.03555, over 24369.00 frames. ], tot_loss[loss=0.1664, simple_loss=0.243, pruned_loss=0.0449, over 3050404.87 frames. ], batch size: 298, lr: 1.91e-02, grad_scale: 32.0 +2024-01-15 17:21:32,344 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer1.prob, batch_count=56866.666666666664, ans=0.125 +2024-01-15 17:21:36,019 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.ff2_skip_rate, batch_count=56866.666666666664, ans=0.0 +2024-01-15 17:21:41,783 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward2.hidden_balancer.prob, batch_count=56900.0, ans=0.125 +2024-01-15 17:21:52,999 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.attention_skip_rate, batch_count=56900.0, ans=0.0 +2024-01-15 17:21:58,333 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer1.prob, batch_count=56933.333333333336, ans=0.125 +2024-01-15 17:22:00,954 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=9.40 vs. limit=15.0 +2024-01-15 17:22:05,278 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.skip_rate, batch_count=56933.333333333336, ans=0.09899494936611666 +2024-01-15 17:22:29,635 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=384, metric=25.35 vs. limit=22.5 +2024-01-15 17:22:30,394 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer2.min_positive, batch_count=57000.0, ans=0.05 +2024-01-15 17:22:32,340 INFO [train.py:994] (1/2) Epoch 21, batch 250, loss[loss=0.1777, simple_loss=0.2588, pruned_loss=0.04833, over 23915.00 frames. ], tot_loss[loss=0.1664, simple_loss=0.2431, pruned_loss=0.04488, over 3436882.56 frames. 
], batch size: 328, lr: 1.91e-02, grad_scale: 32.0 +2024-01-15 17:22:50,605 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=57066.666666666664, ans=0.1 +2024-01-15 17:22:55,034 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.694e+02 2.081e+02 2.417e+02 3.180e+02 5.430e+02, threshold=4.834e+02, percent-clipped=4.0 +2024-01-15 17:23:04,171 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer1.prob, batch_count=57100.0, ans=0.125 +2024-01-15 17:23:19,180 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=10.31 vs. limit=15.0 +2024-01-15 17:23:34,517 INFO [train.py:994] (1/2) Epoch 21, batch 300, loss[loss=0.1683, simple_loss=0.2443, pruned_loss=0.04612, over 24339.00 frames. ], tot_loss[loss=0.1653, simple_loss=0.2418, pruned_loss=0.04435, over 3731382.28 frames. ], batch size: 153, lr: 1.90e-02, grad_scale: 32.0 +2024-01-15 17:23:58,677 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=57266.666666666664, ans=0.0 +2024-01-15 17:23:58,790 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer1.prob, batch_count=57266.666666666664, ans=0.125 +2024-01-15 17:24:08,826 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer2.prob, batch_count=57266.666666666664, ans=0.125 +2024-01-15 17:24:08,922 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_skip_rate, batch_count=57266.666666666664, ans=0.0 +2024-01-15 17:24:09,103 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=384, metric=2.37 vs. limit=15.0 +2024-01-15 17:24:21,173 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer1.prob, batch_count=57300.0, ans=0.125 +2024-01-15 17:24:35,323 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.scale_min, batch_count=57333.333333333336, ans=0.2 +2024-01-15 17:24:37,375 INFO [train.py:994] (1/2) Epoch 21, batch 350, loss[loss=0.1728, simple_loss=0.2508, pruned_loss=0.04747, over 24296.00 frames. ], tot_loss[loss=0.1662, simple_loss=0.2428, pruned_loss=0.04479, over 3955706.98 frames. ], batch size: 285, lr: 1.90e-02, grad_scale: 32.0 +2024-01-15 17:24:59,879 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.672e+02 2.143e+02 2.321e+02 2.694e+02 4.146e+02, threshold=4.643e+02, percent-clipped=0.0 +2024-01-15 17:25:02,958 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=11.00 vs. limit=15.0 +2024-01-15 17:25:27,796 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff3_skip_rate, batch_count=57500.0, ans=0.0 +2024-01-15 17:25:31,887 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=10.34 vs. 
limit=15.0 +2024-01-15 17:25:34,820 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer1.prob, batch_count=57500.0, ans=0.125 +2024-01-15 17:25:39,160 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.out_proj.dropout_p, batch_count=57533.333333333336, ans=0.1 +2024-01-15 17:25:40,070 INFO [train.py:994] (1/2) Epoch 21, batch 400, loss[loss=0.142, simple_loss=0.2215, pruned_loss=0.03123, over 24445.00 frames. ], tot_loss[loss=0.1656, simple_loss=0.2421, pruned_loss=0.0445, over 4140447.74 frames. ], batch size: 148, lr: 1.90e-02, grad_scale: 32.0 +2024-01-15 17:25:49,810 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=57533.333333333336, ans=0.1 +2024-01-15 17:26:27,171 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.min_positive, batch_count=57633.333333333336, ans=0.05 +2024-01-15 17:26:41,763 INFO [train.py:994] (1/2) Epoch 21, batch 450, loss[loss=0.169, simple_loss=0.2475, pruned_loss=0.0453, over 24392.00 frames. ], tot_loss[loss=0.1657, simple_loss=0.2425, pruned_loss=0.04449, over 4291361.49 frames. ], batch size: 159, lr: 1.90e-02, grad_scale: 32.0 +2024-01-15 17:26:45,779 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=6.36 vs. limit=10.0 +2024-01-15 17:26:46,603 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.prob, batch_count=57700.0, ans=0.125 +2024-01-15 17:27:00,415 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass.scale_min, batch_count=57733.333333333336, ans=0.2 +2024-01-15 17:27:04,931 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.716e+02 2.075e+02 2.378e+02 2.796e+02 3.944e+02, threshold=4.755e+02, percent-clipped=0.0 +2024-01-15 17:27:05,242 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.bypass.scale_min, batch_count=57733.333333333336, ans=0.2 +2024-01-15 17:27:06,478 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=57766.666666666664, ans=0.1 +2024-01-15 17:27:08,246 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.dropout.p, batch_count=57766.666666666664, ans=0.1 +2024-01-15 17:27:38,443 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass_mid.scale_min, batch_count=57833.333333333336, ans=0.2 +2024-01-15 17:27:40,893 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=57833.333333333336, ans=0.0 +2024-01-15 17:27:43,506 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=3.42 vs. limit=15.0 +2024-01-15 17:27:45,382 INFO [train.py:994] (1/2) Epoch 21, batch 500, loss[loss=0.1708, simple_loss=0.2461, pruned_loss=0.04779, over 24391.00 frames. ], tot_loss[loss=0.1658, simple_loss=0.2428, pruned_loss=0.04443, over 4410352.14 frames. ], batch size: 275, lr: 1.90e-02, grad_scale: 32.0 +2024-01-15 17:28:19,366 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.conv_module2.whiten, num_groups=1, num_channels=192, metric=4.02 vs. 
limit=15.0 +2024-01-15 17:28:33,167 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.attention_skip_rate, batch_count=57966.666666666664, ans=0.0 +2024-01-15 17:28:43,699 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer2.prob, batch_count=58000.0, ans=0.125 +2024-01-15 17:28:49,556 INFO [train.py:994] (1/2) Epoch 21, batch 550, loss[loss=0.1707, simple_loss=0.2455, pruned_loss=0.04795, over 24404.00 frames. ], tot_loss[loss=0.1656, simple_loss=0.2426, pruned_loss=0.04423, over 4499783.57 frames. ], batch size: 159, lr: 1.89e-02, grad_scale: 32.0 +2024-01-15 17:28:59,248 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.balancer1.prob, batch_count=58033.333333333336, ans=0.125 +2024-01-15 17:29:12,696 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.721e+02 2.030e+02 2.271e+02 2.828e+02 4.259e+02, threshold=4.542e+02, percent-clipped=0.0 +2024-01-15 17:29:13,084 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module2.balancer2.prob, batch_count=58066.666666666664, ans=0.125 +2024-01-15 17:29:14,227 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer1.prob, batch_count=58100.0, ans=0.125 +2024-01-15 17:29:16,827 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=384, metric=14.09 vs. limit=22.5 +2024-01-15 17:29:22,226 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer2.min_abs, batch_count=58100.0, ans=0.5 +2024-01-15 17:29:52,730 INFO [train.py:994] (1/2) Epoch 21, batch 600, loss[loss=0.1643, simple_loss=0.2437, pruned_loss=0.04249, over 24500.00 frames. ], tot_loss[loss=0.1664, simple_loss=0.2436, pruned_loss=0.04462, over 4565724.29 frames. ], batch size: 181, lr: 1.89e-02, grad_scale: 32.0 +2024-01-15 17:29:55,445 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=58200.0, ans=0.0 +2024-01-15 17:30:25,402 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=58266.666666666664, ans=0.125 +2024-01-15 17:30:42,334 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass.skip_rate, batch_count=58333.333333333336, ans=0.035 +2024-01-15 17:30:51,804 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.0.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 17:30:55,842 INFO [train.py:994] (1/2) Epoch 21, batch 650, loss[loss=0.1626, simple_loss=0.2408, pruned_loss=0.0422, over 24399.00 frames. ], tot_loss[loss=0.1661, simple_loss=0.2433, pruned_loss=0.04441, over 4622329.51 frames. ], batch size: 159, lr: 1.89e-02, grad_scale: 32.0 +2024-01-15 17:31:03,056 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=3.77 vs. 
limit=10.0 +2024-01-15 17:31:17,906 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.703e+02 1.977e+02 2.187e+02 2.482e+02 3.523e+02, threshold=4.375e+02, percent-clipped=0.0 +2024-01-15 17:31:29,977 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=58433.333333333336, ans=0.125 +2024-01-15 17:31:56,629 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer2.prob, batch_count=58500.0, ans=0.125 +2024-01-15 17:31:57,881 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=58533.333333333336, ans=0.125 +2024-01-15 17:31:58,763 INFO [train.py:994] (1/2) Epoch 21, batch 700, loss[loss=0.1412, simple_loss=0.2205, pruned_loss=0.03096, over 24355.00 frames. ], tot_loss[loss=0.1662, simple_loss=0.2435, pruned_loss=0.0445, over 4670520.85 frames. ], batch size: 147, lr: 1.89e-02, grad_scale: 32.0 +2024-01-15 17:32:19,954 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=58566.666666666664, ans=0.0 +2024-01-15 17:32:32,651 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=58600.0, ans=0.1 +2024-01-15 17:33:00,902 INFO [train.py:994] (1/2) Epoch 21, batch 750, loss[loss=0.1724, simple_loss=0.2551, pruned_loss=0.04485, over 24488.00 frames. ], tot_loss[loss=0.1657, simple_loss=0.2429, pruned_loss=0.0443, over 4681816.38 frames. ], batch size: 216, lr: 1.88e-02, grad_scale: 32.0 +2024-01-15 17:33:23,635 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.736e+02 2.081e+02 2.558e+02 3.319e+02 5.261e+02, threshold=5.116e+02, percent-clipped=3.0 +2024-01-15 17:33:26,182 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 17:33:26,227 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=58766.666666666664, ans=0.125 +2024-01-15 17:33:32,722 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass_mid.scale_min, batch_count=58766.666666666664, ans=0.2 +2024-01-15 17:33:54,577 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.out_proj.dropout_p, batch_count=58833.333333333336, ans=0.1 +2024-01-15 17:34:01,123 INFO [train.py:994] (1/2) Epoch 21, batch 800, loss[loss=0.1493, simple_loss=0.2221, pruned_loss=0.03825, over 23432.00 frames. ], tot_loss[loss=0.1657, simple_loss=0.2427, pruned_loss=0.04437, over 4700408.13 frames. ], batch size: 119, lr: 1.88e-02, grad_scale: 32.0 +2024-01-15 17:35:14,664 INFO [train.py:994] (1/2) Epoch 22, batch 0, loss[loss=0.1662, simple_loss=0.2439, pruned_loss=0.04421, over 24511.00 frames. ], tot_loss[loss=0.1662, simple_loss=0.2439, pruned_loss=0.04421, over 24511.00 frames. 
], batch size: 210, lr: 1.84e-02, grad_scale: 32.0 +2024-01-15 17:35:14,665 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 17:35:28,431 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([1.7754, 2.8686, 2.8639, 2.5027], device='cuda:1') +2024-01-15 17:35:35,901 INFO [train.py:1026] (1/2) Epoch 22, validation: loss=0.1695, simple_loss=0.2543, pruned_loss=0.04238, over 1622729.00 frames. +2024-01-15 17:35:35,902 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 17:35:39,194 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=59010.0, ans=0.0 +2024-01-15 17:35:42,832 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer1.prob, batch_count=59010.0, ans=0.125 +2024-01-15 17:35:46,437 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=59010.0, ans=0.0 +2024-01-15 17:35:55,111 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=9.46 vs. limit=10.0 +2024-01-15 17:36:01,310 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer2.prob, batch_count=59076.666666666664, ans=0.125 +2024-01-15 17:36:07,383 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.634e+02 2.084e+02 2.248e+02 2.659e+02 3.763e+02, threshold=4.496e+02, percent-clipped=0.0 +2024-01-15 17:36:14,846 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer2.prob, batch_count=59110.0, ans=0.125 +2024-01-15 17:36:15,974 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=59110.0, ans=0.125 +2024-01-15 17:36:37,926 INFO [train.py:994] (1/2) Epoch 22, batch 50, loss[loss=0.1651, simple_loss=0.243, pruned_loss=0.04356, over 24437.00 frames. ], tot_loss[loss=0.1642, simple_loss=0.2397, pruned_loss=0.04429, over 1090632.67 frames. ], batch size: 258, lr: 1.84e-02, grad_scale: 32.0 +2024-01-15 17:36:43,102 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=59176.666666666664, ans=0.125 +2024-01-15 17:36:47,638 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=59176.666666666664, ans=0.125 +2024-01-15 17:37:18,793 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.5.prob, batch_count=59276.666666666664, ans=0.125 +2024-01-15 17:37:39,166 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=4.73 vs. limit=10.0 +2024-01-15 17:37:39,661 INFO [train.py:994] (1/2) Epoch 22, batch 100, loss[loss=0.1591, simple_loss=0.2395, pruned_loss=0.03935, over 24459.00 frames. ], tot_loss[loss=0.1639, simple_loss=0.2408, pruned_loss=0.04352, over 1909481.94 frames. 
], batch size: 250, lr: 1.84e-02, grad_scale: 32.0 +2024-01-15 17:37:52,835 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.hidden_balancer.prob, batch_count=59376.666666666664, ans=0.125 +2024-01-15 17:37:52,884 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_skip_rate, batch_count=59376.666666666664, ans=0.0 +2024-01-15 17:38:10,825 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.619e+02 1.958e+02 2.230e+02 2.471e+02 4.355e+02, threshold=4.460e+02, percent-clipped=0.0 +2024-01-15 17:38:11,077 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer1.prob, batch_count=59410.0, ans=0.125 +2024-01-15 17:38:31,923 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer2.min_abs, batch_count=59476.666666666664, ans=0.5 +2024-01-15 17:38:42,732 INFO [train.py:994] (1/2) Epoch 22, batch 150, loss[loss=0.1685, simple_loss=0.2465, pruned_loss=0.04529, over 24433.00 frames. ], tot_loss[loss=0.1635, simple_loss=0.2403, pruned_loss=0.04341, over 2550018.35 frames. ], batch size: 250, lr: 1.84e-02, grad_scale: 32.0 +2024-01-15 17:38:50,190 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.scale_min, batch_count=59510.0, ans=0.2 +2024-01-15 17:39:10,451 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=3.91 vs. limit=15.0 +2024-01-15 17:39:18,448 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=59610.0, ans=0.0 +2024-01-15 17:39:44,215 INFO [train.py:994] (1/2) Epoch 22, batch 200, loss[loss=0.1681, simple_loss=0.2449, pruned_loss=0.0456, over 24398.00 frames. ], tot_loss[loss=0.1642, simple_loss=0.2412, pruned_loss=0.04358, over 3059885.94 frames. ], batch size: 275, lr: 1.83e-02, grad_scale: 32.0 +2024-01-15 17:39:44,471 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=59676.666666666664, ans=0.125 +2024-01-15 17:39:49,309 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.skip_rate, batch_count=59676.666666666664, ans=0.035 +2024-01-15 17:40:07,075 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=59710.0, ans=0.0 +2024-01-15 17:40:13,136 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer1.prob, batch_count=59743.333333333336, ans=0.125 +2024-01-15 17:40:15,061 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.716e+02 2.149e+02 2.370e+02 2.795e+02 5.088e+02, threshold=4.739e+02, percent-clipped=1.0 +2024-01-15 17:40:24,863 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff3_skip_rate, batch_count=59776.666666666664, ans=0.0 +2024-01-15 17:40:45,353 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer2.prob, batch_count=59843.333333333336, ans=0.125 +2024-01-15 17:40:46,857 INFO [train.py:994] (1/2) Epoch 22, batch 250, loss[loss=0.1727, simple_loss=0.2414, pruned_loss=0.05201, over 24415.00 frames. ], tot_loss[loss=0.1647, simple_loss=0.2417, pruned_loss=0.04387, over 3436416.22 frames. 
], batch size: 159, lr: 1.83e-02, grad_scale: 32.0 +2024-01-15 17:40:50,739 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward2.hidden_balancer.prob, batch_count=59843.333333333336, ans=0.125 +2024-01-15 17:41:06,150 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.nonlin_attention.balancer.prob, batch_count=59876.666666666664, ans=0.125 +2024-01-15 17:41:07,337 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=59876.666666666664, ans=0.125 +2024-01-15 17:41:13,351 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=59910.0, ans=0.0 +2024-01-15 17:41:25,215 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=59943.333333333336, ans=0.125 +2024-01-15 17:41:47,106 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=3.55 vs. limit=10.0 +2024-01-15 17:41:48,906 INFO [train.py:994] (1/2) Epoch 22, batch 300, loss[loss=0.178, simple_loss=0.2548, pruned_loss=0.05061, over 24438.00 frames. ], tot_loss[loss=0.1644, simple_loss=0.2415, pruned_loss=0.04362, over 3741735.75 frames. ], batch size: 170, lr: 1.83e-02, grad_scale: 32.0 +2024-01-15 17:42:13,102 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer1.prob, batch_count=60076.666666666664, ans=0.125 +2024-01-15 17:42:19,710 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.740e+02 2.217e+02 2.648e+02 3.149e+02 4.235e+02, threshold=5.295e+02, percent-clipped=0.0 +2024-01-15 17:42:35,998 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=7.93 vs. limit=15.0 +2024-01-15 17:42:44,965 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=60143.333333333336, ans=0.125 +2024-01-15 17:42:50,684 INFO [train.py:994] (1/2) Epoch 22, batch 350, loss[loss=0.1693, simple_loss=0.2514, pruned_loss=0.04356, over 24304.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.2416, pruned_loss=0.04347, over 3985389.03 frames. ], batch size: 285, lr: 1.83e-02, grad_scale: 32.0 +2024-01-15 17:42:51,629 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.prob, batch_count=60176.666666666664, ans=0.125 +2024-01-15 17:42:51,975 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=512, metric=17.25 vs. limit=22.5 +2024-01-15 17:42:52,742 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=60176.666666666664, ans=0.1 +2024-01-15 17:42:52,937 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=12.90 vs. 
limit=15.0 +2024-01-15 17:42:53,825 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=60176.666666666664, ans=0.1 +2024-01-15 17:43:27,785 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=60276.666666666664, ans=0.125 +2024-01-15 17:43:45,979 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=60310.0, ans=0.1 +2024-01-15 17:43:53,304 INFO [train.py:994] (1/2) Epoch 22, batch 400, loss[loss=0.1733, simple_loss=0.2487, pruned_loss=0.04894, over 24626.00 frames. ], tot_loss[loss=0.1647, simple_loss=0.2421, pruned_loss=0.04367, over 4178372.15 frames. ], batch size: 199, lr: 1.83e-02, grad_scale: 32.0 +2024-01-15 17:43:54,591 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.8.prob, batch_count=60343.333333333336, ans=0.125 +2024-01-15 17:44:01,912 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 17:44:08,383 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.prob, batch_count=60376.666666666664, ans=0.125 +2024-01-15 17:44:16,040 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=60376.666666666664, ans=0.0 +2024-01-15 17:44:18,487 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 17:44:19,601 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=60410.0, ans=0.1 +2024-01-15 17:44:24,058 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.630e+02 2.042e+02 2.258e+02 2.641e+02 4.188e+02, threshold=4.517e+02, percent-clipped=0.0 +2024-01-15 17:44:30,910 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.prob, batch_count=60443.333333333336, ans=0.125 +2024-01-15 17:44:39,125 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer2.prob, batch_count=60443.333333333336, ans=0.125 +2024-01-15 17:44:52,832 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=60476.666666666664, ans=0.125 +2024-01-15 17:44:53,973 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=60510.0, ans=0.0 +2024-01-15 17:44:54,833 INFO [train.py:994] (1/2) Epoch 22, batch 450, loss[loss=0.1658, simple_loss=0.242, pruned_loss=0.04481, over 24373.00 frames. ], tot_loss[loss=0.1644, simple_loss=0.2418, pruned_loss=0.04348, over 4312731.63 frames. ], batch size: 275, lr: 1.82e-02, grad_scale: 32.0 +2024-01-15 17:45:01,740 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.2.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 17:45:50,945 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.hidden_balancer.prob, batch_count=60643.333333333336, ans=0.125 +2024-01-15 17:45:53,972 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.whiten, num_groups=1, num_channels=384, metric=2.93 vs. 
limit=12.0 +2024-01-15 17:45:56,110 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer2.prob, batch_count=60676.666666666664, ans=0.125 +2024-01-15 17:45:56,875 INFO [train.py:994] (1/2) Epoch 22, batch 500, loss[loss=0.1743, simple_loss=0.25, pruned_loss=0.04933, over 24541.00 frames. ], tot_loss[loss=0.1639, simple_loss=0.2413, pruned_loss=0.04324, over 4416625.18 frames. ], batch size: 236, lr: 1.82e-02, grad_scale: 32.0 +2024-01-15 17:45:58,221 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass.skip_rate, batch_count=60676.666666666664, ans=0.035 +2024-01-15 17:46:07,250 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff2_skip_rate, batch_count=60676.666666666664, ans=0.0 +2024-01-15 17:46:27,589 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer2.prob, batch_count=60743.333333333336, ans=0.125 +2024-01-15 17:46:28,279 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.658e+02 2.049e+02 2.276e+02 2.528e+02 4.336e+02, threshold=4.552e+02, percent-clipped=0.0 +2024-01-15 17:46:33,342 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.attention_skip_rate, batch_count=60776.666666666664, ans=0.0 +2024-01-15 17:46:35,726 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff2_skip_rate, batch_count=60776.666666666664, ans=0.0 +2024-01-15 17:46:35,776 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.skip_rate, batch_count=60776.666666666664, ans=0.09899494936611666 +2024-01-15 17:46:41,647 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=24.98 vs. limit=22.5 +2024-01-15 17:46:59,012 INFO [train.py:994] (1/2) Epoch 22, batch 550, loss[loss=0.173, simple_loss=0.2496, pruned_loss=0.04818, over 24221.00 frames. ], tot_loss[loss=0.1641, simple_loss=0.2415, pruned_loss=0.04331, over 4513905.90 frames. ], batch size: 311, lr: 1.82e-02, grad_scale: 32.0 +2024-01-15 17:47:12,208 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward2.out_whiten.whitening_limit, batch_count=60876.666666666664, ans=15.0 +2024-01-15 17:47:15,206 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=60876.666666666664, ans=0.125 +2024-01-15 17:47:36,154 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=384, metric=2.80 vs. limit=15.0 +2024-01-15 17:47:54,700 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=512, metric=17.55 vs. limit=22.5 +2024-01-15 17:48:01,714 INFO [train.py:994] (1/2) Epoch 22, batch 600, loss[loss=0.1742, simple_loss=0.2466, pruned_loss=0.05086, over 24480.00 frames. ], tot_loss[loss=0.1644, simple_loss=0.2418, pruned_loss=0.04352, over 4571947.82 frames. 
], batch size: 181, lr: 1.82e-02, grad_scale: 16.0 +2024-01-15 17:48:02,008 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=61010.0, ans=0.125 +2024-01-15 17:48:33,918 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.577e+02 2.035e+02 2.386e+02 2.905e+02 4.646e+02, threshold=4.772e+02, percent-clipped=1.0 +2024-01-15 17:49:03,445 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff2_skip_rate, batch_count=61176.666666666664, ans=0.0 +2024-01-15 17:49:04,293 INFO [train.py:994] (1/2) Epoch 22, batch 650, loss[loss=0.1642, simple_loss=0.2425, pruned_loss=0.04298, over 24258.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.2416, pruned_loss=0.0435, over 4626313.05 frames. ], batch size: 311, lr: 1.81e-02, grad_scale: 16.0 +2024-01-15 17:49:06,956 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.out_proj.dropout_p, batch_count=61176.666666666664, ans=0.1 +2024-01-15 17:49:09,293 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff2_skip_rate, batch_count=61176.666666666664, ans=0.0 +2024-01-15 17:49:19,391 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=61210.0, ans=0.125 +2024-01-15 17:49:42,361 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=192, metric=8.75 vs. limit=15.0 +2024-01-15 17:49:44,134 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=61276.666666666664, ans=0.1 +2024-01-15 17:50:06,523 INFO [train.py:994] (1/2) Epoch 22, batch 700, loss[loss=0.1815, simple_loss=0.2628, pruned_loss=0.05009, over 23862.00 frames. ], tot_loss[loss=0.1639, simple_loss=0.2412, pruned_loss=0.04332, over 4665309.27 frames. ], batch size: 328, lr: 1.81e-02, grad_scale: 16.0 +2024-01-15 17:50:15,290 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=61343.333333333336, ans=0.0 +2024-01-15 17:50:36,494 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=61410.0, ans=0.125 +2024-01-15 17:50:38,474 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.674e+02 2.041e+02 2.372e+02 2.713e+02 3.962e+02, threshold=4.745e+02, percent-clipped=0.0 +2024-01-15 17:51:07,956 INFO [train.py:994] (1/2) Epoch 22, batch 750, loss[loss=0.1756, simple_loss=0.2482, pruned_loss=0.05149, over 24384.00 frames. ], tot_loss[loss=0.1641, simple_loss=0.2415, pruned_loss=0.04333, over 4701407.48 frames. ], batch size: 153, lr: 1.81e-02, grad_scale: 16.0 +2024-01-15 17:51:32,077 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer2.prob, batch_count=61576.666666666664, ans=0.125 +2024-01-15 17:51:50,436 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer2.min_positive, batch_count=61610.0, ans=0.05 +2024-01-15 17:51:56,135 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=5.47 vs. 
limit=15.0 +2024-01-15 17:52:07,929 INFO [train.py:994] (1/2) Epoch 22, batch 800, loss[loss=0.1747, simple_loss=0.2522, pruned_loss=0.0486, over 24513.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.2419, pruned_loss=0.04333, over 4723271.09 frames. ], batch size: 210, lr: 1.81e-02, grad_scale: 32.0 +2024-01-15 17:52:24,772 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=384, metric=22.59 vs. limit=22.5 +2024-01-15 17:52:27,606 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer2.min_positive, batch_count=61710.0, ans=0.05 +2024-01-15 17:52:35,881 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.bypass.skip_rate, batch_count=61743.333333333336, ans=0.07 +2024-01-15 17:52:37,905 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.710e+02 2.029e+02 2.222e+02 2.719e+02 4.293e+02, threshold=4.444e+02, percent-clipped=0.0 +2024-01-15 17:52:42,586 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 17:52:42,670 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass_mid.scale_min, batch_count=61776.666666666664, ans=0.2 +2024-01-15 17:53:15,432 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=11.16 vs. limit=15.0 +2024-01-15 17:53:19,401 INFO [train.py:994] (1/2) Epoch 23, batch 0, loss[loss=0.1572, simple_loss=0.2368, pruned_loss=0.03878, over 24408.00 frames. ], tot_loss[loss=0.1572, simple_loss=0.2368, pruned_loss=0.03878, over 24408.00 frames. ], batch size: 286, lr: 1.77e-02, grad_scale: 32.0 +2024-01-15 17:53:19,402 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 17:53:40,001 INFO [train.py:1026] (1/2) Epoch 23, validation: loss=0.1681, simple_loss=0.2529, pruned_loss=0.0416, over 1622729.00 frames. +2024-01-15 17:53:40,002 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 17:53:41,557 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=61820.0, ans=0.1 +2024-01-15 17:53:49,457 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=384, metric=2.59 vs. limit=15.0 +2024-01-15 17:54:15,726 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=61886.666666666664, ans=0.125 +2024-01-15 17:54:42,855 INFO [train.py:994] (1/2) Epoch 23, batch 50, loss[loss=0.1543, simple_loss=0.2333, pruned_loss=0.03766, over 24506.00 frames. ], tot_loss[loss=0.1622, simple_loss=0.2399, pruned_loss=0.0423, over 1089665.72 frames. 
], batch size: 267, lr: 1.77e-02, grad_scale: 32.0 +2024-01-15 17:54:46,732 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=61986.666666666664, ans=0.1 +2024-01-15 17:55:00,047 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.skip_rate, batch_count=62020.0, ans=0.07 +2024-01-15 17:55:10,554 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=62053.333333333336, ans=0.0 +2024-01-15 17:55:10,584 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.2.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 17:55:24,020 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=4.51 vs. limit=10.0 +2024-01-15 17:55:24,543 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.702e+02 2.035e+02 2.267e+02 2.598e+02 4.096e+02, threshold=4.534e+02, percent-clipped=0.0 +2024-01-15 17:55:34,049 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.min_positive, batch_count=62120.0, ans=0.05 +2024-01-15 17:55:35,293 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=62120.0, ans=0.125 +2024-01-15 17:55:42,523 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=13.38 vs. limit=15.0 +2024-01-15 17:55:44,913 INFO [train.py:994] (1/2) Epoch 23, batch 100, loss[loss=0.1604, simple_loss=0.2346, pruned_loss=0.0431, over 24601.00 frames. ], tot_loss[loss=0.1624, simple_loss=0.2402, pruned_loss=0.04233, over 1920198.34 frames. ], batch size: 199, lr: 1.77e-02, grad_scale: 32.0 +2024-01-15 17:55:49,242 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.min_positive, batch_count=62153.333333333336, ans=0.05 +2024-01-15 17:55:58,803 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer2.prob, batch_count=62186.666666666664, ans=0.125 +2024-01-15 17:56:46,319 INFO [train.py:994] (1/2) Epoch 23, batch 150, loss[loss=0.1686, simple_loss=0.2482, pruned_loss=0.04447, over 24486.00 frames. ], tot_loss[loss=0.1614, simple_loss=0.2394, pruned_loss=0.04171, over 2560135.32 frames. ], batch size: 181, lr: 1.77e-02, grad_scale: 32.0 +2024-01-15 17:56:48,274 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=5.15 vs. limit=6.0 +2024-01-15 17:56:55,920 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer1.max_abs, batch_count=62320.0, ans=10.0 +2024-01-15 17:57:01,328 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer1.prob, batch_count=62353.333333333336, ans=0.125 +2024-01-15 17:57:26,685 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.664e+02 1.927e+02 2.204e+02 2.443e+02 3.212e+02, threshold=4.407e+02, percent-clipped=0.0 +2024-01-15 17:57:47,201 INFO [train.py:994] (1/2) Epoch 23, batch 200, loss[loss=0.1725, simple_loss=0.2506, pruned_loss=0.04715, over 24545.00 frames. 
], tot_loss[loss=0.161, simple_loss=0.239, pruned_loss=0.04147, over 3059204.81 frames. ], batch size: 176, lr: 1.76e-02, grad_scale: 32.0 +2024-01-15 17:57:48,779 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.prob, batch_count=62486.666666666664, ans=0.125 +2024-01-15 17:57:59,595 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.attention_skip_rate, batch_count=62520.0, ans=0.0 +2024-01-15 17:58:22,059 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=62553.333333333336, ans=0.1 +2024-01-15 17:58:29,331 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff2_skip_rate, batch_count=62586.666666666664, ans=0.0 +2024-01-15 17:58:31,598 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff2_skip_rate, batch_count=62586.666666666664, ans=0.0 +2024-01-15 17:58:45,853 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.hidden_balancer.prob, batch_count=62620.0, ans=0.125 +2024-01-15 17:58:50,411 INFO [train.py:994] (1/2) Epoch 23, batch 250, loss[loss=0.1769, simple_loss=0.2515, pruned_loss=0.05122, over 24517.00 frames. ], tot_loss[loss=0.1615, simple_loss=0.2397, pruned_loss=0.04169, over 3455979.86 frames. ], batch size: 165, lr: 1.76e-02, grad_scale: 32.0 +2024-01-15 17:59:31,373 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.703e+02 1.957e+02 2.240e+02 2.593e+02 5.482e+02, threshold=4.480e+02, percent-clipped=1.0 +2024-01-15 17:59:47,766 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward3.hidden_balancer.prob, batch_count=62786.666666666664, ans=0.125 +2024-01-15 17:59:52,185 INFO [train.py:994] (1/2) Epoch 23, batch 300, loss[loss=0.1694, simple_loss=0.2442, pruned_loss=0.04728, over 24507.00 frames. ], tot_loss[loss=0.1617, simple_loss=0.2396, pruned_loss=0.04194, over 3750775.53 frames. ], batch size: 243, lr: 1.76e-02, grad_scale: 32.0 +2024-01-15 18:00:04,578 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=5.19 vs. limit=15.0 +2024-01-15 18:00:20,754 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.hidden_balancer.prob, batch_count=62886.666666666664, ans=0.125 +2024-01-15 18:00:33,964 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.balancer.max_positive, batch_count=62920.0, ans=0.95 +2024-01-15 18:00:49,270 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.balancer.max_positive, batch_count=62953.333333333336, ans=0.95 +2024-01-15 18:00:54,352 INFO [train.py:994] (1/2) Epoch 23, batch 350, loss[loss=0.1614, simple_loss=0.2414, pruned_loss=0.04067, over 24625.00 frames. ], tot_loss[loss=0.1613, simple_loss=0.2391, pruned_loss=0.04172, over 3987983.88 frames. 
], batch size: 199, lr: 1.76e-02, grad_scale: 16.0 +2024-01-15 18:00:57,818 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=62986.666666666664, ans=0.125 +2024-01-15 18:00:58,872 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.balancer2.prob, batch_count=62986.666666666664, ans=0.125 +2024-01-15 18:01:27,145 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer1.prob, batch_count=63053.333333333336, ans=0.125 +2024-01-15 18:01:30,588 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=63086.666666666664, ans=0.125 +2024-01-15 18:01:36,163 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.633e+02 2.040e+02 2.331e+02 2.751e+02 4.067e+02, threshold=4.662e+02, percent-clipped=0.0 +2024-01-15 18:01:38,715 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=63086.666666666664, ans=0.1 +2024-01-15 18:01:41,935 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer1.prob, batch_count=63086.666666666664, ans=0.125 +2024-01-15 18:01:43,121 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer2.min_positive, batch_count=63120.0, ans=0.05 +2024-01-15 18:01:48,359 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer1.prob, batch_count=63120.0, ans=0.125 +2024-01-15 18:01:56,453 INFO [train.py:994] (1/2) Epoch 23, batch 400, loss[loss=0.1724, simple_loss=0.2563, pruned_loss=0.04423, over 24318.00 frames. ], tot_loss[loss=0.1612, simple_loss=0.2391, pruned_loss=0.04164, over 4179139.13 frames. ], batch size: 285, lr: 1.76e-02, grad_scale: 32.0 +2024-01-15 18:02:02,685 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.balancer2.prob, batch_count=63153.333333333336, ans=0.125 +2024-01-15 18:02:05,633 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.0.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 18:02:25,352 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer_ff2.min_abs, batch_count=63220.0, ans=0.1 +2024-01-15 18:02:50,401 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=4.06 vs. limit=6.0 +2024-01-15 18:02:58,696 INFO [train.py:994] (1/2) Epoch 23, batch 450, loss[loss=0.1639, simple_loss=0.2445, pruned_loss=0.04168, over 24491.00 frames. ], tot_loss[loss=0.1606, simple_loss=0.2385, pruned_loss=0.04136, over 4323350.36 frames. ], batch size: 229, lr: 1.75e-02, grad_scale: 32.0 +2024-01-15 18:03:17,959 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=11.64 vs. 
limit=15.0 +2024-01-15 18:03:41,525 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.613e+02 1.977e+02 2.334e+02 2.752e+02 4.665e+02, threshold=4.668e+02, percent-clipped=1.0 +2024-01-15 18:03:53,056 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer2.prob, batch_count=63453.333333333336, ans=0.125 +2024-01-15 18:04:01,685 INFO [train.py:994] (1/2) Epoch 23, batch 500, loss[loss=0.1581, simple_loss=0.2405, pruned_loss=0.03783, over 24316.00 frames. ], tot_loss[loss=0.161, simple_loss=0.2389, pruned_loss=0.04149, over 4435343.36 frames. ], batch size: 285, lr: 1.75e-02, grad_scale: 32.0 +2024-01-15 18:04:08,608 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.36 vs. limit=6.0 +2024-01-15 18:04:52,833 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer1.prob, batch_count=63620.0, ans=0.125 +2024-01-15 18:04:57,502 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=63620.0, ans=0.0 +2024-01-15 18:04:59,940 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer2.prob, batch_count=63620.0, ans=0.125 +2024-01-15 18:05:03,140 INFO [train.py:994] (1/2) Epoch 23, batch 550, loss[loss=0.1365, simple_loss=0.2116, pruned_loss=0.03066, over 23977.00 frames. ], tot_loss[loss=0.1607, simple_loss=0.2384, pruned_loss=0.04147, over 4517459.29 frames. ], batch size: 131, lr: 1.75e-02, grad_scale: 32.0 +2024-01-15 18:05:17,558 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.skip_rate, batch_count=63686.666666666664, ans=0.07 +2024-01-15 18:05:31,398 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=63720.0, ans=0.1 +2024-01-15 18:05:33,750 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=63720.0, ans=0.125 +2024-01-15 18:05:38,421 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.bypass.scale_min, batch_count=63720.0, ans=0.2 +2024-01-15 18:05:43,837 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.prob, batch_count=63753.333333333336, ans=0.125 +2024-01-15 18:05:45,852 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.552e+02 2.008e+02 2.265e+02 2.948e+02 4.653e+02, threshold=4.530e+02, percent-clipped=0.0 +2024-01-15 18:05:47,312 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff3_skip_rate, batch_count=63753.333333333336, ans=0.0 +2024-01-15 18:05:52,014 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.2.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 18:05:53,098 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass.scale_min, batch_count=63786.666666666664, ans=0.2 +2024-01-15 18:06:06,065 INFO [train.py:994] (1/2) Epoch 23, batch 600, loss[loss=0.1652, simple_loss=0.2366, pruned_loss=0.04691, over 24415.00 frames. ], tot_loss[loss=0.1602, simple_loss=0.2378, pruned_loss=0.04123, over 4572233.12 frames. 
], batch size: 153, lr: 1.75e-02, grad_scale: 32.0 +2024-01-15 18:06:11,147 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.scale_min, batch_count=63820.0, ans=0.2 +2024-01-15 18:06:13,504 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.balancer2.prob, batch_count=63820.0, ans=0.125 +2024-01-15 18:06:24,560 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer1.prob, batch_count=63853.333333333336, ans=0.125 +2024-01-15 18:06:36,917 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer1.prob, batch_count=63886.666666666664, ans=0.125 +2024-01-15 18:07:03,066 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.prob, batch_count=63953.333333333336, ans=0.125 +2024-01-15 18:07:05,412 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.0.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 18:07:07,547 INFO [train.py:994] (1/2) Epoch 23, batch 650, loss[loss=0.1516, simple_loss=0.2305, pruned_loss=0.0364, over 24475.00 frames. ], tot_loss[loss=0.1604, simple_loss=0.2382, pruned_loss=0.04137, over 4626627.80 frames. ], batch size: 181, lr: 1.75e-02, grad_scale: 32.0 +2024-01-15 18:07:23,496 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.ff3_skip_rate, batch_count=64020.0, ans=0.0 +2024-01-15 18:07:24,884 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.conv_module1.whiten, num_groups=1, num_channels=512, metric=5.74 vs. limit=15.0 +2024-01-15 18:07:27,410 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=10.34 vs. limit=15.0 +2024-01-15 18:07:50,192 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.541e+02 1.943e+02 2.139e+02 2.607e+02 5.108e+02, threshold=4.278e+02, percent-clipped=1.0 +2024-01-15 18:08:01,256 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_skip_rate, batch_count=64120.0, ans=0.0 +2024-01-15 18:08:09,941 INFO [train.py:994] (1/2) Epoch 23, batch 700, loss[loss=0.1676, simple_loss=0.2434, pruned_loss=0.04594, over 24462.00 frames. ], tot_loss[loss=0.1605, simple_loss=0.2379, pruned_loss=0.04158, over 4639344.35 frames. ], batch size: 222, lr: 1.74e-02, grad_scale: 32.0 +2024-01-15 18:08:15,678 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff2_skip_rate, batch_count=64153.333333333336, ans=0.0 +2024-01-15 18:08:23,214 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.whiten1.whitening_limit, batch_count=64186.666666666664, ans=10.0 +2024-01-15 18:08:46,705 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass_mid.scale_min, batch_count=64253.333333333336, ans=0.2 +2024-01-15 18:09:00,170 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.attention_skip_rate, batch_count=64286.666666666664, ans=0.0 +2024-01-15 18:09:03,172 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=192, metric=10.86 vs. limit=15.0 +2024-01-15 18:09:12,574 INFO [train.py:994] (1/2) Epoch 23, batch 750, loss[loss=0.1735, simple_loss=0.2489, pruned_loss=0.04903, over 24464.00 frames. 
], tot_loss[loss=0.1605, simple_loss=0.238, pruned_loss=0.04153, over 4665058.67 frames. ], batch size: 170, lr: 1.74e-02, grad_scale: 32.0 +2024-01-15 18:09:53,920 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.497e+02 2.042e+02 2.299e+02 2.655e+02 3.861e+02, threshold=4.598e+02, percent-clipped=0.0 +2024-01-15 18:10:00,607 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_skip_rate, batch_count=64453.333333333336, ans=0.0 +2024-01-15 18:10:12,656 INFO [train.py:994] (1/2) Epoch 23, batch 800, loss[loss=0.1771, simple_loss=0.2584, pruned_loss=0.04797, over 23899.00 frames. ], tot_loss[loss=0.1605, simple_loss=0.2379, pruned_loss=0.04153, over 4691043.00 frames. ], batch size: 328, lr: 1.74e-02, grad_scale: 32.0 +2024-01-15 18:10:14,514 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.whiten, num_groups=1, num_channels=512, metric=4.10 vs. limit=12.0 +2024-01-15 18:10:18,278 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer1.prob, batch_count=64486.666666666664, ans=0.125 +2024-01-15 18:10:33,884 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=64520.0, ans=0.125 +2024-01-15 18:10:33,889 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=64520.0, ans=0.1 +2024-01-15 18:11:25,227 INFO [train.py:994] (1/2) Epoch 24, batch 0, loss[loss=0.1635, simple_loss=0.2388, pruned_loss=0.04412, over 24350.00 frames. ], tot_loss[loss=0.1635, simple_loss=0.2388, pruned_loss=0.04412, over 24350.00 frames. ], batch size: 153, lr: 1.71e-02, grad_scale: 32.0 +2024-01-15 18:11:25,229 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 18:11:45,821 INFO [train.py:1026] (1/2) Epoch 24, validation: loss=0.1707, simple_loss=0.2556, pruned_loss=0.04287, over 1622729.00 frames. +2024-01-15 18:11:45,822 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 18:11:50,819 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 18:12:03,573 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.max_abs, batch_count=64663.333333333336, ans=10.0 +2024-01-15 18:12:03,889 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=10.97 vs. limit=15.0 +2024-01-15 18:12:06,024 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.skip_rate, batch_count=64663.333333333336, ans=0.07 +2024-01-15 18:12:10,285 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.balancer2.prob, batch_count=64696.666666666664, ans=0.125 +2024-01-15 18:12:31,069 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.ff3_skip_rate, batch_count=64730.0, ans=0.0 +2024-01-15 18:12:37,383 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.627e+02 2.033e+02 2.201e+02 2.614e+02 3.866e+02, threshold=4.401e+02, percent-clipped=0.0 +2024-01-15 18:12:47,970 INFO [train.py:994] (1/2) Epoch 24, batch 50, loss[loss=0.1661, simple_loss=0.2471, pruned_loss=0.04255, over 24599.00 frames. ], tot_loss[loss=0.157, simple_loss=0.2351, pruned_loss=0.03943, over 1085967.34 frames. 
], batch size: 199, lr: 1.70e-02, grad_scale: 32.0 +2024-01-15 18:13:01,802 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=192, metric=12.01 vs. limit=15.0 +2024-01-15 18:13:05,980 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.out_proj.dropout_p, batch_count=64830.0, ans=0.1 +2024-01-15 18:13:05,982 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.ff2_skip_rate, batch_count=64830.0, ans=0.0 +2024-01-15 18:13:34,563 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 18:13:37,351 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=384, metric=7.30 vs. limit=15.0 +2024-01-15 18:13:50,496 INFO [train.py:994] (1/2) Epoch 24, batch 100, loss[loss=0.152, simple_loss=0.2317, pruned_loss=0.03611, over 24505.00 frames. ], tot_loss[loss=0.1582, simple_loss=0.2358, pruned_loss=0.0403, over 1903701.60 frames. ], batch size: 210, lr: 1.70e-02, grad_scale: 32.0 +2024-01-15 18:13:54,260 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=13.58 vs. limit=22.5 +2024-01-15 18:13:57,292 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.ff2_skip_rate, batch_count=64963.333333333336, ans=0.0 +2024-01-15 18:14:25,011 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=65030.0, ans=0.0 +2024-01-15 18:14:31,257 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.bypass_mid.scale_min, batch_count=65063.333333333336, ans=0.2 +2024-01-15 18:14:37,188 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass_mid.scale_min, batch_count=65063.333333333336, ans=0.2 +2024-01-15 18:14:42,973 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.411e+02 1.963e+02 2.259e+02 2.725e+02 4.452e+02, threshold=4.517e+02, percent-clipped=1.0 +2024-01-15 18:14:53,854 INFO [train.py:994] (1/2) Epoch 24, batch 150, loss[loss=0.167, simple_loss=0.251, pruned_loss=0.04151, over 24527.00 frames. ], tot_loss[loss=0.1578, simple_loss=0.2357, pruned_loss=0.03996, over 2549425.85 frames. ], batch size: 204, lr: 1.70e-02, grad_scale: 32.0 +2024-01-15 18:15:33,837 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.balancer2.prob, batch_count=65230.0, ans=0.125 +2024-01-15 18:15:34,970 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer1.prob, batch_count=65230.0, ans=0.125 +2024-01-15 18:15:50,098 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=65263.333333333336, ans=0.125 +2024-01-15 18:15:52,471 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.scale_min, batch_count=65263.333333333336, ans=0.2 +2024-01-15 18:15:55,775 INFO [train.py:994] (1/2) Epoch 24, batch 200, loss[loss=0.1745, simple_loss=0.2501, pruned_loss=0.04947, over 24513.00 frames. ], tot_loss[loss=0.1588, simple_loss=0.2368, pruned_loss=0.04043, over 3047588.34 frames. 
], batch size: 229, lr: 1.70e-02, grad_scale: 32.0 +2024-01-15 18:15:57,881 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff2_skip_rate, batch_count=65296.666666666664, ans=0.0 +2024-01-15 18:16:04,337 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.out_combiner.scale_min, batch_count=65296.666666666664, ans=0.2 +2024-01-15 18:16:05,346 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass_mid.scale_min, batch_count=65296.666666666664, ans=0.2 +2024-01-15 18:16:38,881 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer2.prob, batch_count=65396.666666666664, ans=0.125 +2024-01-15 18:16:46,747 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.618e+02 1.863e+02 2.065e+02 2.277e+02 3.194e+02, threshold=4.131e+02, percent-clipped=0.0 +2024-01-15 18:16:58,442 INFO [train.py:994] (1/2) Epoch 24, batch 250, loss[loss=0.1557, simple_loss=0.2415, pruned_loss=0.03491, over 24332.00 frames. ], tot_loss[loss=0.1589, simple_loss=0.2373, pruned_loss=0.04023, over 3447756.27 frames. ], batch size: 285, lr: 1.70e-02, grad_scale: 32.0 +2024-01-15 18:17:04,556 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer1.prob, batch_count=65463.333333333336, ans=0.125 +2024-01-15 18:17:54,438 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer2.prob, batch_count=65596.66666666667, ans=0.125 +2024-01-15 18:18:00,113 INFO [train.py:994] (1/2) Epoch 24, batch 300, loss[loss=0.1675, simple_loss=0.2464, pruned_loss=0.04436, over 24513.00 frames. ], tot_loss[loss=0.158, simple_loss=0.2364, pruned_loss=0.03978, over 3741338.72 frames. ], batch size: 210, lr: 1.70e-02, grad_scale: 32.0 +2024-01-15 18:18:23,210 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward2.hidden_balancer.prob, batch_count=65696.66666666667, ans=0.125 +2024-01-15 18:18:30,554 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.balancer1.prob, batch_count=65696.66666666667, ans=0.125 +2024-01-15 18:18:45,785 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=12.64 vs. limit=15.0 +2024-01-15 18:18:50,111 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.726e+02 1.979e+02 2.184e+02 2.464e+02 3.972e+02, threshold=4.368e+02, percent-clipped=0.0 +2024-01-15 18:18:51,727 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff3_skip_rate, batch_count=65763.33333333333, ans=0.0 +2024-01-15 18:18:52,829 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.scale_min, batch_count=65763.33333333333, ans=0.2 +2024-01-15 18:19:02,059 INFO [train.py:994] (1/2) Epoch 24, batch 350, loss[loss=0.1674, simple_loss=0.2483, pruned_loss=0.04328, over 24489.00 frames. ], tot_loss[loss=0.1587, simple_loss=0.2373, pruned_loss=0.04007, over 3974239.57 frames. 
], batch size: 210, lr: 1.69e-02, grad_scale: 32.0 +2024-01-15 18:19:03,627 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=65796.66666666667, ans=0.125 +2024-01-15 18:19:55,622 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=65930.0, ans=0.125 +2024-01-15 18:20:02,430 INFO [train.py:994] (1/2) Epoch 24, batch 400, loss[loss=0.1615, simple_loss=0.2425, pruned_loss=0.04029, over 24491.00 frames. ], tot_loss[loss=0.1583, simple_loss=0.237, pruned_loss=0.03977, over 4167641.37 frames. ], batch size: 229, lr: 1.69e-02, grad_scale: 32.0 +2024-01-15 18:20:12,484 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer1.prob, batch_count=65963.33333333333, ans=0.125 +2024-01-15 18:20:19,804 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff2_skip_rate, batch_count=65996.66666666667, ans=0.0 +2024-01-15 18:20:27,179 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=22.94 vs. limit=22.5 +2024-01-15 18:20:27,373 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn1.whiten, num_groups=1, num_channels=192, metric=14.60 vs. limit=22.5 +2024-01-15 18:20:41,869 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.whiten, num_groups=1, num_channels=512, metric=3.53 vs. limit=12.0 +2024-01-15 18:20:53,874 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.573e+02 1.999e+02 2.258e+02 2.615e+02 4.418e+02, threshold=4.515e+02, percent-clipped=1.0 +2024-01-15 18:21:04,454 INFO [train.py:994] (1/2) Epoch 24, batch 450, loss[loss=0.1633, simple_loss=0.2407, pruned_loss=0.043, over 24499.00 frames. ], tot_loss[loss=0.1589, simple_loss=0.2374, pruned_loss=0.04013, over 4310837.13 frames. ], batch size: 216, lr: 1.69e-02, grad_scale: 32.0 +2024-01-15 18:21:11,847 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=66130.0, ans=0.125 +2024-01-15 18:21:22,645 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass.skip_rate, batch_count=66163.33333333333, ans=0.04949747468305833 +2024-01-15 18:21:47,832 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=66230.0, ans=0.1 +2024-01-15 18:21:50,558 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.whiten, num_groups=1, num_channels=512, metric=6.30 vs. limit=12.0 +2024-01-15 18:22:07,792 INFO [train.py:994] (1/2) Epoch 24, batch 500, loss[loss=0.1655, simple_loss=0.2431, pruned_loss=0.04396, over 24228.00 frames. ], tot_loss[loss=0.1586, simple_loss=0.2371, pruned_loss=0.0401, over 4418142.69 frames. 
], batch size: 311, lr: 1.69e-02, grad_scale: 32.0 +2024-01-15 18:22:16,357 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer1.prob, batch_count=66296.66666666667, ans=0.125 +2024-01-15 18:22:31,164 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.nonlin_attention.balancer.min_positive, batch_count=66330.0, ans=0.05 +2024-01-15 18:22:59,403 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.575e+02 2.052e+02 2.388e+02 2.760e+02 4.154e+02, threshold=4.776e+02, percent-clipped=0.0 +2024-01-15 18:22:59,706 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer2.prob, batch_count=66430.0, ans=0.125 +2024-01-15 18:23:10,360 INFO [train.py:994] (1/2) Epoch 24, batch 550, loss[loss=0.1606, simple_loss=0.2348, pruned_loss=0.04321, over 24496.00 frames. ], tot_loss[loss=0.1592, simple_loss=0.2376, pruned_loss=0.04044, over 4502447.16 frames. ], batch size: 165, lr: 1.69e-02, grad_scale: 32.0 +2024-01-15 18:23:18,697 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=11.22 vs. limit=15.0 +2024-01-15 18:23:19,732 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=66463.33333333333, ans=0.0 +2024-01-15 18:23:41,269 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.prob, batch_count=66530.0, ans=0.125 +2024-01-15 18:24:00,032 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.5.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 18:24:14,006 INFO [train.py:994] (1/2) Epoch 24, batch 600, loss[loss=0.1752, simple_loss=0.2569, pruned_loss=0.04673, over 23788.00 frames. ], tot_loss[loss=0.159, simple_loss=0.2374, pruned_loss=0.04026, over 4570339.29 frames. ], batch size: 328, lr: 1.68e-02, grad_scale: 32.0 +2024-01-15 18:24:14,301 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer2.min_positive, batch_count=66630.0, ans=0.05 +2024-01-15 18:24:19,002 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=66630.0, ans=0.1 +2024-01-15 18:24:32,572 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.min_abs, batch_count=66663.33333333333, ans=0.5 +2024-01-15 18:24:52,700 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=66730.0, ans=0.0 +2024-01-15 18:25:01,550 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.prob, batch_count=66730.0, ans=0.125 +2024-01-15 18:25:02,824 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff3_skip_rate, batch_count=66730.0, ans=0.0 +2024-01-15 18:25:07,911 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.583e+02 1.946e+02 2.215e+02 2.637e+02 3.675e+02, threshold=4.429e+02, percent-clipped=0.0 +2024-01-15 18:25:18,670 INFO [train.py:994] (1/2) Epoch 24, batch 650, loss[loss=0.157, simple_loss=0.2371, pruned_loss=0.03847, over 24495.00 frames. ], tot_loss[loss=0.1594, simple_loss=0.238, pruned_loss=0.04038, over 4631837.44 frames. 
], batch size: 187, lr: 1.68e-02, grad_scale: 32.0 +2024-01-15 18:25:36,851 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass_mid.scale_min, batch_count=66830.0, ans=0.2 +2024-01-15 18:25:38,147 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_skip_rate, batch_count=66830.0, ans=0.0 +2024-01-15 18:26:01,769 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=5.02 vs. limit=10.0 +2024-01-15 18:26:21,261 INFO [train.py:994] (1/2) Epoch 24, batch 700, loss[loss=0.156, simple_loss=0.2355, pruned_loss=0.03825, over 24486.00 frames. ], tot_loss[loss=0.1588, simple_loss=0.2373, pruned_loss=0.04017, over 4663593.45 frames. ], batch size: 222, lr: 1.68e-02, grad_scale: 32.0 +2024-01-15 18:26:42,521 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=66996.66666666667, ans=0.125 +2024-01-15 18:27:05,637 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=192, metric=13.19 vs. limit=15.0 +2024-01-15 18:27:06,133 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer2.prob, batch_count=67063.33333333333, ans=0.125 +2024-01-15 18:27:10,513 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.attention_skip_rate, batch_count=67063.33333333333, ans=0.0 +2024-01-15 18:27:13,876 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.651e+02 1.976e+02 2.136e+02 2.638e+02 3.753e+02, threshold=4.272e+02, percent-clipped=0.0 +2024-01-15 18:27:19,955 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.8.prob, batch_count=67096.66666666667, ans=0.125 +2024-01-15 18:27:25,232 INFO [train.py:994] (1/2) Epoch 24, batch 750, loss[loss=0.1605, simple_loss=0.237, pruned_loss=0.04199, over 24546.00 frames. ], tot_loss[loss=0.1586, simple_loss=0.2368, pruned_loss=0.04015, over 4685492.91 frames. ], batch size: 193, lr: 1.68e-02, grad_scale: 32.0 +2024-01-15 18:27:56,571 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.whiten, num_groups=1, num_channels=192, metric=3.77 vs. limit=12.0 +2024-01-15 18:27:57,200 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=67196.66666666667, ans=0.1 +2024-01-15 18:27:59,782 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=6.54 vs. limit=10.0 +2024-01-15 18:28:03,056 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=384, metric=2.67 vs. 
limit=15.0 +2024-01-15 18:28:05,551 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff2_skip_rate, batch_count=67230.0, ans=0.0 +2024-01-15 18:28:08,996 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass_mid.scale_min, batch_count=67230.0, ans=0.2 +2024-01-15 18:28:13,430 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer2.prob, batch_count=67263.33333333333, ans=0.125 +2024-01-15 18:28:21,205 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward3.hidden_balancer.prob, batch_count=67263.33333333333, ans=0.125 +2024-01-15 18:28:24,255 INFO [train.py:994] (1/2) Epoch 24, batch 800, loss[loss=0.1434, simple_loss=0.2197, pruned_loss=0.0335, over 24180.00 frames. ], tot_loss[loss=0.158, simple_loss=0.2362, pruned_loss=0.03989, over 4706021.56 frames. ], batch size: 140, lr: 1.68e-02, grad_scale: 32.0 +2024-01-15 18:28:28,344 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.hidden_balancer.prob, batch_count=67296.66666666667, ans=0.125 +2024-01-15 18:28:33,433 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=512, metric=2.43 vs. limit=15.0 +2024-01-15 18:28:35,325 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.scale_min, batch_count=67330.0, ans=0.2 +2024-01-15 18:28:45,523 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=5.64 vs. limit=10.0 +2024-01-15 18:28:56,524 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.whiten, num_groups=1, num_channels=384, metric=2.52 vs. limit=12.0 +2024-01-15 18:29:02,855 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=67396.66666666667, ans=0.0 +2024-01-15 18:29:03,850 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer2.prob, batch_count=67396.66666666667, ans=0.125 +2024-01-15 18:29:05,025 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=67396.66666666667, ans=0.125 +2024-01-15 18:29:11,604 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.577e+02 1.963e+02 2.169e+02 2.600e+02 5.090e+02, threshold=4.339e+02, percent-clipped=1.0 +2024-01-15 18:29:36,195 INFO [train.py:994] (1/2) Epoch 25, batch 0, loss[loss=0.155, simple_loss=0.2342, pruned_loss=0.03791, over 24464.00 frames. ], tot_loss[loss=0.155, simple_loss=0.2342, pruned_loss=0.03791, over 24464.00 frames. ], batch size: 222, lr: 1.65e-02, grad_scale: 32.0 +2024-01-15 18:29:36,195 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 18:29:55,379 INFO [train.py:1026] (1/2) Epoch 25, validation: loss=0.1673, simple_loss=0.2515, pruned_loss=0.04159, over 1622729.00 frames. +2024-01-15 18:29:55,380 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 18:29:56,064 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=512, metric=19.76 vs. 
limit=22.5 +2024-01-15 18:30:08,313 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 18:30:19,912 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=3.29 vs. limit=15.0 +2024-01-15 18:30:26,211 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.54 vs. limit=6.0 +2024-01-15 18:30:31,425 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.attention_skip_rate, batch_count=67540.0, ans=0.0 +2024-01-15 18:30:36,816 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=67540.0, ans=0.0 +2024-01-15 18:30:41,576 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer2.prob, batch_count=67540.0, ans=0.125 +2024-01-15 18:30:47,065 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.attention_skip_rate, batch_count=67573.33333333333, ans=0.0 +2024-01-15 18:30:57,377 INFO [train.py:994] (1/2) Epoch 25, batch 50, loss[loss=0.1233, simple_loss=0.2008, pruned_loss=0.0229, over 24083.00 frames. ], tot_loss[loss=0.1563, simple_loss=0.2346, pruned_loss=0.03902, over 1093370.61 frames. ], batch size: 131, lr: 1.64e-02, grad_scale: 32.0 +2024-01-15 18:31:08,856 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass_mid.scale_min, batch_count=67640.0, ans=0.2 +2024-01-15 18:31:25,681 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.balancer2.prob, batch_count=67673.33333333333, ans=0.125 +2024-01-15 18:31:57,090 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.667e+02 2.049e+02 2.347e+02 2.774e+02 4.343e+02, threshold=4.695e+02, percent-clipped=1.0 +2024-01-15 18:31:59,459 INFO [train.py:994] (1/2) Epoch 25, batch 100, loss[loss=0.13, simple_loss=0.2026, pruned_loss=0.02873, over 23621.00 frames. ], tot_loss[loss=0.156, simple_loss=0.2343, pruned_loss=0.03886, over 1917201.25 frames. ], batch size: 119, lr: 1.64e-02, grad_scale: 32.0 +2024-01-15 18:32:26,109 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer1.prob, batch_count=67840.0, ans=0.125 +2024-01-15 18:32:49,144 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.whiten, num_groups=1, num_channels=384, metric=3.39 vs. limit=12.0 +2024-01-15 18:33:02,619 INFO [train.py:994] (1/2) Epoch 25, batch 150, loss[loss=0.1657, simple_loss=0.2405, pruned_loss=0.0455, over 24500.00 frames. ], tot_loss[loss=0.1566, simple_loss=0.2348, pruned_loss=0.03923, over 2556746.49 frames. 
], batch size: 243, lr: 1.64e-02, grad_scale: 32.0 +2024-01-15 18:33:23,056 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer2.min_abs, batch_count=67973.33333333333, ans=0.5 +2024-01-15 18:33:31,621 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.8.prob, batch_count=68006.66666666667, ans=0.125 +2024-01-15 18:33:39,345 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=68040.0, ans=0.0 +2024-01-15 18:34:02,707 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.662e+02 2.019e+02 2.286e+02 2.815e+02 4.282e+02, threshold=4.572e+02, percent-clipped=0.0 +2024-01-15 18:34:05,155 INFO [train.py:994] (1/2) Epoch 25, batch 200, loss[loss=0.1448, simple_loss=0.225, pruned_loss=0.03235, over 24348.00 frames. ], tot_loss[loss=0.1568, simple_loss=0.2349, pruned_loss=0.03935, over 3052663.29 frames. ], batch size: 298, lr: 1.64e-02, grad_scale: 32.0 +2024-01-15 18:34:19,592 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn2.whiten, num_groups=1, num_channels=192, metric=16.58 vs. limit=22.5 +2024-01-15 18:34:42,616 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer_na.min_abs, batch_count=68206.66666666667, ans=0.02 +2024-01-15 18:34:43,824 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.out_proj.dropout_p, batch_count=68206.66666666667, ans=0.1 +2024-01-15 18:34:44,948 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer1.prob, batch_count=68206.66666666667, ans=0.125 +2024-01-15 18:34:44,974 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=68206.66666666667, ans=0.125 +2024-01-15 18:34:48,557 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.balancer1.prob, batch_count=68206.66666666667, ans=0.125 +2024-01-15 18:35:04,061 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.skip_rate, batch_count=68240.0, ans=0.035 +2024-01-15 18:35:07,281 INFO [train.py:994] (1/2) Epoch 25, batch 250, loss[loss=0.1588, simple_loss=0.241, pruned_loss=0.03824, over 24415.00 frames. ], tot_loss[loss=0.157, simple_loss=0.2355, pruned_loss=0.03925, over 3450991.71 frames. 
], batch size: 250, lr: 1.64e-02, grad_scale: 32.0 +2024-01-15 18:35:07,540 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.out_combiner.scale_min, batch_count=68273.33333333333, ans=0.2 +2024-01-15 18:35:17,113 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=68273.33333333333, ans=0.1 +2024-01-15 18:35:22,564 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=68306.66666666667, ans=0.0 +2024-01-15 18:35:26,198 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=68306.66666666667, ans=0.125 +2024-01-15 18:35:57,445 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=68406.66666666667, ans=0.125 +2024-01-15 18:36:06,620 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.569e+02 1.897e+02 2.000e+02 2.240e+02 4.389e+02, threshold=4.000e+02, percent-clipped=0.0 +2024-01-15 18:36:09,503 INFO [train.py:994] (1/2) Epoch 25, batch 300, loss[loss=0.1472, simple_loss=0.2309, pruned_loss=0.03179, over 24201.00 frames. ], tot_loss[loss=0.1576, simple_loss=0.2361, pruned_loss=0.03956, over 3750365.77 frames. ], batch size: 140, lr: 1.63e-02, grad_scale: 32.0 +2024-01-15 18:36:28,467 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=68473.33333333333, ans=0.0 +2024-01-15 18:36:47,099 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff3_skip_rate, batch_count=68540.0, ans=0.0 +2024-01-15 18:36:51,598 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.8.prob, batch_count=68540.0, ans=0.125 +2024-01-15 18:37:00,139 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.hidden_balancer.prob, batch_count=68573.33333333333, ans=0.125 +2024-01-15 18:37:08,318 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=384, metric=13.77 vs. limit=22.5 +2024-01-15 18:37:09,178 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=68573.33333333333, ans=0.125 +2024-01-15 18:37:12,978 INFO [train.py:994] (1/2) Epoch 25, batch 350, loss[loss=0.1686, simple_loss=0.2514, pruned_loss=0.04288, over 23861.00 frames. ], tot_loss[loss=0.1573, simple_loss=0.2354, pruned_loss=0.03958, over 3972713.23 frames. 
], batch size: 328, lr: 1.63e-02, grad_scale: 16.0 +2024-01-15 18:37:18,065 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=68606.66666666667, ans=0.125 +2024-01-15 18:37:32,784 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer2.prob, batch_count=68640.0, ans=0.125 +2024-01-15 18:38:06,567 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.2.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 18:38:08,951 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer2.prob, batch_count=68740.0, ans=0.125 +2024-01-15 18:38:13,305 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.625e+02 1.840e+02 2.010e+02 2.261e+02 4.053e+02, threshold=4.019e+02, percent-clipped=1.0 +2024-01-15 18:38:14,550 INFO [train.py:994] (1/2) Epoch 25, batch 400, loss[loss=0.1626, simple_loss=0.2399, pruned_loss=0.04264, over 24464.00 frames. ], tot_loss[loss=0.1567, simple_loss=0.2351, pruned_loss=0.03916, over 4162789.49 frames. ], batch size: 170, lr: 1.63e-02, grad_scale: 32.0 +2024-01-15 18:38:20,627 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=68773.33333333333, ans=0.0 +2024-01-15 18:38:37,307 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=68806.66666666667, ans=0.125 +2024-01-15 18:38:45,355 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=68840.0, ans=0.0 +2024-01-15 18:38:46,653 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer1.prob, batch_count=68840.0, ans=0.125 +2024-01-15 18:38:52,027 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer2.prob, batch_count=68873.33333333333, ans=0.125 +2024-01-15 18:39:01,018 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.hidden_balancer.prob, batch_count=68873.33333333333, ans=0.125 +2024-01-15 18:39:08,439 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=512, metric=3.62 vs. limit=15.0 +2024-01-15 18:39:15,926 INFO [train.py:994] (1/2) Epoch 25, batch 450, loss[loss=0.1582, simple_loss=0.2364, pruned_loss=0.03994, over 24486.00 frames. ], tot_loss[loss=0.1569, simple_loss=0.2352, pruned_loss=0.03928, over 4301863.25 frames. ], batch size: 267, lr: 1.63e-02, grad_scale: 16.0 +2024-01-15 18:39:17,993 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=68940.0, ans=0.125 +2024-01-15 18:39:27,054 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass_mid.scale_min, batch_count=68940.0, ans=0.2 +2024-01-15 18:40:09,645 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.conv_module2.whiten, num_groups=1, num_channels=512, metric=14.82 vs. 
limit=15.0 +2024-01-15 18:40:18,566 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.611e+02 1.852e+02 2.038e+02 2.316e+02 3.412e+02, threshold=4.077e+02, percent-clipped=0.0 +2024-01-15 18:40:18,595 INFO [train.py:994] (1/2) Epoch 25, batch 500, loss[loss=0.1563, simple_loss=0.2347, pruned_loss=0.03895, over 24406.00 frames. ], tot_loss[loss=0.1565, simple_loss=0.2351, pruned_loss=0.03898, over 4419791.15 frames. ], batch size: 159, lr: 1.63e-02, grad_scale: 16.0 +2024-01-15 18:40:18,868 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=69106.66666666667, ans=0.1 +2024-01-15 18:40:20,149 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.5.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 18:40:27,355 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=384, metric=2.34 vs. limit=15.0 +2024-01-15 18:40:41,253 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass_mid.scale_min, batch_count=69140.0, ans=0.2 +2024-01-15 18:40:46,100 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=9.65 vs. limit=15.0 +2024-01-15 18:40:48,530 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=3.94 vs. limit=10.0 +2024-01-15 18:40:51,972 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=512, metric=2.18 vs. limit=15.0 +2024-01-15 18:40:52,748 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.balancer1.prob, batch_count=69173.33333333333, ans=0.125 +2024-01-15 18:40:57,973 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.balancer2.prob, batch_count=69206.66666666667, ans=0.125 +2024-01-15 18:41:19,569 INFO [train.py:994] (1/2) Epoch 25, batch 550, loss[loss=0.1748, simple_loss=0.2562, pruned_loss=0.04666, over 23896.00 frames. ], tot_loss[loss=0.1566, simple_loss=0.2352, pruned_loss=0.03895, over 4511989.89 frames. ], batch size: 328, lr: 1.63e-02, grad_scale: 8.0 +2024-01-15 18:41:35,231 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.balancer2.prob, batch_count=69306.66666666667, ans=0.125 +2024-01-15 18:41:57,822 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.2.prob, batch_count=69373.33333333333, ans=0.125 +2024-01-15 18:42:03,864 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=69373.33333333333, ans=0.125 +2024-01-15 18:42:13,410 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer2.prob, batch_count=69406.66666666667, ans=0.125 +2024-01-15 18:42:14,525 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=69406.66666666667, ans=0.0 +2024-01-15 18:42:22,461 INFO [train.py:994] (1/2) Epoch 25, batch 600, loss[loss=0.1472, simple_loss=0.2323, pruned_loss=0.03106, over 24504.00 frames. ], tot_loss[loss=0.1561, simple_loss=0.2349, pruned_loss=0.03859, over 4575511.78 frames. 
], batch size: 229, lr: 1.62e-02, grad_scale: 8.0 +2024-01-15 18:42:23,596 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.655e+02 1.874e+02 2.040e+02 2.282e+02 3.344e+02, threshold=4.081e+02, percent-clipped=0.0 +2024-01-15 18:42:39,133 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer1.prob, batch_count=69473.33333333333, ans=0.125 +2024-01-15 18:42:52,604 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.attention_skip_rate, batch_count=69506.66666666667, ans=0.0 +2024-01-15 18:43:12,977 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass_mid.scale_min, batch_count=69573.33333333333, ans=0.2 +2024-01-15 18:43:22,324 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=69606.66666666667, ans=0.1 +2024-01-15 18:43:23,165 INFO [train.py:994] (1/2) Epoch 25, batch 650, loss[loss=0.129, simple_loss=0.1894, pruned_loss=0.03429, over 18986.00 frames. ], tot_loss[loss=0.1553, simple_loss=0.2341, pruned_loss=0.03825, over 4620370.08 frames. ], batch size: 82, lr: 1.62e-02, grad_scale: 8.0 +2024-01-15 18:43:41,281 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff2_skip_rate, batch_count=69640.0, ans=0.0 +2024-01-15 18:43:47,176 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.attention_skip_rate, batch_count=69673.33333333333, ans=0.0 +2024-01-15 18:43:49,486 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=69673.33333333333, ans=0.0 +2024-01-15 18:43:55,780 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.whiten, num_groups=1, num_channels=512, metric=4.84 vs. limit=12.0 +2024-01-15 18:43:56,446 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward2.hidden_balancer.prob, batch_count=69673.33333333333, ans=0.125 +2024-01-15 18:43:58,879 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=16.35 vs. limit=22.5 +2024-01-15 18:44:03,452 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=69706.66666666667, ans=0.1 +2024-01-15 18:44:15,158 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer1.prob, batch_count=69740.0, ans=0.125 +2024-01-15 18:44:25,311 INFO [train.py:994] (1/2) Epoch 25, batch 700, loss[loss=0.1525, simple_loss=0.2304, pruned_loss=0.03732, over 24462.00 frames. ], tot_loss[loss=0.1558, simple_loss=0.2343, pruned_loss=0.03869, over 4659482.59 frames. 
], batch size: 222, lr: 1.62e-02, grad_scale: 8.0 +2024-01-15 18:44:26,460 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.644e+02 1.989e+02 2.242e+02 2.586e+02 3.902e+02, threshold=4.483e+02, percent-clipped=0.0 +2024-01-15 18:44:38,492 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=69806.66666666667, ans=0.125 +2024-01-15 18:44:47,824 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer2.prob, batch_count=69840.0, ans=0.125 +2024-01-15 18:45:14,896 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.hidden_balancer.prob, batch_count=69906.66666666667, ans=0.125 +2024-01-15 18:45:15,469 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=256, metric=20.29 vs. limit=22.5 +2024-01-15 18:45:24,909 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.skip_rate, batch_count=69906.66666666667, ans=0.07 +2024-01-15 18:45:25,002 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer2.prob, batch_count=69906.66666666667, ans=0.125 +2024-01-15 18:45:27,036 INFO [train.py:994] (1/2) Epoch 25, batch 750, loss[loss=0.1368, simple_loss=0.2099, pruned_loss=0.03189, over 23526.00 frames. ], tot_loss[loss=0.1555, simple_loss=0.2337, pruned_loss=0.0386, over 4667720.60 frames. ], batch size: 119, lr: 1.62e-02, grad_scale: 8.0 +2024-01-15 18:45:51,316 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer2.min_abs, batch_count=70006.66666666667, ans=0.5 +2024-01-15 18:46:07,536 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff2_skip_rate, batch_count=70040.0, ans=0.0 +2024-01-15 18:46:25,459 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=70073.33333333333, ans=0.1 +2024-01-15 18:46:27,545 INFO [train.py:994] (1/2) Epoch 25, batch 800, loss[loss=0.1588, simple_loss=0.2349, pruned_loss=0.04133, over 24341.00 frames. ], tot_loss[loss=0.1555, simple_loss=0.2338, pruned_loss=0.03859, over 4701018.66 frames. ], batch size: 153, lr: 1.62e-02, grad_scale: 16.0 +2024-01-15 18:46:28,646 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.641e+02 1.948e+02 2.174e+02 2.457e+02 3.780e+02, threshold=4.348e+02, percent-clipped=0.0 +2024-01-15 18:46:30,002 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=70106.66666666667, ans=0.125 +2024-01-15 18:46:36,857 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=8.30 vs. limit=15.0 +2024-01-15 18:46:56,240 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=70173.33333333333, ans=0.125 +2024-01-15 18:46:57,987 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn1.whiten, num_groups=1, num_channels=192, metric=12.70 vs. limit=22.5 +2024-01-15 18:47:07,620 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=16.68 vs. 
limit=22.5 +2024-01-15 18:47:10,036 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=7.22 vs. limit=10.0 +2024-01-15 18:47:38,870 INFO [train.py:994] (1/2) Epoch 26, batch 0, loss[loss=0.1574, simple_loss=0.2404, pruned_loss=0.03721, over 24326.00 frames. ], tot_loss[loss=0.1574, simple_loss=0.2404, pruned_loss=0.03721, over 24326.00 frames. ], batch size: 147, lr: 1.59e-02, grad_scale: 32.0 +2024-01-15 18:47:38,870 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 18:47:59,167 INFO [train.py:1026] (1/2) Epoch 26, validation: loss=0.1671, simple_loss=0.2515, pruned_loss=0.04137, over 1622729.00 frames. +2024-01-15 18:47:59,168 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 18:48:00,925 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=12.21 vs. limit=15.0 +2024-01-15 18:48:09,455 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.min_positive, batch_count=70250.0, ans=0.025 +2024-01-15 18:48:11,917 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=70283.33333333333, ans=0.125 +2024-01-15 18:48:13,139 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.prob, batch_count=70283.33333333333, ans=0.125 +2024-01-15 18:48:17,865 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer2.prob, batch_count=70283.33333333333, ans=0.125 +2024-01-15 18:48:29,860 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=70316.66666666667, ans=0.125 +2024-01-15 18:49:01,130 INFO [train.py:994] (1/2) Epoch 26, batch 50, loss[loss=0.1625, simple_loss=0.2414, pruned_loss=0.04185, over 24604.00 frames. ], tot_loss[loss=0.1549, simple_loss=0.2332, pruned_loss=0.03831, over 1085636.00 frames. ], batch size: 199, lr: 1.59e-02, grad_scale: 32.0 +2024-01-15 18:49:07,848 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=4.12 vs. limit=12.0 +2024-01-15 18:49:09,698 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer1.max_abs, batch_count=70416.66666666667, ans=10.0 +2024-01-15 18:49:09,712 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=70416.66666666667, ans=0.0 +2024-01-15 18:49:10,542 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.574e+02 1.886e+02 2.079e+02 2.344e+02 3.682e+02, threshold=4.157e+02, percent-clipped=0.0 +2024-01-15 18:49:41,148 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=70516.66666666667, ans=0.0 +2024-01-15 18:49:53,442 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer2.prob, batch_count=70550.0, ans=0.125 +2024-01-15 18:49:59,384 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.ff3_skip_rate, batch_count=70550.0, ans=0.0 +2024-01-15 18:50:02,554 INFO [train.py:994] (1/2) Epoch 26, batch 100, loss[loss=0.1514, simple_loss=0.2295, pruned_loss=0.03667, over 24426.00 frames. 
], tot_loss[loss=0.1553, simple_loss=0.2336, pruned_loss=0.03856, over 1909963.63 frames. ], batch size: 250, lr: 1.58e-02, grad_scale: 32.0 +2024-01-15 18:50:22,089 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.balancer.max_positive, batch_count=70616.66666666667, ans=0.95 +2024-01-15 18:50:31,008 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 18:50:32,170 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=70650.0, ans=0.125 +2024-01-15 18:50:34,987 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=384, metric=2.39 vs. limit=15.0 +2024-01-15 18:50:44,782 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=70683.33333333333, ans=0.125 +2024-01-15 18:51:05,736 INFO [train.py:994] (1/2) Epoch 26, batch 150, loss[loss=0.1592, simple_loss=0.2351, pruned_loss=0.04164, over 24346.00 frames. ], tot_loss[loss=0.1556, simple_loss=0.234, pruned_loss=0.03866, over 2551164.48 frames. ], batch size: 275, lr: 1.58e-02, grad_scale: 32.0 +2024-01-15 18:51:08,283 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer2.prob, batch_count=70750.0, ans=0.125 +2024-01-15 18:51:13,598 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.self_attn_weights.whiten_keys.whitening_limit, batch_count=70750.0, ans=6.0 +2024-01-15 18:51:15,097 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.572e+02 1.828e+02 2.027e+02 2.407e+02 4.770e+02, threshold=4.053e+02, percent-clipped=1.0 +2024-01-15 18:51:30,217 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=11.97 vs. limit=15.0 +2024-01-15 18:52:00,036 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=70883.33333333333, ans=0.0 +2024-01-15 18:52:06,870 INFO [train.py:994] (1/2) Epoch 26, batch 200, loss[loss=0.1536, simple_loss=0.2316, pruned_loss=0.03782, over 24521.00 frames. ], tot_loss[loss=0.1552, simple_loss=0.2331, pruned_loss=0.03861, over 3051014.13 frames. ], batch size: 165, lr: 1.58e-02, grad_scale: 32.0 +2024-01-15 18:52:09,534 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=70916.66666666667, ans=0.0 +2024-01-15 18:52:13,176 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=10.02 vs. limit=15.0 +2024-01-15 18:52:14,409 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=5.44 vs. limit=15.0 +2024-01-15 18:52:15,433 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=512, metric=19.84 vs. 
limit=22.5 +2024-01-15 18:52:49,656 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.balancer.prob, batch_count=71016.66666666667, ans=0.125 +2024-01-15 18:53:04,419 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.out_proj.dropout_p, batch_count=71050.0, ans=0.1 +2024-01-15 18:53:07,927 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=71083.33333333333, ans=0.0 +2024-01-15 18:53:08,805 INFO [train.py:994] (1/2) Epoch 26, batch 250, loss[loss=0.157, simple_loss=0.237, pruned_loss=0.03847, over 24208.00 frames. ], tot_loss[loss=0.1556, simple_loss=0.2341, pruned_loss=0.03854, over 3438462.75 frames. ], batch size: 311, lr: 1.58e-02, grad_scale: 32.0 +2024-01-15 18:53:11,461 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer2.prob, batch_count=71083.33333333333, ans=0.125 +2024-01-15 18:53:18,919 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.661e+02 1.897e+02 2.050e+02 2.398e+02 3.623e+02, threshold=4.101e+02, percent-clipped=0.0 +2024-01-15 18:53:43,697 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.nonlin_attention.balancer.min_positive, batch_count=71150.0, ans=0.05 +2024-01-15 18:54:07,274 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=71216.66666666667, ans=0.125 +2024-01-15 18:54:10,522 INFO [train.py:994] (1/2) Epoch 26, batch 300, loss[loss=0.1721, simple_loss=0.2457, pruned_loss=0.04921, over 24492.00 frames. ], tot_loss[loss=0.1556, simple_loss=0.2341, pruned_loss=0.03852, over 3748025.03 frames. ], batch size: 165, lr: 1.58e-02, grad_scale: 32.0 +2024-01-15 18:54:12,354 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.34 vs. limit=6.0 +2024-01-15 18:54:16,255 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=71250.0, ans=0.0 +2024-01-15 18:54:40,889 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=512, metric=2.76 vs. limit=15.0 +2024-01-15 18:54:58,355 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=71350.0, ans=0.0 +2024-01-15 18:55:00,762 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=71383.33333333333, ans=0.125 +2024-01-15 18:55:06,576 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=71383.33333333333, ans=0.0 +2024-01-15 18:55:09,643 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=71383.33333333333, ans=0.0 +2024-01-15 18:55:12,832 INFO [train.py:994] (1/2) Epoch 26, batch 350, loss[loss=0.1544, simple_loss=0.2323, pruned_loss=0.03824, over 24438.00 frames. ], tot_loss[loss=0.1554, simple_loss=0.234, pruned_loss=0.03841, over 3972847.60 frames. 
], batch size: 170, lr: 1.58e-02, grad_scale: 32.0 +2024-01-15 18:55:22,876 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.564e+02 1.903e+02 2.138e+02 2.406e+02 3.843e+02, threshold=4.276e+02, percent-clipped=0.0 +2024-01-15 18:56:06,337 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass_mid.scale_min, batch_count=71550.0, ans=0.2 +2024-01-15 18:56:14,676 INFO [train.py:994] (1/2) Epoch 26, batch 400, loss[loss=0.1677, simple_loss=0.2538, pruned_loss=0.04079, over 23833.00 frames. ], tot_loss[loss=0.1552, simple_loss=0.2335, pruned_loss=0.03842, over 4159123.59 frames. ], batch size: 328, lr: 1.57e-02, grad_scale: 32.0 +2024-01-15 18:56:16,131 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.hidden_balancer.prob, batch_count=71583.33333333333, ans=0.125 +2024-01-15 18:56:30,794 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=71616.66666666667, ans=0.0 +2024-01-15 18:56:34,240 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer1.prob, batch_count=71616.66666666667, ans=0.125 +2024-01-15 18:57:11,179 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass_mid.scale_min, batch_count=71716.66666666667, ans=0.2 +2024-01-15 18:57:15,696 INFO [train.py:994] (1/2) Epoch 26, batch 450, loss[loss=0.1578, simple_loss=0.2379, pruned_loss=0.0389, over 24486.00 frames. ], tot_loss[loss=0.1543, simple_loss=0.2329, pruned_loss=0.03792, over 4297364.24 frames. ], batch size: 216, lr: 1.57e-02, grad_scale: 32.0 +2024-01-15 18:57:25,858 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.515e+02 1.875e+02 2.085e+02 2.426e+02 4.105e+02, threshold=4.170e+02, percent-clipped=0.0 +2024-01-15 18:58:03,231 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=71850.0, ans=0.125 +2024-01-15 18:58:08,120 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward2.hidden_balancer.prob, batch_count=71883.33333333333, ans=0.125 +2024-01-15 18:58:18,245 INFO [train.py:994] (1/2) Epoch 26, batch 500, loss[loss=0.1572, simple_loss=0.2372, pruned_loss=0.03858, over 24500.00 frames. ], tot_loss[loss=0.1541, simple_loss=0.2327, pruned_loss=0.03773, over 4411105.65 frames. ], batch size: 187, lr: 1.57e-02, grad_scale: 32.0 +2024-01-15 18:59:11,115 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=3.13 vs. limit=12.0 +2024-01-15 18:59:11,530 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.55 vs. limit=6.0 +2024-01-15 18:59:20,813 INFO [train.py:994] (1/2) Epoch 26, batch 550, loss[loss=0.156, simple_loss=0.2391, pruned_loss=0.03638, over 23845.00 frames. ], tot_loss[loss=0.1541, simple_loss=0.2332, pruned_loss=0.03754, over 4507908.98 frames. ], batch size: 328, lr: 1.57e-02, grad_scale: 32.0 +2024-01-15 18:59:27,866 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=3.54 vs. 
limit=15.0 +2024-01-15 18:59:30,773 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.560e+02 1.885e+02 2.122e+02 2.518e+02 4.046e+02, threshold=4.244e+02, percent-clipped=0.0 +2024-01-15 18:59:38,805 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.conv_module1.whiten, num_groups=1, num_channels=192, metric=11.16 vs. limit=15.0 +2024-01-15 18:59:42,346 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=72116.66666666667, ans=0.0 +2024-01-15 18:59:47,148 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=72150.0, ans=0.125 +2024-01-15 18:59:53,584 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.attention_skip_rate, batch_count=72150.0, ans=0.0 +2024-01-15 19:00:16,123 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=72216.66666666667, ans=0.0 +2024-01-15 19:00:23,056 INFO [train.py:994] (1/2) Epoch 26, batch 600, loss[loss=0.1581, simple_loss=0.2441, pruned_loss=0.03602, over 24464.00 frames. ], tot_loss[loss=0.1543, simple_loss=0.2333, pruned_loss=0.03762, over 4561239.32 frames. ], batch size: 250, lr: 1.57e-02, grad_scale: 32.0 +2024-01-15 19:00:43,481 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=72283.33333333333, ans=0.125 +2024-01-15 19:00:56,492 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=72316.66666666667, ans=0.0 +2024-01-15 19:01:24,241 INFO [train.py:994] (1/2) Epoch 26, batch 650, loss[loss=0.1643, simple_loss=0.2452, pruned_loss=0.04167, over 24466.00 frames. ], tot_loss[loss=0.1546, simple_loss=0.2336, pruned_loss=0.03777, over 4621992.66 frames. ], batch size: 210, lr: 1.57e-02, grad_scale: 32.0 +2024-01-15 19:01:24,527 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.balancer.max_positive, batch_count=72416.66666666667, ans=0.95 +2024-01-15 19:01:24,806 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=15.38 vs. limit=15.0 +2024-01-15 19:01:27,881 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.self_attn1.whiten, num_groups=1, num_channels=512, metric=14.77 vs. limit=22.5 +2024-01-15 19:01:30,535 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff3_skip_rate, batch_count=72416.66666666667, ans=0.0 +2024-01-15 19:01:34,939 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.586e+02 1.987e+02 2.293e+02 2.786e+02 4.279e+02, threshold=4.586e+02, percent-clipped=3.0 +2024-01-15 19:01:40,060 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=72450.0, ans=0.0 +2024-01-15 19:02:06,020 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer2.prob, batch_count=72516.66666666667, ans=0.125 +2024-01-15 19:02:17,550 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=384, metric=2.68 vs. 
limit=15.0 +2024-01-15 19:02:21,557 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer_ff2.min_abs, batch_count=72550.0, ans=0.1 +2024-01-15 19:02:25,526 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=5.97 vs. limit=6.0 +2024-01-15 19:02:27,037 INFO [train.py:994] (1/2) Epoch 26, batch 700, loss[loss=0.1639, simple_loss=0.2439, pruned_loss=0.04189, over 24438.00 frames. ], tot_loss[loss=0.1544, simple_loss=0.2334, pruned_loss=0.03772, over 4667948.90 frames. ], batch size: 170, lr: 1.56e-02, grad_scale: 32.0 +2024-01-15 19:02:27,320 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer2.prob, batch_count=72583.33333333333, ans=0.125 +2024-01-15 19:02:29,727 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.scale_min, batch_count=72583.33333333333, ans=0.2 +2024-01-15 19:03:15,356 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.ff2_skip_rate, batch_count=72683.33333333333, ans=0.0 +2024-01-15 19:03:15,439 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.attention_skip_rate, batch_count=72683.33333333333, ans=0.0 +2024-01-15 19:03:15,485 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=72683.33333333333, ans=0.125 +2024-01-15 19:03:29,159 INFO [train.py:994] (1/2) Epoch 26, batch 750, loss[loss=0.1313, simple_loss=0.2065, pruned_loss=0.02809, over 23931.00 frames. ], tot_loss[loss=0.1537, simple_loss=0.2328, pruned_loss=0.03734, over 4700243.34 frames. ], batch size: 131, lr: 1.56e-02, grad_scale: 32.0 +2024-01-15 19:03:30,623 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer2.prob, batch_count=72750.0, ans=0.125 +2024-01-15 19:03:39,029 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=72750.0, ans=0.125 +2024-01-15 19:03:39,720 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.458e+02 1.874e+02 2.013e+02 2.241e+02 3.260e+02, threshold=4.026e+02, percent-clipped=0.0 +2024-01-15 19:03:45,063 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=6.12 vs. limit=15.0 +2024-01-15 19:03:52,179 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=14.81 vs. limit=15.0 +2024-01-15 19:03:53,312 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.whiten2.whitening_limit, batch_count=72816.66666666667, ans=15.0 +2024-01-15 19:04:09,116 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.attention_skip_rate, batch_count=72850.0, ans=0.0 +2024-01-15 19:04:22,762 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer1.prob, batch_count=72883.33333333333, ans=0.125 +2024-01-15 19:04:29,126 INFO [train.py:994] (1/2) Epoch 26, batch 800, loss[loss=0.1534, simple_loss=0.2346, pruned_loss=0.03607, over 24495.00 frames. ], tot_loss[loss=0.1536, simple_loss=0.2324, pruned_loss=0.03738, over 4708527.91 frames. 
], batch size: 222, lr: 1.56e-02, grad_scale: 32.0 +2024-01-15 19:04:32,687 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff3_skip_rate, batch_count=72916.66666666667, ans=0.0 +2024-01-15 19:04:36,485 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=8.51 vs. limit=10.0 +2024-01-15 19:04:44,465 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.out_combiner.scale_min, batch_count=72950.0, ans=0.2 +2024-01-15 19:05:01,727 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=72983.33333333333, ans=0.1 +2024-01-15 19:05:10,534 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=73016.66666666667, ans=0.125 +2024-01-15 19:05:11,895 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=2.84 vs. limit=10.0 +2024-01-15 19:05:42,610 INFO [train.py:994] (1/2) Epoch 27, batch 0, loss[loss=0.1557, simple_loss=0.2365, pruned_loss=0.03741, over 24468.00 frames. ], tot_loss[loss=0.1557, simple_loss=0.2365, pruned_loss=0.03741, over 24468.00 frames. ], batch size: 267, lr: 1.53e-02, grad_scale: 32.0 +2024-01-15 19:05:42,610 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 19:05:55,958 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.0.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.5362, 5.3115, 5.2351, 5.2178], device='cuda:1') +2024-01-15 19:06:02,088 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.1.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([4.8266, 4.1087, 4.5567, 3.9507], device='cuda:1') +2024-01-15 19:06:03,782 INFO [train.py:1026] (1/2) Epoch 27, validation: loss=0.1674, simple_loss=0.2515, pruned_loss=0.04165, over 1622729.00 frames. +2024-01-15 19:06:03,783 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 19:06:12,486 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=22.15 vs. limit=22.5 +2024-01-15 19:06:22,301 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.442e+02 1.897e+02 2.083e+02 2.532e+02 4.087e+02, threshold=4.166e+02, percent-clipped=1.0 +2024-01-15 19:06:39,397 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=73126.66666666667, ans=0.1 +2024-01-15 19:06:49,380 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.whiten.whitening_limit, batch_count=73160.0, ans=12.0 +2024-01-15 19:06:58,227 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.52 vs. limit=6.0 +2024-01-15 19:07:04,938 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=7.58 vs. limit=15.0 +2024-01-15 19:07:06,924 INFO [train.py:994] (1/2) Epoch 27, batch 50, loss[loss=0.1531, simple_loss=0.2264, pruned_loss=0.03988, over 24375.00 frames. ], tot_loss[loss=0.152, simple_loss=0.2304, pruned_loss=0.03685, over 1080004.58 frames. 
], batch size: 153, lr: 1.53e-02, grad_scale: 32.0 +2024-01-15 19:07:26,471 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_skip_rate, batch_count=73260.0, ans=0.0 +2024-01-15 19:07:48,727 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.self_attn_weights.pos_emb_skip_rate, batch_count=73326.66666666667, ans=0.0 +2024-01-15 19:07:53,844 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=5.00 vs. limit=6.0 +2024-01-15 19:07:57,250 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=4.03 vs. limit=10.0 +2024-01-15 19:07:59,892 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=15.10 vs. limit=22.5 +2024-01-15 19:08:10,276 INFO [train.py:994] (1/2) Epoch 27, batch 100, loss[loss=0.1569, simple_loss=0.2359, pruned_loss=0.03891, over 24504.00 frames. ], tot_loss[loss=0.153, simple_loss=0.2323, pruned_loss=0.03686, over 1915690.85 frames. ], batch size: 229, lr: 1.53e-02, grad_scale: 32.0 +2024-01-15 19:08:17,178 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward3.hidden_balancer.prob, batch_count=73393.33333333333, ans=0.125 +2024-01-15 19:08:28,686 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.568e+02 1.827e+02 2.038e+02 2.394e+02 3.395e+02, threshold=4.076e+02, percent-clipped=0.0 +2024-01-15 19:08:30,226 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer2.prob, batch_count=73426.66666666667, ans=0.125 +2024-01-15 19:08:43,558 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.ff3_skip_rate, batch_count=73460.0, ans=0.0 +2024-01-15 19:08:44,843 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=3.62 vs. limit=12.0 +2024-01-15 19:09:11,949 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=14.42 vs. limit=15.0 +2024-01-15 19:09:12,661 INFO [train.py:994] (1/2) Epoch 27, batch 150, loss[loss=0.1462, simple_loss=0.2269, pruned_loss=0.03269, over 24364.00 frames. ], tot_loss[loss=0.1528, simple_loss=0.2318, pruned_loss=0.03691, over 2561014.18 frames. ], batch size: 275, lr: 1.53e-02, grad_scale: 32.0 +2024-01-15 19:09:18,928 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer2.prob, batch_count=73560.0, ans=0.125 +2024-01-15 19:09:34,820 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.balancer1.prob, batch_count=73593.33333333333, ans=0.125 +2024-01-15 19:09:36,046 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=73626.66666666667, ans=0.125 +2024-01-15 19:09:39,019 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=4.65 vs. limit=10.0 +2024-01-15 19:09:39,969 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.conv_module2.whiten, num_groups=1, num_channels=512, metric=8.83 vs. 
limit=15.0 +2024-01-15 19:09:46,711 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.ff2_skip_rate, batch_count=73626.66666666667, ans=0.0 +2024-01-15 19:09:52,855 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.attention_skip_rate, batch_count=73660.0, ans=0.0 +2024-01-15 19:09:55,819 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=73660.0, ans=0.125 +2024-01-15 19:09:58,791 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass_mid.scale_min, batch_count=73660.0, ans=0.2 +2024-01-15 19:10:02,242 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=73693.33333333333, ans=0.0 +2024-01-15 19:10:06,139 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=384, metric=15.46 vs. limit=22.5 +2024-01-15 19:10:06,410 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=5.22 vs. limit=6.0 +2024-01-15 19:10:13,764 INFO [train.py:994] (1/2) Epoch 27, batch 200, loss[loss=0.1511, simple_loss=0.2321, pruned_loss=0.03511, over 24540.00 frames. ], tot_loss[loss=0.1533, simple_loss=0.2325, pruned_loss=0.0371, over 3060762.98 frames. ], batch size: 236, lr: 1.53e-02, grad_scale: 32.0 +2024-01-15 19:10:29,699 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=73760.0, ans=0.125 +2024-01-15 19:10:32,944 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.480e+02 1.764e+02 1.943e+02 2.218e+02 3.609e+02, threshold=3.886e+02, percent-clipped=0.0 +2024-01-15 19:10:38,403 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=384, metric=20.19 vs. limit=22.5 +2024-01-15 19:10:55,687 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.max_abs, batch_count=73826.66666666667, ans=10.0 +2024-01-15 19:11:16,887 INFO [train.py:994] (1/2) Epoch 27, batch 250, loss[loss=0.1241, simple_loss=0.1838, pruned_loss=0.03221, over 17661.00 frames. ], tot_loss[loss=0.1536, simple_loss=0.2329, pruned_loss=0.03715, over 3451294.52 frames. ], batch size: 76, lr: 1.53e-02, grad_scale: 32.0 +2024-01-15 19:11:46,302 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=3.43 vs. limit=15.0 +2024-01-15 19:11:50,853 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_skip_rate, batch_count=73960.0, ans=0.0 +2024-01-15 19:12:03,374 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward3.hidden_balancer.prob, batch_count=73993.33333333333, ans=0.125 +2024-01-15 19:12:06,800 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.nonlin_attention.balancer.max_positive, batch_count=74026.66666666667, ans=0.95 +2024-01-15 19:12:19,120 INFO [train.py:994] (1/2) Epoch 27, batch 300, loss[loss=0.1508, simple_loss=0.2337, pruned_loss=0.03393, over 24499.00 frames. ], tot_loss[loss=0.1532, simple_loss=0.2324, pruned_loss=0.037, over 3753348.92 frames. 
], batch size: 229, lr: 1.52e-02, grad_scale: 32.0 +2024-01-15 19:12:38,390 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.487e+02 1.848e+02 2.123e+02 2.511e+02 4.010e+02, threshold=4.247e+02, percent-clipped=2.0 +2024-01-15 19:12:43,481 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer1.prob, batch_count=74126.66666666667, ans=0.125 +2024-01-15 19:12:48,122 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=74126.66666666667, ans=0.1 +2024-01-15 19:12:52,861 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.nonlin_attention.balancer.prob, batch_count=74126.66666666667, ans=0.125 +2024-01-15 19:13:06,460 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=15.50 vs. limit=15.0 +2024-01-15 19:13:21,950 INFO [train.py:994] (1/2) Epoch 27, batch 350, loss[loss=0.154, simple_loss=0.2374, pruned_loss=0.0353, over 24429.00 frames. ], tot_loss[loss=0.1531, simple_loss=0.2323, pruned_loss=0.03692, over 3991599.79 frames. ], batch size: 258, lr: 1.52e-02, grad_scale: 32.0 +2024-01-15 19:13:22,314 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.nonlin_attention.balancer.max_positive, batch_count=74226.66666666667, ans=0.95 +2024-01-15 19:13:23,661 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=384, metric=13.93 vs. limit=22.5 +2024-01-15 19:13:40,571 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer2.prob, batch_count=74260.0, ans=0.125 +2024-01-15 19:13:44,740 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=74260.0, ans=0.125 +2024-01-15 19:13:45,828 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.bypass_mid.scale_min, batch_count=74293.33333333333, ans=0.2 +2024-01-15 19:13:46,922 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=74293.33333333333, ans=0.1 +2024-01-15 19:13:52,934 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer2.min_positive, batch_count=74293.33333333333, ans=0.05 +2024-01-15 19:14:06,372 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer1.prob, batch_count=74326.66666666667, ans=0.125 +2024-01-15 19:14:22,295 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.max_abs, batch_count=74393.33333333333, ans=10.0 +2024-01-15 19:14:23,095 INFO [train.py:994] (1/2) Epoch 27, batch 400, loss[loss=0.1645, simple_loss=0.2441, pruned_loss=0.04247, over 24488.00 frames. ], tot_loss[loss=0.1532, simple_loss=0.2325, pruned_loss=0.03692, over 4175311.99 frames. 
], batch size: 210, lr: 1.52e-02, grad_scale: 32.0 +2024-01-15 19:14:34,546 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer1.prob, batch_count=74393.33333333333, ans=0.125 +2024-01-15 19:14:43,691 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.495e+02 1.874e+02 2.027e+02 2.380e+02 3.576e+02, threshold=4.054e+02, percent-clipped=0.0 +2024-01-15 19:14:44,382 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=4.18 vs. limit=15.0 +2024-01-15 19:14:55,349 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer1.prob, batch_count=74460.0, ans=0.125 +2024-01-15 19:14:59,658 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.ff3_skip_rate, batch_count=74460.0, ans=0.0 +2024-01-15 19:15:08,500 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.whiten, num_groups=1, num_channels=192, metric=4.33 vs. limit=12.0 +2024-01-15 19:15:26,518 INFO [train.py:994] (1/2) Epoch 27, batch 450, loss[loss=0.1516, simple_loss=0.2317, pruned_loss=0.03572, over 24245.00 frames. ], tot_loss[loss=0.1532, simple_loss=0.2326, pruned_loss=0.03691, over 4305518.29 frames. ], batch size: 140, lr: 1.52e-02, grad_scale: 16.0 +2024-01-15 19:15:43,840 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=74593.33333333333, ans=0.0 +2024-01-15 19:15:49,352 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer2.prob, batch_count=74593.33333333333, ans=0.125 +2024-01-15 19:16:07,446 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.prob, batch_count=74660.0, ans=0.125 +2024-01-15 19:16:10,557 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=74660.0, ans=0.1 +2024-01-15 19:16:18,183 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward2.hidden_balancer.prob, batch_count=74693.33333333333, ans=0.125 +2024-01-15 19:16:28,361 INFO [train.py:994] (1/2) Epoch 27, batch 500, loss[loss=0.1377, simple_loss=0.2195, pruned_loss=0.02799, over 24344.00 frames. ], tot_loss[loss=0.153, simple_loss=0.2324, pruned_loss=0.03678, over 4431634.59 frames. ], batch size: 147, lr: 1.52e-02, grad_scale: 16.0 +2024-01-15 19:16:33,422 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=384, metric=13.30 vs. limit=22.5 +2024-01-15 19:16:42,843 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=512, metric=16.06 vs. 
limit=22.5 +2024-01-15 19:16:49,344 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.619e+02 1.903e+02 2.137e+02 2.532e+02 4.096e+02, threshold=4.275e+02, percent-clipped=1.0 +2024-01-15 19:16:49,708 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer1.prob, batch_count=74760.0, ans=0.125 +2024-01-15 19:16:59,085 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=74793.33333333333, ans=0.0 +2024-01-15 19:17:10,695 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer1.prob, batch_count=74826.66666666667, ans=0.125 +2024-01-15 19:17:17,781 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer1.prob, batch_count=74860.0, ans=0.125 +2024-01-15 19:17:29,624 INFO [train.py:994] (1/2) Epoch 27, batch 550, loss[loss=0.1538, simple_loss=0.2392, pruned_loss=0.03415, over 23912.00 frames. ], tot_loss[loss=0.1524, simple_loss=0.2314, pruned_loss=0.03667, over 4503616.82 frames. ], batch size: 328, lr: 1.52e-02, grad_scale: 16.0 +2024-01-15 19:17:37,603 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.bypass_mid.scale_min, batch_count=74893.33333333333, ans=0.2 +2024-01-15 19:17:54,145 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=74960.0, ans=0.125 +2024-01-15 19:18:25,231 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=6.01 vs. limit=10.0 +2024-01-15 19:18:27,367 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.out_combiner.scale_min, batch_count=75026.66666666667, ans=0.2 +2024-01-15 19:18:31,731 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.conv_module1.whiten, num_groups=1, num_channels=192, metric=9.01 vs. limit=15.0 +2024-01-15 19:18:31,957 INFO [train.py:994] (1/2) Epoch 27, batch 600, loss[loss=0.151, simple_loss=0.2292, pruned_loss=0.03641, over 24483.00 frames. ], tot_loss[loss=0.1525, simple_loss=0.2316, pruned_loss=0.03667, over 4560805.57 frames. ], batch size: 170, lr: 1.52e-02, grad_scale: 16.0 +2024-01-15 19:18:37,989 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass_mid.scale_min, batch_count=75060.0, ans=0.2 +2024-01-15 19:18:53,254 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.506e+02 1.823e+02 2.003e+02 2.203e+02 3.481e+02, threshold=4.005e+02, percent-clipped=0.0 +2024-01-15 19:18:57,289 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward3.hidden_balancer.prob, batch_count=75126.66666666667, ans=0.125 +2024-01-15 19:18:59,678 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=75126.66666666667, ans=0.125 +2024-01-15 19:19:33,523 INFO [train.py:994] (1/2) Epoch 27, batch 650, loss[loss=0.1474, simple_loss=0.2288, pruned_loss=0.03298, over 24490.00 frames. ], tot_loss[loss=0.1523, simple_loss=0.2317, pruned_loss=0.03648, over 4619740.29 frames. 
], batch size: 216, lr: 1.51e-02, grad_scale: 8.0 +2024-01-15 19:19:38,003 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer2.prob, batch_count=75226.66666666667, ans=0.125 +2024-01-15 19:19:45,972 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=384, metric=3.76 vs. limit=15.0 +2024-01-15 19:20:01,619 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.balancer1.prob, batch_count=75293.33333333333, ans=0.125 +2024-01-15 19:20:08,124 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=75293.33333333333, ans=0.125 +2024-01-15 19:20:09,797 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn2.whiten, num_groups=1, num_channels=192, metric=15.98 vs. limit=22.5 +2024-01-15 19:20:11,638 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer1.min_positive, batch_count=75326.66666666667, ans=0.025 +2024-01-15 19:20:20,083 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.skip_rate, batch_count=75326.66666666667, ans=0.07 +2024-01-15 19:20:36,196 INFO [train.py:994] (1/2) Epoch 27, batch 700, loss[loss=0.1607, simple_loss=0.2352, pruned_loss=0.04315, over 24502.00 frames. ], tot_loss[loss=0.1526, simple_loss=0.232, pruned_loss=0.03657, over 4661328.31 frames. ], batch size: 165, lr: 1.51e-02, grad_scale: 8.0 +2024-01-15 19:20:56,280 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=75426.66666666667, ans=0.125 +2024-01-15 19:20:58,457 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.538e+02 1.855e+02 2.007e+02 2.333e+02 4.096e+02, threshold=4.014e+02, percent-clipped=1.0 +2024-01-15 19:21:08,989 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=10.05 vs. limit=15.0 +2024-01-15 19:21:24,667 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=75526.66666666667, ans=0.0 +2024-01-15 19:21:37,758 INFO [train.py:994] (1/2) Epoch 27, batch 750, loss[loss=0.1575, simple_loss=0.2335, pruned_loss=0.04076, over 24356.00 frames. ], tot_loss[loss=0.1524, simple_loss=0.2319, pruned_loss=0.03643, over 4703446.56 frames. ], batch size: 153, lr: 1.51e-02, grad_scale: 4.0 +2024-01-15 19:22:04,114 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer1.prob, batch_count=75626.66666666667, ans=0.125 +2024-01-15 19:22:09,472 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer2.min_positive, batch_count=75626.66666666667, ans=0.05 +2024-01-15 19:22:22,737 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=384, metric=2.73 vs. limit=15.0 +2024-01-15 19:22:37,645 INFO [train.py:994] (1/2) Epoch 27, batch 800, loss[loss=0.1566, simple_loss=0.2373, pruned_loss=0.03795, over 24521.00 frames. ], tot_loss[loss=0.152, simple_loss=0.2315, pruned_loss=0.03622, over 4734706.69 frames. 
], batch size: 236, lr: 1.51e-02, grad_scale: 8.0 +2024-01-15 19:22:57,209 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer2.prob, batch_count=75760.0, ans=0.125 +2024-01-15 19:22:59,179 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.516e+02 1.822e+02 1.929e+02 2.256e+02 3.473e+02, threshold=3.858e+02, percent-clipped=0.0 +2024-01-15 19:23:02,782 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.prob, batch_count=75793.33333333333, ans=0.125 +2024-01-15 19:23:06,204 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer2.prob, batch_count=75793.33333333333, ans=0.125 +2024-01-15 19:23:14,307 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=75826.66666666667, ans=0.1 +2024-01-15 19:23:48,346 INFO [train.py:994] (1/2) Epoch 28, batch 0, loss[loss=0.163, simple_loss=0.2407, pruned_loss=0.04264, over 24550.00 frames. ], tot_loss[loss=0.163, simple_loss=0.2407, pruned_loss=0.04264, over 24550.00 frames. ], batch size: 204, lr: 1.48e-02, grad_scale: 16.0 +2024-01-15 19:23:48,347 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 19:24:09,003 INFO [train.py:1026] (1/2) Epoch 28, validation: loss=0.1682, simple_loss=0.2519, pruned_loss=0.04225, over 1622729.00 frames. +2024-01-15 19:24:09,005 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 19:24:28,942 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.out_proj.dropout_p, batch_count=75903.33333333333, ans=0.1 +2024-01-15 19:24:37,958 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer2.prob, batch_count=75936.66666666667, ans=0.125 +2024-01-15 19:24:46,543 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=3.38 vs. limit=15.0 +2024-01-15 19:25:10,720 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=2.70 vs. limit=15.0 +2024-01-15 19:25:11,346 INFO [train.py:994] (1/2) Epoch 28, batch 50, loss[loss=0.1494, simple_loss=0.2291, pruned_loss=0.03489, over 24464.00 frames. ], tot_loss[loss=0.151, simple_loss=0.2302, pruned_loss=0.03588, over 1087175.93 frames. ], batch size: 250, lr: 1.48e-02, grad_scale: 16.0 +2024-01-15 19:25:38,714 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=76103.33333333333, ans=0.0 +2024-01-15 19:25:43,831 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.582e+02 1.909e+02 2.094e+02 2.541e+02 4.142e+02, threshold=4.189e+02, percent-clipped=2.0 +2024-01-15 19:26:00,033 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward2.hidden_balancer.prob, batch_count=76170.0, ans=0.125 +2024-01-15 19:26:02,704 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=384, metric=7.47 vs. limit=15.0 +2024-01-15 19:26:13,251 INFO [train.py:994] (1/2) Epoch 28, batch 100, loss[loss=0.1588, simple_loss=0.234, pruned_loss=0.04178, over 24337.00 frames. ], tot_loss[loss=0.1488, simple_loss=0.2278, pruned_loss=0.03486, over 1908754.10 frames. 
], batch size: 153, lr: 1.48e-02, grad_scale: 16.0 +2024-01-15 19:26:13,561 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer2.prob, batch_count=76203.33333333333, ans=0.125 +2024-01-15 19:26:23,728 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=76203.33333333333, ans=0.0 +2024-01-15 19:26:38,439 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.2.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 19:26:43,138 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=76270.0, ans=0.1 +2024-01-15 19:26:54,436 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=76303.33333333333, ans=0.1 +2024-01-15 19:27:14,587 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer1.prob, batch_count=76370.0, ans=0.125 +2024-01-15 19:27:15,447 INFO [train.py:994] (1/2) Epoch 28, batch 150, loss[loss=0.1567, simple_loss=0.2284, pruned_loss=0.04244, over 24406.00 frames. ], tot_loss[loss=0.1494, simple_loss=0.229, pruned_loss=0.03492, over 2560559.54 frames. ], batch size: 159, lr: 1.48e-02, grad_scale: 16.0 +2024-01-15 19:27:19,767 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=4.76 vs. limit=10.0 +2024-01-15 19:27:43,287 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer1.prob, batch_count=76436.66666666667, ans=0.125 +2024-01-15 19:27:43,500 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=17.08 vs. limit=22.5 +2024-01-15 19:27:48,902 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.474e+02 1.798e+02 1.936e+02 2.219e+02 3.188e+02, threshold=3.872e+02, percent-clipped=0.0 +2024-01-15 19:27:59,295 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff3_skip_rate, batch_count=76470.0, ans=0.0 +2024-01-15 19:28:00,811 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=384, metric=3.66 vs. limit=15.0 +2024-01-15 19:28:16,958 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer2.prob, batch_count=76536.66666666667, ans=0.125 +2024-01-15 19:28:17,848 INFO [train.py:994] (1/2) Epoch 28, batch 200, loss[loss=0.1602, simple_loss=0.2404, pruned_loss=0.03999, over 24533.00 frames. ], tot_loss[loss=0.1497, simple_loss=0.2292, pruned_loss=0.03514, over 3053956.55 frames. ], batch size: 176, lr: 1.48e-02, grad_scale: 8.0 +2024-01-15 19:29:04,391 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=76636.66666666667, ans=0.1 +2024-01-15 19:29:09,014 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=9.45 vs. 
limit=15.0 +2024-01-15 19:29:13,371 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff3_skip_rate, batch_count=76670.0, ans=0.0 +2024-01-15 19:29:19,036 INFO [train.py:994] (1/2) Epoch 28, batch 250, loss[loss=0.1497, simple_loss=0.2289, pruned_loss=0.03521, over 24483.00 frames. ], tot_loss[loss=0.1504, simple_loss=0.2301, pruned_loss=0.03537, over 3454688.90 frames. ], batch size: 216, lr: 1.48e-02, grad_scale: 8.0 +2024-01-15 19:29:31,202 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer2.prob, batch_count=76736.66666666667, ans=0.125 +2024-01-15 19:29:31,206 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.scale_min, batch_count=76736.66666666667, ans=0.2 +2024-01-15 19:29:52,416 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.566e+02 1.863e+02 2.047e+02 2.419e+02 3.522e+02, threshold=4.093e+02, percent-clipped=0.0 +2024-01-15 19:29:59,417 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=76803.33333333333, ans=0.0 +2024-01-15 19:30:12,314 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=76836.66666666667, ans=0.1 +2024-01-15 19:30:12,354 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=76836.66666666667, ans=0.125 +2024-01-15 19:30:21,644 INFO [train.py:994] (1/2) Epoch 28, batch 300, loss[loss=0.1537, simple_loss=0.2425, pruned_loss=0.03249, over 24214.00 frames. ], tot_loss[loss=0.1506, simple_loss=0.2304, pruned_loss=0.03543, over 3754186.08 frames. ], batch size: 311, lr: 1.47e-02, grad_scale: 8.0 +2024-01-15 19:30:22,005 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=76870.0, ans=0.0 +2024-01-15 19:30:25,490 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer2.prob, batch_count=76870.0, ans=0.125 +2024-01-15 19:30:26,619 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer2.prob, batch_count=76870.0, ans=0.125 +2024-01-15 19:30:37,056 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=76903.33333333333, ans=0.0 +2024-01-15 19:30:42,422 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_skip_rate, batch_count=76903.33333333333, ans=0.0 +2024-01-15 19:30:44,272 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=76903.33333333333, ans=0.1 +2024-01-15 19:30:51,703 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=3.07 vs. limit=15.0 +2024-01-15 19:31:03,733 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=5.59 vs. 
limit=6.0 +2024-01-15 19:31:16,465 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=77003.33333333333, ans=0.125 +2024-01-15 19:31:23,236 INFO [train.py:994] (1/2) Epoch 28, batch 350, loss[loss=0.1308, simple_loss=0.2113, pruned_loss=0.02514, over 23957.00 frames. ], tot_loss[loss=0.1508, simple_loss=0.2306, pruned_loss=0.03553, over 3996071.11 frames. ], batch size: 131, lr: 1.47e-02, grad_scale: 8.0 +2024-01-15 19:31:24,621 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.out_balancer.prob, batch_count=77036.66666666667, ans=0.125 +2024-01-15 19:31:36,044 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.attention_skip_rate, batch_count=77070.0, ans=0.0 +2024-01-15 19:31:45,337 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.whiten, num_groups=1, num_channels=512, metric=5.67 vs. limit=12.0 +2024-01-15 19:31:46,153 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=77070.0, ans=0.0 +2024-01-15 19:31:54,299 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=77103.33333333333, ans=0.1 +2024-01-15 19:31:56,320 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.491e+02 1.863e+02 2.087e+02 2.688e+02 4.509e+02, threshold=4.173e+02, percent-clipped=2.0 +2024-01-15 19:31:59,701 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=77136.66666666667, ans=0.0 +2024-01-15 19:32:17,244 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=10.79 vs. limit=15.0 +2024-01-15 19:32:17,294 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=8.74 vs. limit=10.0 +2024-01-15 19:32:24,687 INFO [train.py:994] (1/2) Epoch 28, batch 400, loss[loss=0.1575, simple_loss=0.2397, pruned_loss=0.0377, over 24616.00 frames. ], tot_loss[loss=0.1511, simple_loss=0.2307, pruned_loss=0.03572, over 4180892.39 frames. ], batch size: 199, lr: 1.47e-02, grad_scale: 16.0 +2024-01-15 19:32:36,892 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.attention_skip_rate, batch_count=77236.66666666667, ans=0.0 +2024-01-15 19:32:55,142 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=14.57 vs. 
limit=15.0 +2024-01-15 19:32:59,365 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=77270.0, ans=0.0 +2024-01-15 19:33:01,771 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer1.prob, batch_count=77303.33333333333, ans=0.125 +2024-01-15 19:33:01,814 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.balancer1.prob, batch_count=77303.33333333333, ans=0.125 +2024-01-15 19:33:12,344 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=77303.33333333333, ans=0.125 +2024-01-15 19:33:27,954 INFO [train.py:994] (1/2) Epoch 28, batch 450, loss[loss=0.1557, simple_loss=0.2343, pruned_loss=0.03851, over 24548.00 frames. ], tot_loss[loss=0.1513, simple_loss=0.2308, pruned_loss=0.03589, over 4323757.13 frames. ], batch size: 193, lr: 1.47e-02, grad_scale: 16.0 +2024-01-15 19:33:35,291 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=77370.0, ans=0.0 +2024-01-15 19:33:37,780 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.balancer1.prob, batch_count=77370.0, ans=0.125 +2024-01-15 19:33:58,176 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer2.prob, batch_count=77436.66666666667, ans=0.125 +2024-01-15 19:33:58,183 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass_mid.scale_min, batch_count=77436.66666666667, ans=0.2 +2024-01-15 19:34:01,316 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.397e+02 1.800e+02 2.039e+02 2.585e+02 3.637e+02, threshold=4.077e+02, percent-clipped=0.0 +2024-01-15 19:34:06,894 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=12.78 vs. limit=15.0 +2024-01-15 19:34:13,652 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff3_skip_rate, batch_count=77470.0, ans=0.0 +2024-01-15 19:34:17,391 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer2.prob, batch_count=77503.33333333333, ans=0.125 +2024-01-15 19:34:24,422 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.scale_min, batch_count=77503.33333333333, ans=0.2 +2024-01-15 19:34:30,181 INFO [train.py:994] (1/2) Epoch 28, batch 500, loss[loss=0.1498, simple_loss=0.2372, pruned_loss=0.03121, over 23843.00 frames. ], tot_loss[loss=0.1512, simple_loss=0.2306, pruned_loss=0.03595, over 4428050.18 frames. ], batch size: 328, lr: 1.47e-02, grad_scale: 16.0 +2024-01-15 19:34:35,791 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.balancer_ff2.min_abs, batch_count=77536.66666666667, ans=0.1 +2024-01-15 19:34:41,614 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.skip_rate, batch_count=77570.0, ans=0.09899494936611666 +2024-01-15 19:35:04,148 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=77603.33333333333, ans=0.125 +2024-01-15 19:35:32,512 INFO [train.py:994] (1/2) Epoch 28, batch 550, loss[loss=0.1587, simple_loss=0.2339, pruned_loss=0.04181, over 24594.00 frames. 
], tot_loss[loss=0.1512, simple_loss=0.2306, pruned_loss=0.03594, over 4519447.75 frames. ], batch size: 199, lr: 1.47e-02, grad_scale: 16.0 +2024-01-15 19:35:43,862 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.self_attn2.whiten, num_groups=1, num_channels=512, metric=14.41 vs. limit=22.5 +2024-01-15 19:36:05,808 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.512e+02 1.862e+02 2.099e+02 2.436e+02 4.478e+02, threshold=4.198e+02, percent-clipped=3.0 +2024-01-15 19:36:16,061 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.prob, batch_count=77803.33333333333, ans=0.125 +2024-01-15 19:36:26,638 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=10.61 vs. limit=15.0 +2024-01-15 19:36:33,447 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.balancer1.prob, batch_count=77870.0, ans=0.125 +2024-01-15 19:36:34,266 INFO [train.py:994] (1/2) Epoch 28, batch 600, loss[loss=0.1472, simple_loss=0.2337, pruned_loss=0.03037, over 23882.00 frames. ], tot_loss[loss=0.1509, simple_loss=0.2306, pruned_loss=0.03562, over 4594971.22 frames. ], batch size: 328, lr: 1.47e-02, grad_scale: 16.0 +2024-01-15 19:36:51,326 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=77903.33333333333, ans=0.1 +2024-01-15 19:36:59,676 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=77936.66666666667, ans=0.125 +2024-01-15 19:37:09,824 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer1.prob, batch_count=77936.66666666667, ans=0.125 +2024-01-15 19:37:17,880 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=5.19 vs. limit=6.0 +2024-01-15 19:37:31,751 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.prob, batch_count=78003.33333333333, ans=0.125 +2024-01-15 19:37:36,711 INFO [train.py:994] (1/2) Epoch 28, batch 650, loss[loss=0.1599, simple_loss=0.2467, pruned_loss=0.03659, over 23846.00 frames. ], tot_loss[loss=0.1513, simple_loss=0.2307, pruned_loss=0.03594, over 4642190.63 frames. ], batch size: 328, lr: 1.46e-02, grad_scale: 16.0 +2024-01-15 19:37:59,190 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=11.81 vs. limit=15.0 +2024-01-15 19:37:59,827 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=78070.0, ans=0.125 +2024-01-15 19:38:10,581 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.562e+02 2.024e+02 2.393e+02 2.788e+02 5.477e+02, threshold=4.785e+02, percent-clipped=2.0 +2024-01-15 19:38:21,462 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer2.min_positive, batch_count=78136.66666666667, ans=0.05 +2024-01-15 19:38:24,923 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=8.44 vs. 
limit=10.0 +2024-01-15 19:38:39,082 INFO [train.py:994] (1/2) Epoch 28, batch 700, loss[loss=0.1537, simple_loss=0.236, pruned_loss=0.03576, over 24485.00 frames. ], tot_loss[loss=0.1513, simple_loss=0.2306, pruned_loss=0.03596, over 4669166.72 frames. ], batch size: 210, lr: 1.46e-02, grad_scale: 16.0 +2024-01-15 19:38:43,514 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=14.31 vs. limit=22.5 +2024-01-15 19:39:03,145 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.prob, batch_count=78270.0, ans=0.125 +2024-01-15 19:39:05,587 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.skip_rate, batch_count=78270.0, ans=0.09899494936611666 +2024-01-15 19:39:15,098 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=78303.33333333333, ans=0.1 +2024-01-15 19:39:26,462 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=78303.33333333333, ans=0.125 +2024-01-15 19:39:35,243 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.prob, batch_count=78336.66666666667, ans=0.125 +2024-01-15 19:39:40,866 INFO [train.py:994] (1/2) Epoch 28, batch 750, loss[loss=0.1568, simple_loss=0.2427, pruned_loss=0.03545, over 22328.00 frames. ], tot_loss[loss=0.15, simple_loss=0.2292, pruned_loss=0.03536, over 4672587.69 frames. ], batch size: 357, lr: 1.46e-02, grad_scale: 16.0 +2024-01-15 19:39:41,633 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=13.79 vs. limit=15.0 +2024-01-15 19:39:44,292 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass_mid.scale_min, batch_count=78370.0, ans=0.2 +2024-01-15 19:40:02,704 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 19:40:12,726 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer2.min_abs, batch_count=78436.66666666667, ans=0.5 +2024-01-15 19:40:14,771 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.479e+02 1.835e+02 2.101e+02 2.568e+02 3.560e+02, threshold=4.202e+02, percent-clipped=0.0 +2024-01-15 19:40:17,927 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=5.02 vs. limit=6.0 +2024-01-15 19:40:22,545 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.hidden_balancer.prob, batch_count=78470.0, ans=0.125 +2024-01-15 19:40:23,713 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=78470.0, ans=0.125 +2024-01-15 19:40:33,787 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.prob, batch_count=78503.33333333333, ans=0.125 +2024-01-15 19:40:41,198 INFO [train.py:994] (1/2) Epoch 28, batch 800, loss[loss=0.1572, simple_loss=0.241, pruned_loss=0.03669, over 24475.00 frames. ], tot_loss[loss=0.1497, simple_loss=0.2293, pruned_loss=0.03507, over 4703024.86 frames. 
], batch size: 222, lr: 1.46e-02, grad_scale: 32.0 +2024-01-15 19:40:48,029 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass_mid.scale_min, batch_count=78536.66666666667, ans=0.2 +2024-01-15 19:40:57,863 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=11.01 vs. limit=15.0 +2024-01-15 19:41:19,490 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass.skip_rate, batch_count=78636.66666666667, ans=0.09899494936611666 +2024-01-15 19:41:20,669 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer2.min_abs, batch_count=78636.66666666667, ans=0.5 +2024-01-15 19:41:52,742 INFO [train.py:994] (1/2) Epoch 29, batch 0, loss[loss=0.1637, simple_loss=0.2362, pruned_loss=0.04566, over 24378.00 frames. ], tot_loss[loss=0.1637, simple_loss=0.2362, pruned_loss=0.04566, over 24378.00 frames. ], batch size: 153, lr: 1.44e-02, grad_scale: 32.0 +2024-01-15 19:41:52,743 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 19:42:03,393 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.5.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.7501, 2.6973, 4.1263, 2.6160], device='cuda:1') +2024-01-15 19:42:04,893 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.3.encoder.layers.3.self_attn_weights, attn_weights_entropy = tensor([1.6871, 1.6152, 2.5361, 2.6010, 2.4037, 2.6321, 2.4723, 2.5204], + device='cuda:1') +2024-01-15 19:42:12,485 INFO [train.py:1026] (1/2) Epoch 29, validation: loss=0.1669, simple_loss=0.2498, pruned_loss=0.042, over 1622729.00 frames. +2024-01-15 19:42:12,486 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 19:42:12,702 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward2.hidden_balancer.prob, batch_count=78680.0, ans=0.125 +2024-01-15 19:42:14,450 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=11.57 vs. limit=15.0 +2024-01-15 19:42:25,182 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=78713.33333333333, ans=0.0 +2024-01-15 19:42:46,452 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.attention_skip_rate, batch_count=78746.66666666667, ans=0.0 +2024-01-15 19:42:54,939 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.492e+02 1.800e+02 2.035e+02 2.389e+02 3.115e+02, threshold=4.070e+02, percent-clipped=0.0 +2024-01-15 19:43:14,030 INFO [train.py:994] (1/2) Epoch 29, batch 50, loss[loss=0.1387, simple_loss=0.2179, pruned_loss=0.02976, over 24167.00 frames. ], tot_loss[loss=0.1483, simple_loss=0.2277, pruned_loss=0.03446, over 1086492.68 frames. ], batch size: 140, lr: 1.43e-02, grad_scale: 32.0 +2024-01-15 19:43:17,215 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.conv_module2.whiten, num_groups=1, num_channels=192, metric=3.07 vs. limit=15.0 +2024-01-15 19:43:18,544 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.balancer.max_positive, batch_count=78846.66666666667, ans=0.95 +2024-01-15 19:43:19,195 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=192, metric=11.61 vs. 
limit=15.0 +2024-01-15 19:43:44,655 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=78913.33333333333, ans=0.1 +2024-01-15 19:43:58,820 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff2_skip_rate, batch_count=78946.66666666667, ans=0.0 +2024-01-15 19:44:10,760 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff3_skip_rate, batch_count=78980.0, ans=0.0 +2024-01-15 19:44:16,266 INFO [train.py:994] (1/2) Epoch 29, batch 100, loss[loss=0.1448, simple_loss=0.2262, pruned_loss=0.03177, over 24512.00 frames. ], tot_loss[loss=0.1495, simple_loss=0.2294, pruned_loss=0.03485, over 1919661.76 frames. ], batch size: 243, lr: 1.43e-02, grad_scale: 32.0 +2024-01-15 19:44:58,019 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer1.prob, batch_count=79113.33333333333, ans=0.125 +2024-01-15 19:44:58,778 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.562e+02 1.850e+02 1.997e+02 2.293e+02 3.475e+02, threshold=3.995e+02, percent-clipped=0.0 +2024-01-15 19:45:17,654 INFO [train.py:994] (1/2) Epoch 29, batch 150, loss[loss=0.121, simple_loss=0.197, pruned_loss=0.02251, over 23497.00 frames. ], tot_loss[loss=0.1492, simple_loss=0.229, pruned_loss=0.03472, over 2561873.59 frames. ], batch size: 119, lr: 1.43e-02, grad_scale: 32.0 +2024-01-15 19:45:28,869 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=512, metric=3.77 vs. limit=15.0 +2024-01-15 19:45:38,328 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=5.45 vs. limit=15.0 +2024-01-15 19:45:40,265 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff3_skip_rate, batch_count=79213.33333333333, ans=0.0 +2024-01-15 19:46:03,949 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.ff3_skip_rate, batch_count=79280.0, ans=0.0 +2024-01-15 19:46:06,536 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=256, metric=18.99 vs. limit=22.5 +2024-01-15 19:46:08,031 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer2.prob, batch_count=79313.33333333333, ans=0.125 +2024-01-15 19:46:20,273 INFO [train.py:994] (1/2) Epoch 29, batch 200, loss[loss=0.1436, simple_loss=0.2151, pruned_loss=0.03606, over 23616.00 frames. ], tot_loss[loss=0.1496, simple_loss=0.2296, pruned_loss=0.03476, over 3062831.28 frames. ], batch size: 119, lr: 1.43e-02, grad_scale: 32.0 +2024-01-15 19:46:28,814 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=79346.66666666667, ans=0.125 +2024-01-15 19:46:37,034 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=192, metric=12.59 vs. limit=15.0 +2024-01-15 19:46:51,435 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_skip_rate, batch_count=79413.33333333333, ans=0.0 +2024-01-15 19:47:01,981 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=384, metric=18.61 vs. 
limit=22.5 +2024-01-15 19:47:02,591 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.436e+02 1.852e+02 2.095e+02 2.443e+02 3.716e+02, threshold=4.190e+02, percent-clipped=0.0 +2024-01-15 19:47:04,020 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff2_skip_rate, batch_count=79446.66666666667, ans=0.0 +2024-01-15 19:47:09,449 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer2.prob, batch_count=79480.0, ans=0.125 +2024-01-15 19:47:22,027 INFO [train.py:994] (1/2) Epoch 29, batch 250, loss[loss=0.1473, simple_loss=0.2275, pruned_loss=0.03353, over 24331.00 frames. ], tot_loss[loss=0.1492, simple_loss=0.229, pruned_loss=0.03473, over 3444576.30 frames. ], batch size: 285, lr: 1.43e-02, grad_scale: 32.0 +2024-01-15 19:47:24,065 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.skip_rate, batch_count=79513.33333333333, ans=0.09899494936611666 +2024-01-15 19:47:44,698 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=79546.66666666667, ans=0.1 +2024-01-15 19:47:51,215 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward2.hidden_balancer.prob, batch_count=79580.0, ans=0.125 +2024-01-15 19:48:05,865 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer1.prob, batch_count=79613.33333333333, ans=0.125 +2024-01-15 19:48:07,315 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=8.10 vs. limit=15.0 +2024-01-15 19:48:13,287 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=4.05 vs. limit=10.0 +2024-01-15 19:48:24,480 INFO [train.py:994] (1/2) Epoch 29, batch 300, loss[loss=0.1548, simple_loss=0.2349, pruned_loss=0.03732, over 24490.00 frames. ], tot_loss[loss=0.1492, simple_loss=0.2288, pruned_loss=0.0348, over 3745660.39 frames. ], batch size: 210, lr: 1.43e-02, grad_scale: 32.0 +2024-01-15 19:48:40,331 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=79713.33333333333, ans=0.1 +2024-01-15 19:48:41,435 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer2.min_positive, batch_count=79713.33333333333, ans=0.05 +2024-01-15 19:48:43,294 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.0.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 19:48:45,806 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.hidden_balancer.prob, batch_count=79713.33333333333, ans=0.125 +2024-01-15 19:49:06,087 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.569e+02 1.816e+02 2.120e+02 2.493e+02 4.503e+02, threshold=4.241e+02, percent-clipped=1.0 +2024-01-15 19:49:26,242 INFO [train.py:994] (1/2) Epoch 29, batch 350, loss[loss=0.1478, simple_loss=0.2269, pruned_loss=0.0344, over 24407.00 frames. ], tot_loss[loss=0.1489, simple_loss=0.2286, pruned_loss=0.03461, over 3976145.82 frames. 
], batch size: 258, lr: 1.43e-02, grad_scale: 32.0 +2024-01-15 19:49:27,719 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer2.min_abs, batch_count=79846.66666666667, ans=0.5 +2024-01-15 19:49:39,612 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=79880.0, ans=0.1 +2024-01-15 19:49:39,939 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=10.98 vs. limit=15.0 +2024-01-15 19:49:47,853 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer1.prob, batch_count=79880.0, ans=0.125 +2024-01-15 19:49:56,144 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff3_skip_rate, batch_count=79913.33333333333, ans=0.0 +2024-01-15 19:50:29,656 INFO [train.py:994] (1/2) Epoch 29, batch 400, loss[loss=0.1583, simple_loss=0.2407, pruned_loss=0.03796, over 24478.00 frames. ], tot_loss[loss=0.1489, simple_loss=0.2288, pruned_loss=0.03448, over 4152568.28 frames. ], batch size: 222, lr: 1.43e-02, grad_scale: 32.0 +2024-01-15 19:50:33,575 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=192, metric=6.32 vs. limit=15.0 +2024-01-15 19:51:11,279 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.476e+02 1.814e+02 2.020e+02 2.310e+02 3.488e+02, threshold=4.040e+02, percent-clipped=0.0 +2024-01-15 19:51:17,844 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=10.90 vs. limit=15.0 +2024-01-15 19:51:30,813 INFO [train.py:994] (1/2) Epoch 29, batch 450, loss[loss=0.1649, simple_loss=0.2383, pruned_loss=0.04571, over 24564.00 frames. ], tot_loss[loss=0.1488, simple_loss=0.2285, pruned_loss=0.03456, over 4301470.91 frames. ], batch size: 176, lr: 1.42e-02, grad_scale: 32.0 +2024-01-15 19:51:36,974 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=80180.0, ans=0.1 +2024-01-15 19:51:49,851 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=80213.33333333333, ans=0.1 +2024-01-15 19:51:57,943 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer1.prob, batch_count=80246.66666666667, ans=0.125 +2024-01-15 19:52:08,620 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=80280.0, ans=0.125 +2024-01-15 19:52:31,792 INFO [train.py:994] (1/2) Epoch 29, batch 500, loss[loss=0.1358, simple_loss=0.2157, pruned_loss=0.02793, over 24216.00 frames. ], tot_loss[loss=0.1489, simple_loss=0.2285, pruned_loss=0.03467, over 4417121.10 frames. 
], batch size: 140, lr: 1.42e-02, grad_scale: 32.0 +2024-01-15 19:52:50,858 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.prob, batch_count=80380.0, ans=0.125 +2024-01-15 19:53:03,600 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.nonlin_attention.balancer.prob, batch_count=80413.33333333333, ans=0.125 +2024-01-15 19:53:04,851 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=80413.33333333333, ans=0.0 +2024-01-15 19:53:14,691 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.489e+02 1.815e+02 2.023e+02 2.405e+02 3.729e+02, threshold=4.047e+02, percent-clipped=0.0 +2024-01-15 19:53:26,764 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=80480.0, ans=0.1 +2024-01-15 19:53:31,457 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=5.32 vs. limit=10.0 +2024-01-15 19:53:34,798 INFO [train.py:994] (1/2) Epoch 29, batch 550, loss[loss=0.1537, simple_loss=0.2304, pruned_loss=0.0385, over 24530.00 frames. ], tot_loss[loss=0.1491, simple_loss=0.2289, pruned_loss=0.03465, over 4513405.56 frames. ], batch size: 236, lr: 1.42e-02, grad_scale: 32.0 +2024-01-15 19:53:36,729 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=384, metric=4.98 vs. limit=15.0 +2024-01-15 19:53:44,599 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.skip_rate, batch_count=80513.33333333333, ans=0.07 +2024-01-15 19:53:50,669 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=384, metric=2.75 vs. limit=15.0 +2024-01-15 19:53:56,670 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.attention_skip_rate, batch_count=80546.66666666667, ans=0.0 +2024-01-15 19:53:59,149 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.attention_skip_rate, batch_count=80580.0, ans=0.0 +2024-01-15 19:54:22,904 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer2.prob, batch_count=80646.66666666667, ans=0.125 +2024-01-15 19:54:26,300 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.conv_module2.whiten, num_groups=1, num_channels=512, metric=7.69 vs. limit=15.0 +2024-01-15 19:54:36,481 INFO [train.py:994] (1/2) Epoch 29, batch 600, loss[loss=0.1448, simple_loss=0.2284, pruned_loss=0.03058, over 24438.00 frames. ], tot_loss[loss=0.1489, simple_loss=0.2286, pruned_loss=0.03463, over 4564824.31 frames. ], batch size: 170, lr: 1.42e-02, grad_scale: 32.0 +2024-01-15 19:54:44,476 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=5.24 vs. limit=6.0 +2024-01-15 19:55:20,316 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.540e+02 1.854e+02 2.059e+02 2.378e+02 3.411e+02, threshold=4.118e+02, percent-clipped=0.0 +2024-01-15 19:55:38,442 INFO [train.py:994] (1/2) Epoch 29, batch 650, loss[loss=0.1423, simple_loss=0.2243, pruned_loss=0.03011, over 24336.00 frames. 
], tot_loss[loss=0.1484, simple_loss=0.228, pruned_loss=0.03435, over 4617860.62 frames. ], batch size: 298, lr: 1.42e-02, grad_scale: 16.0 +2024-01-15 19:55:48,640 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.skip_rate, batch_count=80846.66666666667, ans=0.07 +2024-01-15 19:55:51,052 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.skip_rate, batch_count=80880.0, ans=0.035 +2024-01-15 19:55:59,426 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer2.prob, batch_count=80880.0, ans=0.125 +2024-01-15 19:56:40,298 INFO [train.py:994] (1/2) Epoch 29, batch 700, loss[loss=0.1421, simple_loss=0.2227, pruned_loss=0.03075, over 24190.00 frames. ], tot_loss[loss=0.1488, simple_loss=0.2284, pruned_loss=0.03457, over 4665958.76 frames. ], batch size: 140, lr: 1.42e-02, grad_scale: 16.0 +2024-01-15 19:56:44,090 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer1.prob, batch_count=81013.33333333333, ans=0.125 +2024-01-15 19:56:45,238 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=81013.33333333333, ans=0.125 +2024-01-15 19:56:49,952 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.balancer2.prob, batch_count=81013.33333333333, ans=0.125 +2024-01-15 19:57:19,435 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=256, metric=16.22 vs. limit=22.5 +2024-01-15 19:57:23,205 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.490e+02 1.881e+02 2.019e+02 2.378e+02 3.656e+02, threshold=4.037e+02, percent-clipped=0.0 +2024-01-15 19:57:41,599 INFO [train.py:994] (1/2) Epoch 29, batch 750, loss[loss=0.1582, simple_loss=0.238, pruned_loss=0.03921, over 24425.00 frames. ], tot_loss[loss=0.1492, simple_loss=0.2287, pruned_loss=0.03485, over 4702050.55 frames. ], batch size: 159, lr: 1.42e-02, grad_scale: 16.0 +2024-01-15 19:57:49,831 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.out_combiner.scale_min, batch_count=81180.0, ans=0.2 +2024-01-15 19:57:52,735 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=81180.0, ans=0.125 +2024-01-15 19:58:11,328 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=11.67 vs. limit=15.0 +2024-01-15 19:58:12,260 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=81246.66666666667, ans=0.0 +2024-01-15 19:58:24,090 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=81280.0, ans=0.125 +2024-01-15 19:58:25,047 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 19:58:32,811 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=81313.33333333333, ans=0.0 +2024-01-15 19:58:38,094 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=8.07 vs. 
limit=15.0 +2024-01-15 19:58:41,829 INFO [train.py:994] (1/2) Epoch 29, batch 800, loss[loss=0.1435, simple_loss=0.2276, pruned_loss=0.02975, over 24239.00 frames. ], tot_loss[loss=0.1487, simple_loss=0.2281, pruned_loss=0.03468, over 4715030.50 frames. ], batch size: 311, lr: 1.41e-02, grad_scale: 32.0 +2024-01-15 19:58:44,376 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=81346.66666666667, ans=0.125 +2024-01-15 19:58:52,643 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.balancer.prob, batch_count=81380.0, ans=0.125 +2024-01-15 19:58:59,207 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=81380.0, ans=0.0 +2024-01-15 19:59:08,896 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.scale_min, batch_count=81413.33333333333, ans=0.2 +2024-01-15 19:59:21,921 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.562e+02 1.819e+02 1.963e+02 2.267e+02 3.581e+02, threshold=3.926e+02, percent-clipped=0.0 +2024-01-15 19:59:54,086 INFO [train.py:994] (1/2) Epoch 30, batch 0, loss[loss=0.1587, simple_loss=0.2384, pruned_loss=0.03952, over 24222.00 frames. ], tot_loss[loss=0.1587, simple_loss=0.2384, pruned_loss=0.03952, over 24222.00 frames. ], batch size: 311, lr: 1.39e-02, grad_scale: 32.0 +2024-01-15 19:59:54,086 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 20:00:00,904 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.0.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.4433, 5.1268, 5.0852, 5.0079], device='cuda:1') +2024-01-15 20:00:14,347 INFO [train.py:1026] (1/2) Epoch 30, validation: loss=0.1667, simple_loss=0.2496, pruned_loss=0.04196, over 1622729.00 frames. +2024-01-15 20:00:14,348 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 20:00:40,848 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.bypass.scale_min, batch_count=81556.66666666667, ans=0.2 +2024-01-15 20:00:58,606 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.ff2_skip_rate, batch_count=81590.0, ans=0.0 +2024-01-15 20:01:16,549 INFO [train.py:994] (1/2) Epoch 30, batch 50, loss[loss=0.1514, simple_loss=0.2264, pruned_loss=0.0382, over 24462.00 frames. ], tot_loss[loss=0.1465, simple_loss=0.2257, pruned_loss=0.03366, over 1084339.16 frames. ], batch size: 222, lr: 1.39e-02, grad_scale: 32.0 +2024-01-15 20:01:17,951 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=81656.66666666667, ans=0.125 +2024-01-15 20:01:38,252 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=2.86 vs. 
limit=15.0 +2024-01-15 20:01:42,796 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff2_skip_rate, batch_count=81723.33333333333, ans=0.0 +2024-01-15 20:01:46,768 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.scale_min, batch_count=81723.33333333333, ans=0.2 +2024-01-15 20:02:06,438 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass_mid.scale_min, batch_count=81790.0, ans=0.2 +2024-01-15 20:02:09,644 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.554e+02 1.802e+02 2.012e+02 2.445e+02 3.493e+02, threshold=4.024e+02, percent-clipped=0.0 +2024-01-15 20:02:18,790 INFO [train.py:994] (1/2) Epoch 30, batch 100, loss[loss=0.133, simple_loss=0.2088, pruned_loss=0.0286, over 24358.00 frames. ], tot_loss[loss=0.1461, simple_loss=0.2249, pruned_loss=0.03362, over 1909789.76 frames. ], batch size: 153, lr: 1.39e-02, grad_scale: 32.0 +2024-01-15 20:02:22,042 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=5.29 vs. limit=15.0 +2024-01-15 20:02:24,754 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass_mid.scale_min, batch_count=81823.33333333333, ans=0.2 +2024-01-15 20:03:12,792 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=81956.66666666667, ans=0.125 +2024-01-15 20:03:21,385 INFO [train.py:994] (1/2) Epoch 30, batch 150, loss[loss=0.1252, simple_loss=0.1959, pruned_loss=0.0273, over 23509.00 frames. ], tot_loss[loss=0.1457, simple_loss=0.2248, pruned_loss=0.03333, over 2553415.70 frames. ], batch size: 119, lr: 1.39e-02, grad_scale: 32.0 +2024-01-15 20:03:30,250 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_skip_rate, batch_count=81990.0, ans=0.0 +2024-01-15 20:03:36,096 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=5.45 vs. limit=6.0 +2024-01-15 20:03:47,858 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=82056.66666666667, ans=0.125 +2024-01-15 20:03:57,987 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=82090.0, ans=0.125 +2024-01-15 20:04:03,747 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=82090.0, ans=0.1 +2024-01-15 20:04:06,813 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=82090.0, ans=0.125 +2024-01-15 20:04:08,233 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=2.82 vs. 
limit=6.0 +2024-01-15 20:04:14,834 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.551e+02 1.884e+02 2.201e+02 2.660e+02 3.501e+02, threshold=4.402e+02, percent-clipped=0.0 +2024-01-15 20:04:15,114 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.skip_rate, batch_count=82123.33333333333, ans=0.04949747468305833 +2024-01-15 20:04:16,262 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=82123.33333333333, ans=0.0 +2024-01-15 20:04:18,652 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_skip_rate, batch_count=82123.33333333333, ans=0.0 +2024-01-15 20:04:22,680 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=82156.66666666667, ans=0.1 +2024-01-15 20:04:23,585 INFO [train.py:994] (1/2) Epoch 30, batch 200, loss[loss=0.142, simple_loss=0.2254, pruned_loss=0.0293, over 24347.00 frames. ], tot_loss[loss=0.1467, simple_loss=0.2256, pruned_loss=0.0339, over 3053114.43 frames. ], batch size: 298, lr: 1.39e-02, grad_scale: 32.0 +2024-01-15 20:04:28,701 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.ff2_skip_rate, batch_count=82156.66666666667, ans=0.0 +2024-01-15 20:04:29,710 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.8.prob, batch_count=82156.66666666667, ans=0.125 +2024-01-15 20:04:41,405 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.whiten, num_groups=1, num_channels=384, metric=6.95 vs. limit=12.0 +2024-01-15 20:05:20,658 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.attention_skip_rate, batch_count=82290.0, ans=0.0 +2024-01-15 20:05:23,598 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward3.hidden_balancer.prob, batch_count=82290.0, ans=0.125 +2024-01-15 20:05:25,597 INFO [train.py:994] (1/2) Epoch 30, batch 250, loss[loss=0.1529, simple_loss=0.2389, pruned_loss=0.03342, over 23912.00 frames. ], tot_loss[loss=0.1475, simple_loss=0.2266, pruned_loss=0.0342, over 3449801.86 frames. ], batch size: 328, lr: 1.38e-02, grad_scale: 32.0 +2024-01-15 20:06:11,682 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.balancer.max_positive, batch_count=82423.33333333333, ans=0.95 +2024-01-15 20:06:19,047 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.471e+02 1.830e+02 2.048e+02 2.345e+02 3.197e+02, threshold=4.097e+02, percent-clipped=0.0 +2024-01-15 20:06:27,322 INFO [train.py:994] (1/2) Epoch 30, batch 300, loss[loss=0.1462, simple_loss=0.2293, pruned_loss=0.0315, over 23844.00 frames. ], tot_loss[loss=0.1476, simple_loss=0.2271, pruned_loss=0.03403, over 3760255.56 frames. 
], batch size: 328, lr: 1.38e-02, grad_scale: 32.0 +2024-01-15 20:06:34,146 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=82490.0, ans=0.1 +2024-01-15 20:06:40,673 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer1.prob, batch_count=82523.33333333333, ans=0.125 +2024-01-15 20:06:40,720 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.attention_skip_rate, batch_count=82523.33333333333, ans=0.0 +2024-01-15 20:06:53,695 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=82556.66666666667, ans=0.1 +2024-01-15 20:06:57,396 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn2.whiten, num_groups=1, num_channels=192, metric=13.64 vs. limit=22.5 +2024-01-15 20:07:20,926 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.out_combiner.scale_min, batch_count=82623.33333333333, ans=0.2 +2024-01-15 20:07:26,611 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=384, metric=13.53 vs. limit=22.5 +2024-01-15 20:07:30,095 INFO [train.py:994] (1/2) Epoch 30, batch 350, loss[loss=0.148, simple_loss=0.2262, pruned_loss=0.03489, over 24521.00 frames. ], tot_loss[loss=0.1483, simple_loss=0.2279, pruned_loss=0.03437, over 4002839.65 frames. ], batch size: 229, lr: 1.38e-02, grad_scale: 32.0 +2024-01-15 20:07:38,043 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=3.14 vs. limit=15.0 +2024-01-15 20:08:02,018 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=8.46 vs. limit=15.0 +2024-01-15 20:08:02,910 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=82723.33333333333, ans=0.0 +2024-01-15 20:08:08,712 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.0.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 20:08:18,793 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer1.prob, batch_count=82790.0, ans=0.125 +2024-01-15 20:08:23,255 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.505e+02 1.788e+02 1.997e+02 2.329e+02 3.830e+02, threshold=3.994e+02, percent-clipped=0.0 +2024-01-15 20:08:32,273 INFO [train.py:994] (1/2) Epoch 30, batch 400, loss[loss=0.1511, simple_loss=0.2296, pruned_loss=0.03632, over 24392.00 frames. ], tot_loss[loss=0.1482, simple_loss=0.2279, pruned_loss=0.03429, over 4171919.18 frames. ], batch size: 159, lr: 1.38e-02, grad_scale: 32.0 +2024-01-15 20:09:08,273 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer2.prob, batch_count=82923.33333333333, ans=0.125 +2024-01-15 20:09:15,846 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.whiten, num_groups=1, num_channels=384, metric=5.65 vs. limit=12.0 +2024-01-15 20:09:19,711 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=10.15 vs. 
limit=15.0 +2024-01-15 20:09:21,806 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer2.prob, batch_count=82956.66666666667, ans=0.125 +2024-01-15 20:09:33,910 INFO [train.py:994] (1/2) Epoch 30, batch 450, loss[loss=0.1574, simple_loss=0.2371, pruned_loss=0.03884, over 24433.00 frames. ], tot_loss[loss=0.148, simple_loss=0.2278, pruned_loss=0.03409, over 4312340.17 frames. ], batch size: 222, lr: 1.38e-02, grad_scale: 32.0 +2024-01-15 20:09:42,465 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=82990.0, ans=0.1 +2024-01-15 20:09:45,455 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.ff2_skip_rate, batch_count=83023.33333333333, ans=0.0 +2024-01-15 20:09:45,491 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=83023.33333333333, ans=0.125 +2024-01-15 20:09:49,013 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass_mid.scale_min, batch_count=83023.33333333333, ans=0.2 +2024-01-15 20:09:52,575 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer2.prob, batch_count=83023.33333333333, ans=0.125 +2024-01-15 20:10:20,434 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.0.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 20:10:22,900 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer1.prob, batch_count=83123.33333333333, ans=0.125 +2024-01-15 20:10:27,451 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.489e+02 1.855e+02 2.114e+02 2.682e+02 3.873e+02, threshold=4.227e+02, percent-clipped=0.0 +2024-01-15 20:10:35,666 INFO [train.py:994] (1/2) Epoch 30, batch 500, loss[loss=0.1347, simple_loss=0.2173, pruned_loss=0.02611, over 24312.00 frames. ], tot_loss[loss=0.1477, simple_loss=0.2275, pruned_loss=0.03398, over 4412010.78 frames. ], batch size: 147, lr: 1.38e-02, grad_scale: 32.0 +2024-01-15 20:10:55,881 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=83190.0, ans=0.0 +2024-01-15 20:10:58,707 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=512, metric=2.28 vs. limit=15.0 +2024-01-15 20:10:59,435 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer2.prob, batch_count=83223.33333333333, ans=0.125 +2024-01-15 20:11:04,750 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 20:11:22,831 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=83256.66666666667, ans=0.1 +2024-01-15 20:11:24,380 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=4.06 vs. limit=10.0 +2024-01-15 20:11:37,925 INFO [train.py:994] (1/2) Epoch 30, batch 550, loss[loss=0.1505, simple_loss=0.2368, pruned_loss=0.03211, over 24352.00 frames. ], tot_loss[loss=0.1476, simple_loss=0.2273, pruned_loss=0.03395, over 4500063.63 frames. 
], batch size: 298, lr: 1.38e-02, grad_scale: 32.0 +2024-01-15 20:11:43,259 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass.scale_min, batch_count=83323.33333333333, ans=0.2 +2024-01-15 20:11:44,445 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=83323.33333333333, ans=0.1 +2024-01-15 20:12:02,119 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.whiten, num_groups=1, num_channels=384, metric=5.72 vs. limit=12.0 +2024-01-15 20:12:18,836 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=83423.33333333333, ans=0.125 +2024-01-15 20:12:27,569 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.attention_skip_rate, batch_count=83456.66666666667, ans=0.0 +2024-01-15 20:12:31,411 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.527e+02 1.924e+02 2.195e+02 2.587e+02 4.884e+02, threshold=4.390e+02, percent-clipped=1.0 +2024-01-15 20:12:39,667 INFO [train.py:994] (1/2) Epoch 30, batch 600, loss[loss=0.1538, simple_loss=0.2391, pruned_loss=0.0343, over 24447.00 frames. ], tot_loss[loss=0.1479, simple_loss=0.2275, pruned_loss=0.03418, over 4557398.37 frames. ], batch size: 250, lr: 1.38e-02, grad_scale: 32.0 +2024-01-15 20:12:43,494 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_skip_rate, batch_count=83490.0, ans=0.0 +2024-01-15 20:12:52,712 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=10.31 vs. limit=15.0 +2024-01-15 20:13:21,438 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=83590.0, ans=0.125 +2024-01-15 20:13:28,321 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.prob, batch_count=83623.33333333333, ans=0.125 +2024-01-15 20:13:31,753 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer2.prob, batch_count=83623.33333333333, ans=0.125 +2024-01-15 20:13:32,323 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=14.51 vs. limit=22.5 +2024-01-15 20:13:41,417 INFO [train.py:994] (1/2) Epoch 30, batch 650, loss[loss=0.1597, simple_loss=0.2378, pruned_loss=0.04079, over 24528.00 frames. ], tot_loss[loss=0.1479, simple_loss=0.2277, pruned_loss=0.03406, over 4622723.75 frames. ], batch size: 236, lr: 1.37e-02, grad_scale: 32.0 +2024-01-15 20:13:54,278 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.ff3_skip_rate, batch_count=83690.0, ans=0.0 +2024-01-15 20:14:00,063 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module2.balancer1.min_positive, batch_count=83690.0, ans=0.025 +2024-01-15 20:14:11,148 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=5.79 vs. 
limit=15.0 +2024-01-15 20:14:15,404 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=83723.33333333333, ans=0.1 +2024-01-15 20:14:35,314 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.453e+02 1.791e+02 1.959e+02 2.214e+02 2.829e+02, threshold=3.919e+02, percent-clipped=0.0 +2024-01-15 20:14:35,586 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.balancer1.prob, batch_count=83790.0, ans=0.125 +2024-01-15 20:14:39,164 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.ff2_skip_rate, batch_count=83790.0, ans=0.0 +2024-01-15 20:14:42,576 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer1.prob, batch_count=83823.33333333333, ans=0.125 +2024-01-15 20:14:43,494 INFO [train.py:994] (1/2) Epoch 30, batch 700, loss[loss=0.1436, simple_loss=0.227, pruned_loss=0.03007, over 24508.00 frames. ], tot_loss[loss=0.1478, simple_loss=0.2276, pruned_loss=0.03404, over 4661940.37 frames. ], batch size: 204, lr: 1.37e-02, grad_scale: 32.0 +2024-01-15 20:14:48,369 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.hidden_balancer.prob, batch_count=83823.33333333333, ans=0.125 +2024-01-15 20:15:01,655 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=83856.66666666667, ans=0.1 +2024-01-15 20:15:04,106 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=83856.66666666667, ans=0.0 +2024-01-15 20:15:13,335 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.skip_rate, batch_count=83890.0, ans=0.035 +2024-01-15 20:15:25,768 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.scale_min, batch_count=83923.33333333333, ans=0.2 +2024-01-15 20:15:42,505 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer2.prob, batch_count=83956.66666666667, ans=0.125 +2024-01-15 20:15:43,600 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass_mid.scale_min, batch_count=83990.0, ans=0.2 +2024-01-15 20:15:44,501 INFO [train.py:994] (1/2) Epoch 30, batch 750, loss[loss=0.1572, simple_loss=0.2323, pruned_loss=0.04106, over 24485.00 frames. ], tot_loss[loss=0.1474, simple_loss=0.2273, pruned_loss=0.03377, over 4692204.02 frames. 
], batch size: 165, lr: 1.37e-02, grad_scale: 32.0 +2024-01-15 20:15:50,730 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer2.min_positive, batch_count=83990.0, ans=0.05 +2024-01-15 20:15:50,781 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer1.prob, batch_count=83990.0, ans=0.125 +2024-01-15 20:15:51,860 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=83990.0, ans=0.1 +2024-01-15 20:15:54,378 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff2_skip_rate, batch_count=83990.0, ans=0.0 +2024-01-15 20:15:55,482 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=83990.0, ans=0.125 +2024-01-15 20:16:36,765 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.548e+02 1.859e+02 2.058e+02 2.418e+02 3.544e+02, threshold=4.115e+02, percent-clipped=0.0 +2024-01-15 20:16:44,690 INFO [train.py:994] (1/2) Epoch 30, batch 800, loss[loss=0.1545, simple_loss=0.2322, pruned_loss=0.03841, over 24488.00 frames. ], tot_loss[loss=0.1473, simple_loss=0.2269, pruned_loss=0.03382, over 4707096.93 frames. ], batch size: 210, lr: 1.37e-02, grad_scale: 32.0 +2024-01-15 20:16:49,687 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=6.92 vs. limit=15.0 +2024-01-15 20:16:51,039 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=20.99 vs. limit=22.5 +2024-01-15 20:16:51,570 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=84156.66666666667, ans=0.1 +2024-01-15 20:17:14,792 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward3.hidden_balancer.prob, batch_count=84223.33333333333, ans=0.125 +2024-01-15 20:17:15,830 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.8.prob, batch_count=84223.33333333333, ans=0.125 +2024-01-15 20:17:56,021 INFO [train.py:994] (1/2) Epoch 31, batch 0, loss[loss=0.1346, simple_loss=0.2182, pruned_loss=0.0255, over 24204.00 frames. ], tot_loss[loss=0.1346, simple_loss=0.2182, pruned_loss=0.0255, over 24204.00 frames. ], batch size: 140, lr: 1.35e-02, grad_scale: 32.0 +2024-01-15 20:17:56,021 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 20:18:06,781 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.3.encoder.layers.2.self_attn_weights, attn_weights_entropy = tensor([1.7002, 2.4567, 2.6247, 2.5307, 1.8304, 2.5934, 2.5210, 2.4426], + device='cuda:1') +2024-01-15 20:18:07,494 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([4.0177, 3.7992, 3.1546, 3.5890], device='cuda:1') +2024-01-15 20:18:16,809 INFO [train.py:1026] (1/2) Epoch 31, validation: loss=0.1652, simple_loss=0.2487, pruned_loss=0.04091, over 1622729.00 frames. 
+2024-01-15 20:18:16,810 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 20:18:19,492 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.scale_min, batch_count=84300.0, ans=0.2 +2024-01-15 20:18:24,369 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.balancer2.prob, batch_count=84300.0, ans=0.125 +2024-01-15 20:18:44,973 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=192, metric=6.56 vs. limit=15.0 +2024-01-15 20:18:55,005 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer_na.min_abs, batch_count=84400.0, ans=0.02 +2024-01-15 20:19:18,316 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.496e+02 1.819e+02 1.963e+02 2.349e+02 3.862e+02, threshold=3.926e+02, percent-clipped=0.0 +2024-01-15 20:19:18,344 INFO [train.py:994] (1/2) Epoch 31, batch 50, loss[loss=0.151, simple_loss=0.2375, pruned_loss=0.03229, over 22573.00 frames. ], tot_loss[loss=0.1482, simple_loss=0.2274, pruned_loss=0.03454, over 1085593.66 frames. ], batch size: 357, lr: 1.35e-02, grad_scale: 32.0 +2024-01-15 20:19:49,528 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=84533.33333333333, ans=0.0 +2024-01-15 20:19:50,669 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer1.prob, batch_count=84533.33333333333, ans=0.125 +2024-01-15 20:20:21,172 INFO [train.py:994] (1/2) Epoch 31, batch 100, loss[loss=0.1478, simple_loss=0.2282, pruned_loss=0.03371, over 24532.00 frames. ], tot_loss[loss=0.1455, simple_loss=0.2247, pruned_loss=0.03309, over 1899696.29 frames. ], batch size: 193, lr: 1.35e-02, grad_scale: 32.0 +2024-01-15 20:20:38,260 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass.skip_rate, batch_count=84666.66666666667, ans=0.035 +2024-01-15 20:21:06,603 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=84733.33333333333, ans=0.125 +2024-01-15 20:21:19,851 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=84766.66666666667, ans=0.1 +2024-01-15 20:21:23,156 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.391e+02 1.828e+02 2.015e+02 2.325e+02 4.082e+02, threshold=4.031e+02, percent-clipped=1.0 +2024-01-15 20:21:23,184 INFO [train.py:994] (1/2) Epoch 31, batch 150, loss[loss=0.1523, simple_loss=0.2314, pruned_loss=0.03664, over 24371.00 frames. ], tot_loss[loss=0.1455, simple_loss=0.2248, pruned_loss=0.03308, over 2532441.34 frames. ], batch size: 275, lr: 1.35e-02, grad_scale: 32.0 +2024-01-15 20:21:36,991 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=84833.33333333333, ans=0.1 +2024-01-15 20:21:46,274 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=192, metric=5.77 vs. 
limit=15.0 +2024-01-15 20:22:01,695 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass_mid.scale_min, batch_count=84900.0, ans=0.2 +2024-01-15 20:22:23,654 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff2_skip_rate, batch_count=84966.66666666667, ans=0.0 +2024-01-15 20:22:25,069 INFO [train.py:994] (1/2) Epoch 31, batch 200, loss[loss=0.1527, simple_loss=0.2352, pruned_loss=0.03507, over 24524.00 frames. ], tot_loss[loss=0.146, simple_loss=0.2255, pruned_loss=0.03323, over 3032160.96 frames. ], batch size: 204, lr: 1.34e-02, grad_scale: 32.0 +2024-01-15 20:22:35,990 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=192, metric=7.08 vs. limit=15.0 +2024-01-15 20:22:56,922 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=85033.33333333333, ans=0.125 +2024-01-15 20:23:16,310 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=9.96 vs. limit=10.0 +2024-01-15 20:23:27,464 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.517e+02 1.737e+02 1.884e+02 2.109e+02 3.532e+02, threshold=3.768e+02, percent-clipped=0.0 +2024-01-15 20:23:27,496 INFO [train.py:994] (1/2) Epoch 31, batch 250, loss[loss=0.1321, simple_loss=0.2005, pruned_loss=0.0318, over 23531.00 frames. ], tot_loss[loss=0.1457, simple_loss=0.2255, pruned_loss=0.03294, over 3431195.03 frames. ], batch size: 119, lr: 1.34e-02, grad_scale: 32.0 +2024-01-15 20:23:28,989 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff2_skip_rate, batch_count=85133.33333333333, ans=0.0 +2024-01-15 20:23:39,409 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=85166.66666666667, ans=0.125 +2024-01-15 20:23:39,447 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.scale_min, batch_count=85166.66666666667, ans=0.2 +2024-01-15 20:24:07,928 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=4.01 vs. limit=6.0 +2024-01-15 20:24:13,496 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_skip_rate, batch_count=85233.33333333333, ans=0.0 +2024-01-15 20:24:26,468 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer2.prob, batch_count=85266.66666666667, ans=0.125 +2024-01-15 20:24:28,427 INFO [train.py:994] (1/2) Epoch 31, batch 300, loss[loss=0.1514, simple_loss=0.2313, pruned_loss=0.03579, over 24512.00 frames. ], tot_loss[loss=0.1455, simple_loss=0.2255, pruned_loss=0.03276, over 3736967.27 frames. 
], batch size: 229, lr: 1.34e-02, grad_scale: 32.0 +2024-01-15 20:24:47,832 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=85333.33333333333, ans=0.1 +2024-01-15 20:25:03,023 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer1.prob, batch_count=85366.66666666667, ans=0.125 +2024-01-15 20:25:06,542 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer2.prob, batch_count=85400.0, ans=0.125 +2024-01-15 20:25:11,249 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff2_skip_rate, batch_count=85400.0, ans=0.0 +2024-01-15 20:25:30,945 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.391e+02 1.870e+02 2.116e+02 2.844e+02 4.869e+02, threshold=4.233e+02, percent-clipped=5.0 +2024-01-15 20:25:30,974 INFO [train.py:994] (1/2) Epoch 31, batch 350, loss[loss=0.1567, simple_loss=0.233, pruned_loss=0.04015, over 24306.00 frames. ], tot_loss[loss=0.1455, simple_loss=0.2256, pruned_loss=0.03271, over 3974139.28 frames. ], batch size: 285, lr: 1.34e-02, grad_scale: 32.0 +2024-01-15 20:25:33,553 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=85466.66666666667, ans=0.125 +2024-01-15 20:25:36,036 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 20:26:00,790 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=21.65 vs. limit=22.5 +2024-01-15 20:26:09,914 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=384, metric=12.56 vs. limit=22.5 +2024-01-15 20:26:18,543 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=85600.0, ans=0.0 +2024-01-15 20:26:19,730 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.attention_skip_rate, batch_count=85600.0, ans=0.0 +2024-01-15 20:26:31,186 INFO [train.py:994] (1/2) Epoch 31, batch 400, loss[loss=0.158, simple_loss=0.2369, pruned_loss=0.0395, over 24448.00 frames. ], tot_loss[loss=0.1448, simple_loss=0.2249, pruned_loss=0.03235, over 4152216.17 frames. ], batch size: 170, lr: 1.34e-02, grad_scale: 32.0 +2024-01-15 20:27:15,246 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 20:27:17,593 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff3_skip_rate, batch_count=85733.33333333333, ans=0.0 +2024-01-15 20:27:22,624 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.05 vs. limit=6.0 +2024-01-15 20:27:29,346 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.2.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 20:27:33,137 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.542e+02 1.759e+02 1.988e+02 2.277e+02 3.260e+02, threshold=3.975e+02, percent-clipped=0.0 +2024-01-15 20:27:33,165 INFO [train.py:994] (1/2) Epoch 31, batch 450, loss[loss=0.1426, simple_loss=0.2241, pruned_loss=0.03055, over 24412.00 frames. 
], tot_loss[loss=0.145, simple_loss=0.2251, pruned_loss=0.03248, over 4302837.40 frames. ], batch size: 250, lr: 1.34e-02, grad_scale: 32.0 +2024-01-15 20:27:34,604 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.scale_min, batch_count=85800.0, ans=0.2 +2024-01-15 20:28:18,529 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=85900.0, ans=0.125 +2024-01-15 20:28:18,560 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=85900.0, ans=0.1 +2024-01-15 20:28:23,895 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.ff2_skip_rate, batch_count=85933.33333333333, ans=0.0 +2024-01-15 20:28:29,256 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=85933.33333333333, ans=0.0 +2024-01-15 20:28:34,935 INFO [train.py:994] (1/2) Epoch 31, batch 500, loss[loss=0.1507, simple_loss=0.2302, pruned_loss=0.03566, over 24522.00 frames. ], tot_loss[loss=0.1456, simple_loss=0.2257, pruned_loss=0.03272, over 4428577.37 frames. ], batch size: 165, lr: 1.34e-02, grad_scale: 32.0 +2024-01-15 20:28:43,887 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=512, metric=4.35 vs. limit=15.0 +2024-01-15 20:28:45,226 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=11.39 vs. limit=15.0 +2024-01-15 20:29:18,637 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=86066.66666666667, ans=0.0 +2024-01-15 20:29:31,268 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.balancer.prob, batch_count=86100.0, ans=0.125 +2024-01-15 20:29:37,060 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.543e+02 1.823e+02 1.990e+02 2.284e+02 3.467e+02, threshold=3.981e+02, percent-clipped=0.0 +2024-01-15 20:29:37,088 INFO [train.py:994] (1/2) Epoch 31, batch 550, loss[loss=0.1569, simple_loss=0.2422, pruned_loss=0.03581, over 23901.00 frames. ], tot_loss[loss=0.145, simple_loss=0.2251, pruned_loss=0.03247, over 4504159.10 frames. ], batch size: 328, lr: 1.34e-02, grad_scale: 32.0 +2024-01-15 20:29:51,718 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer1.prob, batch_count=86166.66666666667, ans=0.125 +2024-01-15 20:29:54,273 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.conv_module1.whiten, num_groups=1, num_channels=384, metric=2.91 vs. limit=15.0 +2024-01-15 20:30:02,346 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff3_skip_rate, batch_count=86200.0, ans=0.0 +2024-01-15 20:30:04,642 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=86200.0, ans=0.125 +2024-01-15 20:30:33,018 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer1.prob, batch_count=86266.66666666667, ans=0.125 +2024-01-15 20:30:40,443 INFO [train.py:994] (1/2) Epoch 31, batch 600, loss[loss=0.1509, simple_loss=0.233, pruned_loss=0.03445, over 24619.00 frames. 
], tot_loss[loss=0.1456, simple_loss=0.2256, pruned_loss=0.03279, over 4572551.99 frames. ], batch size: 199, lr: 1.33e-02, grad_scale: 32.0 +2024-01-15 20:30:40,747 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_skip_rate, batch_count=86300.0, ans=0.0 +2024-01-15 20:31:19,256 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=5.42 vs. limit=6.0 +2024-01-15 20:31:26,536 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer_na.min_abs, batch_count=86400.0, ans=0.02 +2024-01-15 20:31:28,878 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=86433.33333333333, ans=0.1 +2024-01-15 20:31:35,315 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer1.prob, batch_count=86433.33333333333, ans=0.125 +2024-01-15 20:31:42,295 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.540e+02 1.828e+02 1.966e+02 2.239e+02 3.295e+02, threshold=3.932e+02, percent-clipped=0.0 +2024-01-15 20:31:42,323 INFO [train.py:994] (1/2) Epoch 31, batch 650, loss[loss=0.1532, simple_loss=0.2289, pruned_loss=0.03879, over 24485.00 frames. ], tot_loss[loss=0.1459, simple_loss=0.2257, pruned_loss=0.03302, over 4618836.06 frames. ], batch size: 181, lr: 1.33e-02, grad_scale: 32.0 +2024-01-15 20:31:42,635 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=86466.66666666667, ans=0.125 +2024-01-15 20:31:47,926 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.attention_skip_rate, batch_count=86466.66666666667, ans=0.0 +2024-01-15 20:31:53,802 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=86500.0, ans=0.0 +2024-01-15 20:31:56,244 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer1.prob, batch_count=86500.0, ans=0.125 +2024-01-15 20:32:06,385 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.balancer2.prob, batch_count=86533.33333333333, ans=0.125 +2024-01-15 20:32:21,944 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=192, metric=7.55 vs. limit=15.0 +2024-01-15 20:32:44,956 INFO [train.py:994] (1/2) Epoch 31, batch 700, loss[loss=0.144, simple_loss=0.2254, pruned_loss=0.03127, over 24543.00 frames. ], tot_loss[loss=0.1465, simple_loss=0.2265, pruned_loss=0.03326, over 4667139.65 frames. ], batch size: 193, lr: 1.33e-02, grad_scale: 32.0 +2024-01-15 20:32:54,286 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=14.53 vs. limit=22.5 +2024-01-15 20:33:00,782 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=5.07 vs. limit=10.0 +2024-01-15 20:33:15,362 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=3.37 vs. 
limit=15.0 +2024-01-15 20:33:28,745 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.2.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 20:33:31,794 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=86733.33333333333, ans=0.125 +2024-01-15 20:33:38,028 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=3.41 vs. limit=10.0 +2024-01-15 20:33:45,510 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer1.prob, batch_count=86766.66666666667, ans=0.125 +2024-01-15 20:33:47,589 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.533e+02 1.770e+02 1.940e+02 2.237e+02 3.282e+02, threshold=3.880e+02, percent-clipped=0.0 +2024-01-15 20:33:47,616 INFO [train.py:994] (1/2) Epoch 31, batch 750, loss[loss=0.1537, simple_loss=0.229, pruned_loss=0.0392, over 24566.00 frames. ], tot_loss[loss=0.1468, simple_loss=0.2265, pruned_loss=0.03351, over 4695640.82 frames. ], batch size: 176, lr: 1.33e-02, grad_scale: 32.0 +2024-01-15 20:33:55,582 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=86800.0, ans=0.1 +2024-01-15 20:34:06,796 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.hidden_balancer.prob, batch_count=86833.33333333333, ans=0.125 +2024-01-15 20:34:26,071 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=10.20 vs. limit=15.0 +2024-01-15 20:34:30,230 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff3_skip_rate, batch_count=86900.0, ans=0.0 +2024-01-15 20:34:32,430 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer2.prob, batch_count=86900.0, ans=0.125 +2024-01-15 20:34:47,307 INFO [train.py:994] (1/2) Epoch 31, batch 800, loss[loss=0.1583, simple_loss=0.24, pruned_loss=0.03827, over 24462.00 frames. ], tot_loss[loss=0.1462, simple_loss=0.2261, pruned_loss=0.03313, over 4727783.53 frames. ], batch size: 222, lr: 1.33e-02, grad_scale: 32.0 +2024-01-15 20:35:19,390 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer2.prob, batch_count=87033.33333333333, ans=0.125 +2024-01-15 20:35:25,386 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=87066.66666666667, ans=0.1 +2024-01-15 20:36:01,608 INFO [train.py:994] (1/2) Epoch 32, batch 0, loss[loss=0.159, simple_loss=0.2408, pruned_loss=0.03861, over 24614.00 frames. ], tot_loss[loss=0.159, simple_loss=0.2408, pruned_loss=0.03861, over 24614.00 frames. ], batch size: 199, lr: 1.31e-02, grad_scale: 32.0 +2024-01-15 20:36:01,609 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 20:36:18,041 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([3.6923, 3.3516, 2.7896, 2.3608], device='cuda:1') +2024-01-15 20:36:22,999 INFO [train.py:1026] (1/2) Epoch 32, validation: loss=0.1657, simple_loss=0.2488, pruned_loss=0.04124, over 1622729.00 frames. 
+2024-01-15 20:36:23,000 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 20:36:30,581 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer1.prob, batch_count=87110.0, ans=0.125 +2024-01-15 20:36:31,439 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.407e+02 1.789e+02 2.040e+02 2.421e+02 3.531e+02, threshold=4.081e+02, percent-clipped=0.0 +2024-01-15 20:36:35,913 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=87143.33333333333, ans=0.125 +2024-01-15 20:36:38,654 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=10.02 vs. limit=15.0 +2024-01-15 20:36:55,421 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=87176.66666666667, ans=0.0 +2024-01-15 20:36:55,479 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer1.prob, batch_count=87176.66666666667, ans=0.125 +2024-01-15 20:37:00,786 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=13.19 vs. limit=15.0 +2024-01-15 20:37:04,092 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.scale_min, batch_count=87210.0, ans=0.2 +2024-01-15 20:37:18,572 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=87243.33333333333, ans=0.125 +2024-01-15 20:37:20,823 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=87243.33333333333, ans=0.1 +2024-01-15 20:37:24,717 INFO [train.py:994] (1/2) Epoch 32, batch 50, loss[loss=0.1511, simple_loss=0.2309, pruned_loss=0.03562, over 24482.00 frames. ], tot_loss[loss=0.1446, simple_loss=0.2241, pruned_loss=0.03255, over 1086474.92 frames. ], batch size: 165, lr: 1.31e-02, grad_scale: 16.0 +2024-01-15 20:37:44,533 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.skip_rate, batch_count=87310.0, ans=0.09899494936611666 +2024-01-15 20:38:06,444 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=2.72 vs. limit=6.0 +2024-01-15 20:38:26,663 INFO [train.py:994] (1/2) Epoch 32, batch 100, loss[loss=0.1442, simple_loss=0.2255, pruned_loss=0.03141, over 24483.00 frames. ], tot_loss[loss=0.1447, simple_loss=0.2248, pruned_loss=0.03232, over 1928258.66 frames. 
], batch size: 181, lr: 1.31e-02, grad_scale: 16.0 +2024-01-15 20:38:36,503 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.506e+02 1.827e+02 2.037e+02 2.266e+02 3.836e+02, threshold=4.073e+02, percent-clipped=0.0 +2024-01-15 20:38:53,423 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer1.prob, batch_count=87510.0, ans=0.125 +2024-01-15 20:38:56,867 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff2_skip_rate, batch_count=87510.0, ans=0.0 +2024-01-15 20:38:57,960 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=87510.0, ans=0.1 +2024-01-15 20:39:01,575 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass_mid.scale_min, batch_count=87510.0, ans=0.2 +2024-01-15 20:39:10,290 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=87543.33333333333, ans=0.1 +2024-01-15 20:39:16,209 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.skip_rate, batch_count=87576.66666666667, ans=0.09899494936611666 +2024-01-15 20:39:28,364 INFO [train.py:994] (1/2) Epoch 32, batch 150, loss[loss=0.1416, simple_loss=0.2183, pruned_loss=0.03241, over 24477.00 frames. ], tot_loss[loss=0.1448, simple_loss=0.2245, pruned_loss=0.0325, over 2564458.38 frames. ], batch size: 216, lr: 1.31e-02, grad_scale: 16.0 +2024-01-15 20:39:31,059 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=87610.0, ans=0.1 +2024-01-15 20:39:31,072 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.out_combiner.scale_min, batch_count=87610.0, ans=0.2 +2024-01-15 20:39:53,392 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 20:40:25,499 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=10.02 vs. limit=15.0 +2024-01-15 20:40:30,368 INFO [train.py:994] (1/2) Epoch 32, batch 200, loss[loss=0.1487, simple_loss=0.2308, pruned_loss=0.03327, over 24511.00 frames. ], tot_loss[loss=0.1457, simple_loss=0.2258, pruned_loss=0.03285, over 3073159.93 frames. 
], batch size: 243, lr: 1.30e-02, grad_scale: 16.0 +2024-01-15 20:40:38,300 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer2.prob, batch_count=87776.66666666667, ans=0.125 +2024-01-15 20:40:40,437 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.492e+02 1.788e+02 2.022e+02 2.395e+02 4.005e+02, threshold=4.044e+02, percent-clipped=0.0 +2024-01-15 20:40:54,034 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=87843.33333333333, ans=0.125 +2024-01-15 20:40:55,582 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.dropout.p, batch_count=87843.33333333333, ans=0.1 +2024-01-15 20:40:58,982 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.ff3_skip_rate, batch_count=87843.33333333333, ans=0.0 +2024-01-15 20:41:00,148 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer2.prob, batch_count=87843.33333333333, ans=0.125 +2024-01-15 20:41:04,594 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.whiten, num_groups=1, num_channels=512, metric=4.73 vs. limit=12.0 +2024-01-15 20:41:13,521 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=87876.66666666667, ans=0.0 +2024-01-15 20:41:19,505 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.conv_module1.whiten, num_groups=1, num_channels=192, metric=8.13 vs. limit=15.0 +2024-01-15 20:41:20,061 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer2.min_abs, batch_count=87910.0, ans=0.5 +2024-01-15 20:41:20,129 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=87910.0, ans=0.1 +2024-01-15 20:41:21,295 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff2_skip_rate, batch_count=87910.0, ans=0.0 +2024-01-15 20:41:24,811 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.attention_skip_rate, batch_count=87910.0, ans=0.0 +2024-01-15 20:41:32,072 INFO [train.py:994] (1/2) Epoch 32, batch 250, loss[loss=0.1445, simple_loss=0.2251, pruned_loss=0.032, over 24461.00 frames. ], tot_loss[loss=0.1448, simple_loss=0.2248, pruned_loss=0.03233, over 3451549.24 frames. ], batch size: 267, lr: 1.30e-02, grad_scale: 16.0 +2024-01-15 20:42:04,589 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=384, metric=2.73 vs. limit=15.0 +2024-01-15 20:42:19,401 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass_mid.scale_min, batch_count=88043.33333333333, ans=0.2 +2024-01-15 20:42:33,438 INFO [train.py:994] (1/2) Epoch 32, batch 300, loss[loss=0.1541, simple_loss=0.2317, pruned_loss=0.03832, over 23920.00 frames. ], tot_loss[loss=0.1452, simple_loss=0.2254, pruned_loss=0.0325, over 3757788.18 frames. 
], batch size: 328, lr: 1.30e-02, grad_scale: 16.0 +2024-01-15 20:42:33,738 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff2_skip_rate, batch_count=88110.0, ans=0.0 +2024-01-15 20:42:44,253 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.481e+02 1.751e+02 1.891e+02 2.061e+02 3.059e+02, threshold=3.782e+02, percent-clipped=0.0 +2024-01-15 20:42:48,131 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.attention_skip_rate, batch_count=88143.33333333333, ans=0.0 +2024-01-15 20:42:57,023 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=384, metric=20.93 vs. limit=22.5 +2024-01-15 20:43:19,244 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer1.prob, batch_count=88210.0, ans=0.125 +2024-01-15 20:43:24,320 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=9.08 vs. limit=15.0 +2024-01-15 20:43:26,160 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 20:43:33,704 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer1.prob, batch_count=88243.33333333333, ans=0.125 +2024-01-15 20:43:36,473 INFO [train.py:994] (1/2) Epoch 32, batch 350, loss[loss=0.1477, simple_loss=0.2332, pruned_loss=0.03108, over 24470.00 frames. ], tot_loss[loss=0.1447, simple_loss=0.2247, pruned_loss=0.03236, over 3967759.75 frames. ], batch size: 267, lr: 1.30e-02, grad_scale: 16.0 +2024-01-15 20:43:41,987 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=512, metric=15.78 vs. limit=22.5 +2024-01-15 20:44:08,426 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=14.58 vs. limit=15.0 +2024-01-15 20:44:38,792 INFO [train.py:994] (1/2) Epoch 32, batch 400, loss[loss=0.1359, simple_loss=0.2185, pruned_loss=0.02662, over 24433.00 frames. ], tot_loss[loss=0.1446, simple_loss=0.2243, pruned_loss=0.03241, over 4146462.32 frames. ], batch size: 250, lr: 1.30e-02, grad_scale: 32.0 +2024-01-15 20:44:39,081 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.balancer.prob, batch_count=88443.33333333333, ans=0.125 +2024-01-15 20:44:41,475 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer1.max_abs, batch_count=88443.33333333333, ans=10.0 +2024-01-15 20:44:48,124 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=2.97 vs. limit=6.0 +2024-01-15 20:44:49,976 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.376e+02 1.793e+02 1.951e+02 2.365e+02 3.576e+02, threshold=3.902e+02, percent-clipped=0.0 +2024-01-15 20:44:51,480 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer1.max_abs, batch_count=88476.66666666667, ans=10.0 +2024-01-15 20:45:15,448 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=16.97 vs. 
limit=22.5 +2024-01-15 20:45:25,836 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=88543.33333333333, ans=0.1 +2024-01-15 20:45:31,022 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.conv_module2.whiten, num_groups=1, num_channels=512, metric=16.10 vs. limit=15.0 +2024-01-15 20:45:33,037 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=88576.66666666667, ans=0.0 +2024-01-15 20:45:39,552 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer1.prob, batch_count=88610.0, ans=0.125 +2024-01-15 20:45:40,454 INFO [train.py:994] (1/2) Epoch 32, batch 450, loss[loss=0.1415, simple_loss=0.218, pruned_loss=0.03252, over 24558.00 frames. ], tot_loss[loss=0.145, simple_loss=0.2247, pruned_loss=0.03263, over 4289648.97 frames. ], batch size: 236, lr: 1.30e-02, grad_scale: 16.0 +2024-01-15 20:45:58,465 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=6.16 vs. limit=15.0 +2024-01-15 20:46:37,492 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.conv_module2.whiten, num_groups=1, num_channels=512, metric=7.05 vs. limit=15.0 +2024-01-15 20:46:42,864 INFO [train.py:994] (1/2) Epoch 32, batch 500, loss[loss=0.1311, simple_loss=0.2116, pruned_loss=0.02532, over 24307.00 frames. ], tot_loss[loss=0.1449, simple_loss=0.2245, pruned_loss=0.03261, over 4404887.65 frames. ], batch size: 147, lr: 1.30e-02, grad_scale: 16.0 +2024-01-15 20:46:52,660 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.5.prob, batch_count=88776.66666666667, ans=0.125 +2024-01-15 20:46:52,817 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.attention_skip_rate, batch_count=88776.66666666667, ans=0.0 +2024-01-15 20:46:53,666 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.476e+02 1.760e+02 2.034e+02 2.346e+02 3.813e+02, threshold=4.069e+02, percent-clipped=0.0 +2024-01-15 20:47:04,301 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.8.prob, batch_count=88810.0, ans=0.125 +2024-01-15 20:47:30,889 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=4.28 vs. limit=6.0 +2024-01-15 20:47:34,057 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.hidden_balancer.prob, batch_count=88910.0, ans=0.125 +2024-01-15 20:47:35,663 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.whiten, num_groups=1, num_channels=512, metric=6.56 vs. limit=12.0 +2024-01-15 20:47:37,634 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=88910.0, ans=0.1 +2024-01-15 20:47:42,417 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=88910.0, ans=0.1 +2024-01-15 20:47:42,850 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=384, metric=7.79 vs. limit=15.0 +2024-01-15 20:47:44,584 INFO [train.py:994] (1/2) Epoch 32, batch 550, loss[loss=0.1263, simple_loss=0.2036, pruned_loss=0.02445, over 23944.00 frames. 
], tot_loss[loss=0.1447, simple_loss=0.2245, pruned_loss=0.03246, over 4500717.25 frames. ], batch size: 131, lr: 1.30e-02, grad_scale: 16.0 +2024-01-15 20:47:48,343 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.54 vs. limit=6.0 +2024-01-15 20:47:54,895 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=21.87 vs. limit=22.5 +2024-01-15 20:47:59,656 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=5.82 vs. limit=15.0 +2024-01-15 20:48:19,334 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.whiten, num_groups=1, num_channels=192, metric=4.26 vs. limit=12.0 +2024-01-15 20:48:23,319 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=89043.33333333333, ans=0.0 +2024-01-15 20:48:46,600 INFO [train.py:994] (1/2) Epoch 32, batch 600, loss[loss=0.1462, simple_loss=0.229, pruned_loss=0.03172, over 24315.00 frames. ], tot_loss[loss=0.1448, simple_loss=0.2249, pruned_loss=0.0323, over 4580034.55 frames. ], batch size: 285, lr: 1.30e-02, grad_scale: 16.0 +2024-01-15 20:48:51,573 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=89110.0, ans=0.125 +2024-01-15 20:48:54,004 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer1.prob, batch_count=89110.0, ans=0.125 +2024-01-15 20:48:57,409 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.489e+02 1.784e+02 2.002e+02 2.290e+02 4.373e+02, threshold=4.003e+02, percent-clipped=1.0 +2024-01-15 20:48:57,683 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=89143.33333333333, ans=0.1 +2024-01-15 20:48:59,047 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 20:49:00,042 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.attention_skip_rate, batch_count=89143.33333333333, ans=0.0 +2024-01-15 20:49:05,910 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=89143.33333333333, ans=0.0 +2024-01-15 20:49:31,090 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.out_combiner.scale_min, batch_count=89210.0, ans=0.2 +2024-01-15 20:49:38,048 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=384, metric=20.93 vs. limit=22.5 +2024-01-15 20:49:38,822 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer2.prob, batch_count=89243.33333333333, ans=0.125 +2024-01-15 20:49:48,089 INFO [train.py:994] (1/2) Epoch 32, batch 650, loss[loss=0.1435, simple_loss=0.2252, pruned_loss=0.03087, over 24361.00 frames. ], tot_loss[loss=0.1446, simple_loss=0.2248, pruned_loss=0.03221, over 4637459.08 frames. ], batch size: 298, lr: 1.29e-02, grad_scale: 16.0 +2024-01-15 20:50:02,935 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=12.71 vs. 
limit=15.0 +2024-01-15 20:50:15,315 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=20.61 vs. limit=22.5 +2024-01-15 20:50:30,128 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer1.prob, batch_count=89376.66666666667, ans=0.125 +2024-01-15 20:50:40,804 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_skip_rate, batch_count=89410.0, ans=0.0 +2024-01-15 20:50:45,014 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff2_skip_rate, batch_count=89410.0, ans=0.0 +2024-01-15 20:50:48,641 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=89410.0, ans=0.125 +2024-01-15 20:50:51,335 INFO [train.py:994] (1/2) Epoch 32, batch 700, loss[loss=0.1444, simple_loss=0.227, pruned_loss=0.03084, over 24507.00 frames. ], tot_loss[loss=0.1442, simple_loss=0.2243, pruned_loss=0.03207, over 4668940.71 frames. ], batch size: 229, lr: 1.29e-02, grad_scale: 16.0 +2024-01-15 20:50:51,596 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass_mid.scale_min, batch_count=89443.33333333333, ans=0.2 +2024-01-15 20:51:01,837 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.524e+02 1.726e+02 1.881e+02 2.158e+02 2.890e+02, threshold=3.763e+02, percent-clipped=0.0 +2024-01-15 20:51:14,577 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass_mid.scale_min, batch_count=89510.0, ans=0.2 +2024-01-15 20:51:19,960 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=89510.0, ans=0.125 +2024-01-15 20:51:22,433 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_skip_rate, batch_count=89510.0, ans=0.0 +2024-01-15 20:51:29,411 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=4.35 vs. limit=12.0 +2024-01-15 20:51:41,061 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=192, metric=9.67 vs. limit=15.0 +2024-01-15 20:51:46,918 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=89576.66666666667, ans=0.1 +2024-01-15 20:51:52,571 INFO [train.py:994] (1/2) Epoch 32, batch 750, loss[loss=0.1395, simple_loss=0.2184, pruned_loss=0.03025, over 24470.00 frames. ], tot_loss[loss=0.1444, simple_loss=0.2244, pruned_loss=0.03213, over 4702088.74 frames. ], batch size: 267, lr: 1.29e-02, grad_scale: 16.0 +2024-01-15 20:51:53,180 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=9.89 vs. limit=15.0 +2024-01-15 20:51:54,653 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=192, metric=7.64 vs. 
limit=15.0 +2024-01-15 20:51:55,251 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff2_skip_rate, batch_count=89610.0, ans=0.0 +2024-01-15 20:51:58,584 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=89610.0, ans=0.1 +2024-01-15 20:52:01,747 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=89610.0, ans=0.1 +2024-01-15 20:52:14,605 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=6.69 vs. limit=15.0 +2024-01-15 20:52:37,661 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer1.prob, batch_count=89710.0, ans=0.125 +2024-01-15 20:52:43,755 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer1.max_abs, batch_count=89743.33333333333, ans=10.0 +2024-01-15 20:52:49,990 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn2.whiten, num_groups=1, num_channels=192, metric=14.31 vs. limit=22.5 +2024-01-15 20:52:52,478 INFO [train.py:994] (1/2) Epoch 32, batch 800, loss[loss=0.1332, simple_loss=0.2091, pruned_loss=0.02864, over 23987.00 frames. ], tot_loss[loss=0.1443, simple_loss=0.2243, pruned_loss=0.03214, over 4718643.92 frames. ], batch size: 131, lr: 1.29e-02, grad_scale: 32.0 +2024-01-15 20:53:02,382 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.514e+02 1.755e+02 1.980e+02 2.439e+02 4.753e+02, threshold=3.960e+02, percent-clipped=3.0 +2024-01-15 20:53:03,799 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=89810.0, ans=0.1 +2024-01-15 20:53:18,966 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=89843.33333333333, ans=0.1 +2024-01-15 20:53:39,093 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=89910.0, ans=0.0 +2024-01-15 20:53:58,011 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=89920.0, ans=0.1 +2024-01-15 20:54:02,955 INFO [train.py:994] (1/2) Epoch 33, batch 0, loss[loss=0.1448, simple_loss=0.2263, pruned_loss=0.03162, over 24419.00 frames. ], tot_loss[loss=0.1448, simple_loss=0.2263, pruned_loss=0.03162, over 24419.00 frames. ], batch size: 258, lr: 1.27e-02, grad_scale: 32.0 +2024-01-15 20:54:02,955 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 20:54:23,935 INFO [train.py:1026] (1/2) Epoch 33, validation: loss=0.1663, simple_loss=0.2485, pruned_loss=0.04203, over 1622729.00 frames. +2024-01-15 20:54:23,936 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 20:54:30,831 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.attention_skip_rate, batch_count=89920.0, ans=0.0 +2024-01-15 20:54:40,481 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=11.30 vs. 
limit=15.0 +2024-01-15 20:54:46,000 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff3_skip_rate, batch_count=89953.33333333333, ans=0.0 +2024-01-15 20:54:53,397 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=89986.66666666667, ans=0.1 +2024-01-15 20:55:01,952 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer1.max_abs, batch_count=90020.0, ans=10.0 +2024-01-15 20:55:06,798 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=90020.0, ans=0.1 +2024-01-15 20:55:14,420 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=90053.33333333333, ans=0.1 +2024-01-15 20:55:20,107 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=13.93 vs. limit=15.0 +2024-01-15 20:55:23,238 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=90053.33333333333, ans=0.1 +2024-01-15 20:55:25,620 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer2.prob, batch_count=90086.66666666667, ans=0.125 +2024-01-15 20:55:26,533 INFO [train.py:994] (1/2) Epoch 33, batch 50, loss[loss=0.1477, simple_loss=0.2257, pruned_loss=0.03489, over 24537.00 frames. ], tot_loss[loss=0.1426, simple_loss=0.2229, pruned_loss=0.03114, over 1092004.36 frames. ], batch size: 236, lr: 1.27e-02, grad_scale: 32.0 +2024-01-15 20:55:26,811 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff3_skip_rate, batch_count=90086.66666666667, ans=0.0 +2024-01-15 20:55:31,552 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff3_skip_rate, batch_count=90086.66666666667, ans=0.0 +2024-01-15 20:55:44,351 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=384, metric=4.83 vs. limit=15.0 +2024-01-15 20:55:46,545 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.417e+02 1.799e+02 2.020e+02 2.508e+02 4.258e+02, threshold=4.039e+02, percent-clipped=1.0 +2024-01-15 20:55:47,202 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.whiten, num_groups=1, num_channels=512, metric=4.81 vs. limit=12.0 +2024-01-15 20:55:49,107 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=90120.0, ans=0.0 +2024-01-15 20:55:52,732 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.balancer1.prob, batch_count=90153.33333333333, ans=0.125 +2024-01-15 20:55:54,255 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=3.09 vs. limit=15.0 +2024-01-15 20:55:56,450 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=90153.33333333333, ans=0.125 +2024-01-15 20:55:56,949 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=5.98 vs. 
limit=10.0 +2024-01-15 20:56:12,103 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=10.94 vs. limit=15.0 +2024-01-15 20:56:27,946 INFO [train.py:994] (1/2) Epoch 33, batch 100, loss[loss=0.1462, simple_loss=0.2256, pruned_loss=0.0334, over 24605.00 frames. ], tot_loss[loss=0.1426, simple_loss=0.2221, pruned_loss=0.03148, over 1903496.90 frames. ], batch size: 199, lr: 1.27e-02, grad_scale: 32.0 +2024-01-15 20:56:31,220 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer1.prob, batch_count=90253.33333333333, ans=0.125 +2024-01-15 20:56:38,877 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=90253.33333333333, ans=0.0 +2024-01-15 20:56:42,870 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=384, metric=19.57 vs. limit=22.5 +2024-01-15 20:56:43,545 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward2.hidden_balancer.prob, batch_count=90286.66666666667, ans=0.125 +2024-01-15 20:56:48,511 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.07 vs. limit=6.0 +2024-01-15 20:57:06,983 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=90353.33333333333, ans=0.0 +2024-01-15 20:57:15,807 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=512, metric=13.12 vs. limit=22.5 +2024-01-15 20:57:18,876 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff3_skip_rate, batch_count=90386.66666666667, ans=0.0 +2024-01-15 20:57:30,420 INFO [train.py:994] (1/2) Epoch 33, batch 150, loss[loss=0.1271, simple_loss=0.1986, pruned_loss=0.02779, over 23445.00 frames. ], tot_loss[loss=0.1423, simple_loss=0.2223, pruned_loss=0.03115, over 2559674.37 frames. ], batch size: 119, lr: 1.27e-02, grad_scale: 32.0 +2024-01-15 20:57:47,024 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.self_attn2.whiten.whitening_limit, batch_count=90453.33333333333, ans=22.5 +2024-01-15 20:57:48,354 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.whiten, num_groups=1, num_channels=192, metric=4.71 vs. limit=12.0 +2024-01-15 20:57:49,750 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.457e+02 1.876e+02 2.056e+02 2.342e+02 3.508e+02, threshold=4.112e+02, percent-clipped=0.0 +2024-01-15 20:58:13,823 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff3_skip_rate, batch_count=90520.0, ans=0.0 +2024-01-15 20:58:20,503 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer2.prob, batch_count=90553.33333333333, ans=0.125 +2024-01-15 20:58:21,754 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=90553.33333333333, ans=0.125 +2024-01-15 20:58:31,946 INFO [train.py:994] (1/2) Epoch 33, batch 200, loss[loss=0.1375, simple_loss=0.2167, pruned_loss=0.02912, over 24476.00 frames. ], tot_loss[loss=0.143, simple_loss=0.2232, pruned_loss=0.03137, over 3061214.57 frames. 
], batch size: 187, lr: 1.27e-02, grad_scale: 32.0 +2024-01-15 20:58:38,581 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=90586.66666666667, ans=0.1 +2024-01-15 20:58:44,994 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer1.prob, batch_count=90620.0, ans=0.125 +2024-01-15 20:59:09,206 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_skip_rate, batch_count=90686.66666666667, ans=0.0 +2024-01-15 20:59:13,161 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=5.78 vs. limit=15.0 +2024-01-15 20:59:13,971 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.self_attn_weights.pos_emb_skip_rate, batch_count=90686.66666666667, ans=0.0 +2024-01-15 20:59:27,564 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer2.prob, batch_count=90720.0, ans=0.125 +2024-01-15 20:59:34,955 INFO [train.py:994] (1/2) Epoch 33, batch 250, loss[loss=0.1446, simple_loss=0.2235, pruned_loss=0.03288, over 24511.00 frames. ], tot_loss[loss=0.1435, simple_loss=0.2238, pruned_loss=0.0316, over 3458016.92 frames. ], batch size: 216, lr: 1.27e-02, grad_scale: 32.0 +2024-01-15 20:59:54,427 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.486e+02 1.800e+02 1.980e+02 2.341e+02 4.603e+02, threshold=3.961e+02, percent-clipped=1.0 +2024-01-15 21:00:09,420 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.out_combiner.scale_min, batch_count=90820.0, ans=0.2 +2024-01-15 21:00:16,365 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.ff2_skip_rate, batch_count=90853.33333333333, ans=0.0 +2024-01-15 21:00:22,410 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=5.60 vs. limit=15.0 +2024-01-15 21:00:24,209 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=90886.66666666667, ans=0.0 +2024-01-15 21:00:34,336 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_skip_rate, batch_count=90886.66666666667, ans=0.0 +2024-01-15 21:00:36,338 INFO [train.py:994] (1/2) Epoch 33, batch 300, loss[loss=0.1547, simple_loss=0.2381, pruned_loss=0.03563, over 24550.00 frames. ], tot_loss[loss=0.1436, simple_loss=0.224, pruned_loss=0.03167, over 3745664.51 frames. 
], batch size: 243, lr: 1.27e-02, grad_scale: 32.0 +2024-01-15 21:00:43,871 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.attention_skip_rate, batch_count=90920.0, ans=0.0 +2024-01-15 21:00:45,718 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass_mid.scale_min, batch_count=90920.0, ans=0.2 +2024-01-15 21:00:46,899 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 21:00:54,552 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.out_combiner.scale_min, batch_count=90953.33333333333, ans=0.2 +2024-01-15 21:01:00,346 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.scale_min, batch_count=90986.66666666667, ans=0.2 +2024-01-15 21:01:00,358 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.min_abs, batch_count=90986.66666666667, ans=0.5 +2024-01-15 21:01:05,196 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=15.12 vs. limit=22.5 +2024-01-15 21:01:08,626 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer1.min_positive, batch_count=90986.66666666667, ans=0.025 +2024-01-15 21:01:22,469 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=91020.0, ans=0.0 +2024-01-15 21:01:24,123 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=8.48 vs. limit=10.0 +2024-01-15 21:01:30,153 INFO [scaling.py:1022] (1/2) Whitening: name=encoder_embed.out_whiten, num_groups=1, num_channels=192, metric=7.16 vs. limit=8.0 +2024-01-15 21:01:37,844 INFO [train.py:994] (1/2) Epoch 33, batch 350, loss[loss=0.1481, simple_loss=0.2338, pruned_loss=0.03122, over 24474.00 frames. ], tot_loss[loss=0.1437, simple_loss=0.2239, pruned_loss=0.03174, over 3976344.02 frames. ], batch size: 222, lr: 1.26e-02, grad_scale: 32.0 +2024-01-15 21:01:50,444 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass.skip_rate, batch_count=91120.0, ans=0.035 +2024-01-15 21:01:57,158 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.376e+02 1.815e+02 2.000e+02 2.389e+02 3.512e+02, threshold=3.999e+02, percent-clipped=0.0 +2024-01-15 21:01:57,800 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=20.55 vs. limit=15.0 +2024-01-15 21:02:10,400 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.8.prob, batch_count=91153.33333333333, ans=0.125 +2024-01-15 21:02:18,339 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=192, metric=7.37 vs. 
limit=15.0 +2024-01-15 21:02:18,864 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=91186.66666666667, ans=0.1 +2024-01-15 21:02:31,527 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.whiten2.whitening_limit, batch_count=91220.0, ans=15.0 +2024-01-15 21:02:39,663 INFO [train.py:994] (1/2) Epoch 33, batch 400, loss[loss=0.1493, simple_loss=0.2292, pruned_loss=0.03469, over 24591.00 frames. ], tot_loss[loss=0.1436, simple_loss=0.2238, pruned_loss=0.03176, over 4163683.37 frames. ], batch size: 199, lr: 1.26e-02, grad_scale: 32.0 +2024-01-15 21:02:44,633 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=91253.33333333333, ans=0.0 +2024-01-15 21:03:02,307 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=91286.66666666667, ans=0.125 +2024-01-15 21:03:23,791 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=512, metric=16.30 vs. limit=22.5 +2024-01-15 21:03:25,589 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.ff3_skip_rate, batch_count=91353.33333333333, ans=0.0 +2024-01-15 21:03:41,391 INFO [train.py:994] (1/2) Epoch 33, batch 450, loss[loss=0.1475, simple_loss=0.2243, pruned_loss=0.03534, over 24346.00 frames. ], tot_loss[loss=0.1437, simple_loss=0.2238, pruned_loss=0.03185, over 4313889.31 frames. ], batch size: 153, lr: 1.26e-02, grad_scale: 32.0 +2024-01-15 21:03:41,686 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=91420.0, ans=0.1 +2024-01-15 21:03:45,727 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=91420.0, ans=0.0 +2024-01-15 21:03:49,321 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=91420.0, ans=0.0 +2024-01-15 21:03:51,678 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer1.prob, batch_count=91420.0, ans=0.125 +2024-01-15 21:04:02,603 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.480e+02 1.828e+02 2.096e+02 2.286e+02 3.771e+02, threshold=4.192e+02, percent-clipped=0.0 +2024-01-15 21:04:25,510 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff2_skip_rate, batch_count=91520.0, ans=0.0 +2024-01-15 21:04:25,575 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff3_skip_rate, batch_count=91520.0, ans=0.0 +2024-01-15 21:04:43,191 INFO [train.py:994] (1/2) Epoch 33, batch 500, loss[loss=0.1566, simple_loss=0.2325, pruned_loss=0.04031, over 24485.00 frames. ], tot_loss[loss=0.1431, simple_loss=0.2233, pruned_loss=0.03142, over 4426065.51 frames. ], batch size: 216, lr: 1.26e-02, grad_scale: 32.0 +2024-01-15 21:04:52,547 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=91586.66666666667, ans=0.0 +2024-01-15 21:04:52,887 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=10.07 vs. 
limit=15.0 +2024-01-15 21:05:27,037 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward2.hidden_balancer.prob, batch_count=91686.66666666667, ans=0.125 +2024-01-15 21:05:45,239 INFO [train.py:994] (1/2) Epoch 33, batch 550, loss[loss=0.15, simple_loss=0.2342, pruned_loss=0.0329, over 24524.00 frames. ], tot_loss[loss=0.1428, simple_loss=0.2231, pruned_loss=0.03119, over 4515421.96 frames. ], batch size: 193, lr: 1.26e-02, grad_scale: 32.0 +2024-01-15 21:05:54,937 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=512, metric=15.18 vs. limit=22.5 +2024-01-15 21:05:55,156 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.whiten, num_groups=1, num_channels=192, metric=3.80 vs. limit=12.0 +2024-01-15 21:06:05,731 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.prob, batch_count=91786.66666666667, ans=0.125 +2024-01-15 21:06:06,611 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.471e+02 1.687e+02 1.895e+02 2.179e+02 3.230e+02, threshold=3.789e+02, percent-clipped=0.0 +2024-01-15 21:06:10,342 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=91820.0, ans=0.1 +2024-01-15 21:06:23,885 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=91853.33333333333, ans=0.0 +2024-01-15 21:06:35,726 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.whiten, num_groups=1, num_channels=384, metric=3.83 vs. limit=12.0 +2024-01-15 21:06:47,475 INFO [train.py:994] (1/2) Epoch 33, batch 600, loss[loss=0.1419, simple_loss=0.2258, pruned_loss=0.02901, over 24480.00 frames. ], tot_loss[loss=0.1431, simple_loss=0.2237, pruned_loss=0.03127, over 4592172.73 frames. ], batch size: 267, lr: 1.26e-02, grad_scale: 32.0 +2024-01-15 21:07:14,953 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=91986.66666666667, ans=0.0 +2024-01-15 21:07:23,774 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer2.prob, batch_count=92020.0, ans=0.125 +2024-01-15 21:07:26,049 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff3_skip_rate, batch_count=92020.0, ans=0.0 +2024-01-15 21:07:45,016 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=92053.33333333333, ans=0.125 +2024-01-15 21:07:49,417 INFO [train.py:994] (1/2) Epoch 33, batch 650, loss[loss=0.1577, simple_loss=0.2348, pruned_loss=0.04032, over 24529.00 frames. ], tot_loss[loss=0.1428, simple_loss=0.2231, pruned_loss=0.03127, over 4638900.80 frames. 
], batch size: 204, lr: 1.26e-02, grad_scale: 32.0 +2024-01-15 21:08:00,922 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.out_balancer.prob, batch_count=92120.0, ans=0.125 +2024-01-15 21:08:09,800 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module2.balancer2.prob, batch_count=92120.0, ans=0.125 +2024-01-15 21:08:10,670 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.572e+02 1.688e+02 1.875e+02 2.082e+02 2.904e+02, threshold=3.750e+02, percent-clipped=0.0 +2024-01-15 21:08:20,992 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn1.whiten, num_groups=1, num_channels=192, metric=12.01 vs. limit=22.5 +2024-01-15 21:08:51,407 INFO [train.py:994] (1/2) Epoch 33, batch 700, loss[loss=0.1378, simple_loss=0.2267, pruned_loss=0.02443, over 24368.00 frames. ], tot_loss[loss=0.1428, simple_loss=0.2231, pruned_loss=0.03129, over 4671729.78 frames. ], batch size: 298, lr: 1.26e-02, grad_scale: 32.0 +2024-01-15 21:08:52,831 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=92253.33333333333, ans=0.125 +2024-01-15 21:08:54,293 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.conv_module2.whiten, num_groups=1, num_channels=512, metric=9.55 vs. limit=15.0 +2024-01-15 21:09:02,519 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.nonlin_attention.balancer.prob, batch_count=92286.66666666667, ans=0.125 +2024-01-15 21:09:07,196 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.skip_rate, batch_count=92286.66666666667, ans=0.07 +2024-01-15 21:09:53,364 INFO [train.py:994] (1/2) Epoch 33, batch 750, loss[loss=0.1338, simple_loss=0.216, pruned_loss=0.02586, over 24181.00 frames. ], tot_loss[loss=0.1433, simple_loss=0.2239, pruned_loss=0.03138, over 4700738.02 frames. ], batch size: 140, lr: 1.26e-02, grad_scale: 32.0 +2024-01-15 21:09:59,900 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff3_skip_rate, batch_count=92420.0, ans=0.0 +2024-01-15 21:09:59,939 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=92420.0, ans=0.125 +2024-01-15 21:10:04,980 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=5.33 vs. limit=15.0 +2024-01-15 21:10:06,125 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=10.85 vs. limit=15.0 +2024-01-15 21:10:13,744 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.344e+02 1.781e+02 1.903e+02 2.114e+02 3.493e+02, threshold=3.806e+02, percent-clipped=0.0 +2024-01-15 21:10:23,060 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass_mid.scale_min, batch_count=92486.66666666667, ans=0.2 +2024-01-15 21:10:43,688 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=92553.33333333333, ans=0.1 +2024-01-15 21:10:52,263 INFO [train.py:994] (1/2) Epoch 33, batch 800, loss[loss=0.1388, simple_loss=0.2197, pruned_loss=0.02895, over 24593.00 frames. ], tot_loss[loss=0.1434, simple_loss=0.2237, pruned_loss=0.03154, over 4718560.77 frames. 
], batch size: 199, lr: 1.25e-02, grad_scale: 32.0 +2024-01-15 21:10:55,339 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.conv_module2.whiten, num_groups=1, num_channels=192, metric=3.17 vs. limit=15.0 +2024-01-15 21:11:07,829 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=5.31 vs. limit=6.0 +2024-01-15 21:11:26,233 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff2_skip_rate, batch_count=92686.66666666667, ans=0.0 +2024-01-15 21:11:29,628 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=92686.66666666667, ans=0.1 +2024-01-15 21:12:03,067 INFO [train.py:994] (1/2) Epoch 34, batch 0, loss[loss=0.1376, simple_loss=0.2221, pruned_loss=0.02652, over 24369.00 frames. ], tot_loss[loss=0.1376, simple_loss=0.2221, pruned_loss=0.02652, over 24369.00 frames. ], batch size: 275, lr: 1.24e-02, grad_scale: 32.0 +2024-01-15 21:12:03,068 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 21:12:23,926 INFO [train.py:1026] (1/2) Epoch 34, validation: loss=0.166, simple_loss=0.2489, pruned_loss=0.04151, over 1622729.00 frames. +2024-01-15 21:12:23,927 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 21:12:26,439 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=92730.0, ans=0.1 +2024-01-15 21:12:31,891 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=192, metric=11.02 vs. limit=15.0 +2024-01-15 21:12:40,890 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer1.prob, batch_count=92763.33333333333, ans=0.125 +2024-01-15 21:12:52,820 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.566e+02 1.787e+02 1.980e+02 2.213e+02 3.780e+02, threshold=3.960e+02, percent-clipped=0.0 +2024-01-15 21:13:08,241 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff3_skip_rate, batch_count=92830.0, ans=0.0 +2024-01-15 21:13:24,347 INFO [train.py:994] (1/2) Epoch 34, batch 50, loss[loss=0.121, simple_loss=0.1952, pruned_loss=0.02341, over 23555.00 frames. ], tot_loss[loss=0.1412, simple_loss=0.2213, pruned_loss=0.03058, over 1088429.30 frames. ], batch size: 119, lr: 1.24e-02, grad_scale: 32.0 +2024-01-15 21:13:25,988 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=256, metric=14.55 vs. limit=22.5 +2024-01-15 21:13:42,208 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff2_skip_rate, batch_count=92930.0, ans=0.0 +2024-01-15 21:13:49,344 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer2.prob, batch_count=92963.33333333333, ans=0.125 +2024-01-15 21:13:56,406 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass_mid.scale_min, batch_count=92963.33333333333, ans=0.2 +2024-01-15 21:14:00,799 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=14.58 vs. 
limit=15.0 +2024-01-15 21:14:04,032 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass_mid.scale_min, batch_count=92996.66666666667, ans=0.2 +2024-01-15 21:14:25,657 INFO [train.py:994] (1/2) Epoch 34, batch 100, loss[loss=0.1234, simple_loss=0.2023, pruned_loss=0.02222, over 23949.00 frames. ], tot_loss[loss=0.1407, simple_loss=0.2211, pruned_loss=0.03018, over 1907212.45 frames. ], batch size: 131, lr: 1.23e-02, grad_scale: 32.0 +2024-01-15 21:14:31,937 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.ff3_skip_rate, batch_count=93063.33333333333, ans=0.0 +2024-01-15 21:14:37,878 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=93096.66666666667, ans=0.1 +2024-01-15 21:14:41,449 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.scale_min, batch_count=93096.66666666667, ans=0.2 +2024-01-15 21:14:49,885 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=2.94 vs. limit=10.0 +2024-01-15 21:14:56,458 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.468e+02 1.672e+02 1.813e+02 2.051e+02 3.232e+02, threshold=3.625e+02, percent-clipped=0.0 +2024-01-15 21:14:59,575 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=9.38 vs. limit=15.0 +2024-01-15 21:15:06,341 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer2.prob, batch_count=93163.33333333333, ans=0.125 +2024-01-15 21:15:28,397 INFO [train.py:994] (1/2) Epoch 34, batch 150, loss[loss=0.1443, simple_loss=0.2202, pruned_loss=0.03416, over 24408.00 frames. ], tot_loss[loss=0.1416, simple_loss=0.222, pruned_loss=0.03055, over 2559310.27 frames. ], batch size: 258, lr: 1.23e-02, grad_scale: 16.0 +2024-01-15 21:15:54,474 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.5.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 21:16:15,929 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=93330.0, ans=0.1 +2024-01-15 21:16:21,997 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.bypass.skip_rate, batch_count=93363.33333333333, ans=0.07 +2024-01-15 21:16:24,329 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer2.prob, batch_count=93363.33333333333, ans=0.125 +2024-01-15 21:16:31,447 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=93396.66666666667, ans=0.125 +2024-01-15 21:16:32,299 INFO [train.py:994] (1/2) Epoch 34, batch 200, loss[loss=0.142, simple_loss=0.2238, pruned_loss=0.03012, over 24190.00 frames. ], tot_loss[loss=0.1421, simple_loss=0.2225, pruned_loss=0.03081, over 3062212.06 frames. 
], batch size: 140, lr: 1.23e-02, grad_scale: 16.0 +2024-01-15 21:16:35,635 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff2_skip_rate, batch_count=93396.66666666667, ans=0.0 +2024-01-15 21:16:36,690 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.prob, batch_count=93396.66666666667, ans=0.125 +2024-01-15 21:16:40,213 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass.skip_rate, batch_count=93396.66666666667, ans=0.035 +2024-01-15 21:16:56,174 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.scale_min, batch_count=93463.33333333333, ans=0.2 +2024-01-15 21:16:57,577 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.08 vs. limit=6.0 +2024-01-15 21:17:03,583 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.424e+02 1.716e+02 2.028e+02 2.257e+02 3.544e+02, threshold=4.056e+02, percent-clipped=0.0 +2024-01-15 21:17:07,361 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff2_skip_rate, batch_count=93463.33333333333, ans=0.0 +2024-01-15 21:17:22,372 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=93530.0, ans=0.125 +2024-01-15 21:17:29,070 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=93530.0, ans=0.125 +2024-01-15 21:17:31,532 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward3.hidden_balancer.prob, batch_count=93530.0, ans=0.125 +2024-01-15 21:17:31,570 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff2_skip_rate, batch_count=93530.0, ans=0.0 +2024-01-15 21:17:35,379 INFO [train.py:994] (1/2) Epoch 34, batch 250, loss[loss=0.1497, simple_loss=0.2308, pruned_loss=0.03433, over 24457.00 frames. ], tot_loss[loss=0.1424, simple_loss=0.2227, pruned_loss=0.03105, over 3444080.41 frames. ], batch size: 222, lr: 1.23e-02, grad_scale: 16.0 +2024-01-15 21:17:49,835 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=93596.66666666667, ans=0.1 +2024-01-15 21:17:53,910 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff2_skip_rate, batch_count=93596.66666666667, ans=0.0 +2024-01-15 21:17:55,517 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn1.whiten, num_groups=1, num_channels=192, metric=11.99 vs. 
limit=22.5 +2024-01-15 21:17:58,470 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer1.max_abs, batch_count=93630.0, ans=10.0 +2024-01-15 21:18:10,736 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=93663.33333333333, ans=0.125 +2024-01-15 21:18:17,823 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer2.prob, batch_count=93663.33333333333, ans=0.125 +2024-01-15 21:18:25,589 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff2_skip_rate, batch_count=93696.66666666667, ans=0.0 +2024-01-15 21:18:25,878 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=4.19 vs. limit=15.0 +2024-01-15 21:18:28,351 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=5.22 vs. limit=10.0 +2024-01-15 21:18:36,397 INFO [train.py:994] (1/2) Epoch 34, batch 300, loss[loss=0.138, simple_loss=0.2214, pruned_loss=0.02736, over 24374.00 frames. ], tot_loss[loss=0.1419, simple_loss=0.2223, pruned_loss=0.03071, over 3749262.05 frames. ], batch size: 275, lr: 1.23e-02, grad_scale: 16.0 +2024-01-15 21:19:04,706 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff2_skip_rate, batch_count=93796.66666666667, ans=0.0 +2024-01-15 21:19:07,348 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.570e+02 1.906e+02 2.356e+02 3.034e+02 5.293e+02, threshold=4.711e+02, percent-clipped=5.0 +2024-01-15 21:19:37,990 INFO [train.py:994] (1/2) Epoch 34, batch 350, loss[loss=0.127, simple_loss=0.1878, pruned_loss=0.03309, over 17196.00 frames. ], tot_loss[loss=0.1415, simple_loss=0.2219, pruned_loss=0.03051, over 3972989.95 frames. ], batch size: 75, lr: 1.23e-02, grad_scale: 16.0 +2024-01-15 21:19:40,659 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=93896.66666666667, ans=0.125 +2024-01-15 21:20:06,657 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff2_skip_rate, batch_count=93963.33333333333, ans=0.0 +2024-01-15 21:20:11,446 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=93963.33333333333, ans=0.125 +2024-01-15 21:20:26,808 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff2_skip_rate, batch_count=94030.0, ans=0.0 +2024-01-15 21:20:29,252 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.5.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 21:20:40,061 INFO [train.py:994] (1/2) Epoch 34, batch 400, loss[loss=0.1461, simple_loss=0.2242, pruned_loss=0.03398, over 24449.00 frames. ], tot_loss[loss=0.1423, simple_loss=0.223, pruned_loss=0.03086, over 4169573.36 frames. 
], batch size: 170, lr: 1.23e-02, grad_scale: 32.0 +2024-01-15 21:20:46,882 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=94063.33333333333, ans=0.0 +2024-01-15 21:20:48,014 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.ff3_skip_rate, batch_count=94063.33333333333, ans=0.0 +2024-01-15 21:20:49,792 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=192, metric=6.24 vs. limit=15.0 +2024-01-15 21:21:06,637 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=94130.0, ans=0.125 +2024-01-15 21:21:09,137 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff2_skip_rate, batch_count=94130.0, ans=0.0 +2024-01-15 21:21:11,713 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.483e+02 1.754e+02 1.948e+02 2.280e+02 3.320e+02, threshold=3.895e+02, percent-clipped=0.0 +2024-01-15 21:21:12,083 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.attention_skip_rate, batch_count=94130.0, ans=0.0 +2024-01-15 21:21:24,942 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 21:21:39,283 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.out_combiner.scale_min, batch_count=94196.66666666667, ans=0.2 +2024-01-15 21:21:42,439 INFO [train.py:994] (1/2) Epoch 34, batch 450, loss[loss=0.1478, simple_loss=0.2279, pruned_loss=0.03383, over 24379.00 frames. ], tot_loss[loss=0.143, simple_loss=0.2236, pruned_loss=0.03117, over 4309544.22 frames. ], batch size: 275, lr: 1.23e-02, grad_scale: 32.0 +2024-01-15 21:21:59,125 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=512, metric=15.13 vs. limit=22.5 +2024-01-15 21:22:01,099 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=94263.33333333333, ans=0.125 +2024-01-15 21:22:03,385 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer2.prob, batch_count=94263.33333333333, ans=0.125 +2024-01-15 21:22:14,599 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.prob, batch_count=94296.66666666667, ans=0.125 +2024-01-15 21:22:28,549 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=3.33 vs. limit=15.0 +2024-01-15 21:22:35,879 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer1.prob, batch_count=94363.33333333333, ans=0.125 +2024-01-15 21:22:39,443 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=94363.33333333333, ans=0.1 +2024-01-15 21:22:43,987 INFO [train.py:994] (1/2) Epoch 34, batch 500, loss[loss=0.1545, simple_loss=0.2298, pruned_loss=0.03961, over 24490.00 frames. ], tot_loss[loss=0.1421, simple_loss=0.2226, pruned_loss=0.03078, over 4414896.09 frames. 
], batch size: 165, lr: 1.23e-02, grad_scale: 32.0 +2024-01-15 21:22:51,949 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff3_skip_rate, batch_count=94396.66666666667, ans=0.0 +2024-01-15 21:23:05,459 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=94430.0, ans=0.1 +2024-01-15 21:23:15,326 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.444e+02 1.724e+02 1.952e+02 2.350e+02 3.869e+02, threshold=3.904e+02, percent-clipped=0.0 +2024-01-15 21:23:20,482 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module1.balancer1.prob, batch_count=94496.66666666667, ans=0.125 +2024-01-15 21:23:36,055 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.prob, batch_count=94530.0, ans=0.125 +2024-01-15 21:23:37,348 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 21:23:45,973 INFO [train.py:994] (1/2) Epoch 34, batch 550, loss[loss=0.1448, simple_loss=0.2242, pruned_loss=0.0327, over 24482.00 frames. ], tot_loss[loss=0.1419, simple_loss=0.2222, pruned_loss=0.03078, over 4494499.41 frames. ], batch size: 187, lr: 1.23e-02, grad_scale: 32.0 +2024-01-15 21:23:55,213 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=94563.33333333333, ans=0.0 +2024-01-15 21:24:06,329 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.balancer1.prob, batch_count=94596.66666666667, ans=0.125 +2024-01-15 21:24:12,796 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.attention_skip_rate, batch_count=94630.0, ans=0.0 +2024-01-15 21:24:29,227 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=94663.33333333333, ans=0.125 +2024-01-15 21:24:34,529 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.ff2_skip_rate, batch_count=94696.66666666667, ans=0.0 +2024-01-15 21:24:38,738 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.bypass_mid.scale_min, batch_count=94696.66666666667, ans=0.2 +2024-01-15 21:24:47,660 INFO [train.py:994] (1/2) Epoch 34, batch 600, loss[loss=0.1344, simple_loss=0.2218, pruned_loss=0.02352, over 23886.00 frames. ], tot_loss[loss=0.1418, simple_loss=0.2223, pruned_loss=0.03068, over 4566723.58 frames. 
], batch size: 328, lr: 1.22e-02, grad_scale: 16.0 +2024-01-15 21:25:03,337 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=94763.33333333333, ans=0.1 +2024-01-15 21:25:09,296 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer2.prob, batch_count=94763.33333333333, ans=0.125 +2024-01-15 21:25:09,314 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer1.prob, batch_count=94763.33333333333, ans=0.125 +2024-01-15 21:25:19,619 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.368e+02 1.774e+02 2.106e+02 2.731e+02 3.699e+02, threshold=4.212e+02, percent-clipped=0.0 +2024-01-15 21:25:36,470 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer2.prob, batch_count=94863.33333333333, ans=0.125 +2024-01-15 21:25:49,521 INFO [train.py:994] (1/2) Epoch 34, batch 650, loss[loss=0.1589, simple_loss=0.2294, pruned_loss=0.04413, over 24550.00 frames. ], tot_loss[loss=0.142, simple_loss=0.2225, pruned_loss=0.03071, over 4633075.02 frames. ], batch size: 165, lr: 1.22e-02, grad_scale: 16.0 +2024-01-15 21:25:56,312 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer2.prob, batch_count=94896.66666666667, ans=0.125 +2024-01-15 21:26:10,004 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.whiten, num_groups=1, num_channels=384, metric=4.42 vs. limit=12.0 +2024-01-15 21:26:27,102 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass_mid.scale_min, batch_count=94996.66666666667, ans=0.2 +2024-01-15 21:26:29,456 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer1.prob, batch_count=94996.66666666667, ans=0.125 +2024-01-15 21:26:38,338 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn2.whiten, num_groups=1, num_channels=192, metric=13.12 vs. limit=22.5 +2024-01-15 21:26:41,797 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=95030.0, ans=0.1 +2024-01-15 21:26:46,483 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.3.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 21:26:51,485 INFO [train.py:994] (1/2) Epoch 34, batch 700, loss[loss=0.1475, simple_loss=0.2275, pruned_loss=0.0337, over 24593.00 frames. ], tot_loss[loss=0.142, simple_loss=0.2223, pruned_loss=0.03086, over 4668071.01 frames. 
], batch size: 199, lr: 1.22e-02, grad_scale: 16.0 +2024-01-15 21:26:57,718 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer2.prob, batch_count=95063.33333333333, ans=0.125 +2024-01-15 21:27:09,038 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.min_positive, batch_count=95096.66666666667, ans=0.05 +2024-01-15 21:27:21,228 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.attention_skip_rate, batch_count=95130.0, ans=0.0 +2024-01-15 21:27:23,403 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.377e+02 1.691e+02 1.887e+02 2.144e+02 3.212e+02, threshold=3.773e+02, percent-clipped=0.0 +2024-01-15 21:27:27,210 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=95163.33333333333, ans=0.125 +2024-01-15 21:27:36,013 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass_mid.scale_min, batch_count=95163.33333333333, ans=0.2 +2024-01-15 21:27:41,959 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.self_attn1.whiten.whitening_limit, batch_count=95196.66666666667, ans=22.5 +2024-01-15 21:27:52,604 INFO [train.py:994] (1/2) Epoch 34, batch 750, loss[loss=0.1426, simple_loss=0.226, pruned_loss=0.02955, over 24462.00 frames. ], tot_loss[loss=0.1418, simple_loss=0.2222, pruned_loss=0.03066, over 4703657.46 frames. ], batch size: 216, lr: 1.22e-02, grad_scale: 16.0 +2024-01-15 21:28:07,333 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=3.25 vs. limit=15.0 +2024-01-15 21:28:19,059 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.2.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 21:28:33,086 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.scale_min, batch_count=95330.0, ans=0.2 +2024-01-15 21:28:47,707 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.dropout.p, batch_count=95363.33333333333, ans=0.1 +2024-01-15 21:28:48,289 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=14.57 vs. limit=15.0 +2024-01-15 21:28:51,208 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=95363.33333333333, ans=0.1 +2024-01-15 21:28:53,183 INFO [train.py:994] (1/2) Epoch 34, batch 800, loss[loss=0.1376, simple_loss=0.2249, pruned_loss=0.02521, over 24367.00 frames. ], tot_loss[loss=0.1414, simple_loss=0.2218, pruned_loss=0.03051, over 4723061.43 frames. 
], batch size: 275, lr: 1.22e-02, grad_scale: 32.0 +2024-01-15 21:29:04,565 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.attention_skip_rate, batch_count=95430.0, ans=0.0 +2024-01-15 21:29:13,342 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer1.prob, batch_count=95430.0, ans=0.125 +2024-01-15 21:29:16,709 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 21:29:21,091 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.nonlin_attention.balancer.prob, batch_count=95463.33333333333, ans=0.125 +2024-01-15 21:29:23,107 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.457e+02 1.735e+02 1.860e+02 2.155e+02 3.923e+02, threshold=3.720e+02, percent-clipped=1.0 +2024-01-15 21:29:28,971 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.skip_rate, batch_count=95496.66666666667, ans=0.04949747468305833 +2024-01-15 21:30:03,693 INFO [train.py:994] (1/2) Epoch 35, batch 0, loss[loss=0.1419, simple_loss=0.2218, pruned_loss=0.031, over 24528.00 frames. ], tot_loss[loss=0.1419, simple_loss=0.2218, pruned_loss=0.031, over 24528.00 frames. ], batch size: 229, lr: 1.20e-02, grad_scale: 32.0 +2024-01-15 21:30:03,693 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 21:30:24,250 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.4.encoder.layers.2.self_attn_weights, attn_weights_entropy = tensor([2.3277, 3.2950, 3.4739, 3.4008], device='cuda:1') +2024-01-15 21:30:24,589 INFO [train.py:1026] (1/2) Epoch 35, validation: loss=0.1658, simple_loss=0.2479, pruned_loss=0.04189, over 1622729.00 frames. +2024-01-15 21:30:24,590 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 21:30:38,078 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module2.balancer2.prob, batch_count=95573.33333333333, ans=0.125 +2024-01-15 21:30:53,850 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff2_skip_rate, batch_count=95606.66666666667, ans=0.0 +2024-01-15 21:31:04,380 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=95640.0, ans=0.1 +2024-01-15 21:31:26,721 INFO [train.py:994] (1/2) Epoch 35, batch 50, loss[loss=0.1442, simple_loss=0.2287, pruned_loss=0.02987, over 24490.00 frames. ], tot_loss[loss=0.1401, simple_loss=0.2206, pruned_loss=0.02979, over 1095592.40 frames. ], batch size: 267, lr: 1.20e-02, grad_scale: 32.0 +2024-01-15 21:31:44,948 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=12.81 vs. 
limit=15.0 +2024-01-15 21:32:04,412 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.attention_skip_rate, batch_count=95806.66666666667, ans=0.0 +2024-01-15 21:32:07,580 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.453e+02 1.768e+02 1.965e+02 2.373e+02 3.841e+02, threshold=3.929e+02, percent-clipped=1.0 +2024-01-15 21:32:15,641 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer2.prob, batch_count=95840.0, ans=0.125 +2024-01-15 21:32:29,064 INFO [train.py:994] (1/2) Epoch 35, batch 100, loss[loss=0.1547, simple_loss=0.2407, pruned_loss=0.03433, over 22373.00 frames. ], tot_loss[loss=0.14, simple_loss=0.2205, pruned_loss=0.02977, over 1916362.91 frames. ], batch size: 357, lr: 1.20e-02, grad_scale: 32.0 +2024-01-15 21:33:04,558 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=95940.0, ans=0.125 +2024-01-15 21:33:09,159 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=95973.33333333333, ans=0.1 +2024-01-15 21:33:09,715 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=9.68 vs. limit=15.0 +2024-01-15 21:33:31,800 INFO [train.py:994] (1/2) Epoch 35, batch 150, loss[loss=0.159, simple_loss=0.2444, pruned_loss=0.0368, over 22264.00 frames. ], tot_loss[loss=0.1402, simple_loss=0.2206, pruned_loss=0.02988, over 2558172.81 frames. ], batch size: 357, lr: 1.20e-02, grad_scale: 32.0 +2024-01-15 21:33:50,788 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_skip_rate, batch_count=96073.33333333333, ans=0.0 +2024-01-15 21:33:51,896 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.scale_min, batch_count=96073.33333333333, ans=0.2 +2024-01-15 21:34:04,442 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.attention_skip_rate, batch_count=96106.66666666667, ans=0.0 +2024-01-15 21:34:05,527 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass_mid.scale_min, batch_count=96106.66666666667, ans=0.2 +2024-01-15 21:34:12,747 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.495e+02 1.731e+02 1.860e+02 2.113e+02 3.783e+02, threshold=3.719e+02, percent-clipped=0.0 +2024-01-15 21:34:13,151 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff3_skip_rate, batch_count=96140.0, ans=0.0 +2024-01-15 21:34:14,248 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=96140.0, ans=0.125 +2024-01-15 21:34:22,649 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=5.04 vs. limit=12.0 +2024-01-15 21:34:25,110 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn2.whiten, num_groups=1, num_channels=192, metric=13.22 vs. limit=22.5 +2024-01-15 21:34:33,578 INFO [train.py:994] (1/2) Epoch 35, batch 200, loss[loss=0.1426, simple_loss=0.2295, pruned_loss=0.02782, over 23853.00 frames. ], tot_loss[loss=0.1406, simple_loss=0.2209, pruned_loss=0.03012, over 3061792.44 frames. 
], batch size: 328, lr: 1.20e-02, grad_scale: 32.0 +2024-01-15 21:34:37,044 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer1.prob, batch_count=96206.66666666667, ans=0.125 +2024-01-15 21:34:51,583 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=96240.0, ans=0.1 +2024-01-15 21:35:32,502 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=96340.0, ans=0.125 +2024-01-15 21:35:36,913 INFO [train.py:994] (1/2) Epoch 35, batch 250, loss[loss=0.1376, simple_loss=0.22, pruned_loss=0.0276, over 24400.00 frames. ], tot_loss[loss=0.1414, simple_loss=0.2217, pruned_loss=0.03053, over 3440572.65 frames. ], batch size: 258, lr: 1.20e-02, grad_scale: 32.0 +2024-01-15 21:35:45,883 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.hidden_balancer.prob, batch_count=96373.33333333333, ans=0.125 +2024-01-15 21:35:47,101 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.skip_rate, batch_count=96373.33333333333, ans=0.09899494936611666 +2024-01-15 21:36:17,155 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module1.balancer2.prob, batch_count=96473.33333333333, ans=0.125 +2024-01-15 21:36:17,964 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.455e+02 1.809e+02 1.950e+02 2.177e+02 3.641e+02, threshold=3.900e+02, percent-clipped=0.0 +2024-01-15 21:36:20,089 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=256, metric=14.39 vs. limit=22.5 +2024-01-15 21:36:39,517 INFO [train.py:994] (1/2) Epoch 35, batch 300, loss[loss=0.1245, simple_loss=0.2113, pruned_loss=0.0189, over 24305.00 frames. ], tot_loss[loss=0.141, simple_loss=0.2212, pruned_loss=0.03042, over 3740777.54 frames. ], batch size: 147, lr: 1.20e-02, grad_scale: 32.0 +2024-01-15 21:36:53,466 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.0.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 21:37:16,636 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.whiten, num_groups=1, num_channels=512, metric=6.01 vs. limit=12.0 +2024-01-15 21:37:19,102 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=9.97 vs. limit=15.0 +2024-01-15 21:37:29,471 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=16.76 vs. limit=22.5 +2024-01-15 21:37:42,390 INFO [train.py:994] (1/2) Epoch 35, batch 350, loss[loss=0.1398, simple_loss=0.2196, pruned_loss=0.03, over 24468.00 frames. ], tot_loss[loss=0.1407, simple_loss=0.221, pruned_loss=0.03023, over 3968394.69 frames. 
], batch size: 170, lr: 1.20e-02, grad_scale: 32.0 +2024-01-15 21:37:45,019 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.skip_rate, batch_count=96706.66666666667, ans=0.09899494936611666 +2024-01-15 21:37:51,724 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer1.prob, batch_count=96706.66666666667, ans=0.125 +2024-01-15 21:38:12,533 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=2.90 vs. limit=6.0 +2024-01-15 21:38:23,215 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.422e+02 1.755e+02 1.951e+02 2.334e+02 4.164e+02, threshold=3.901e+02, percent-clipped=2.0 +2024-01-15 21:38:23,455 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass.scale_min, batch_count=96806.66666666667, ans=0.2 +2024-01-15 21:38:27,566 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=12.86 vs. limit=15.0 +2024-01-15 21:38:42,813 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.feed_forward2.out_whiten, num_groups=1, num_channels=384, metric=11.60 vs. limit=15.0 +2024-01-15 21:38:44,480 INFO [train.py:994] (1/2) Epoch 35, batch 400, loss[loss=0.1171, simple_loss=0.201, pruned_loss=0.01661, over 24316.00 frames. ], tot_loss[loss=0.1403, simple_loss=0.2206, pruned_loss=0.03006, over 4159779.38 frames. ], batch size: 147, lr: 1.19e-02, grad_scale: 32.0 +2024-01-15 21:39:04,196 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.attention_skip_rate, batch_count=96906.66666666667, ans=0.0 +2024-01-15 21:39:08,428 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer1.min_positive, batch_count=96940.0, ans=0.025 +2024-01-15 21:39:38,182 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.self_attn2.whiten, num_groups=1, num_channels=384, metric=12.65 vs. limit=22.5 +2024-01-15 21:39:46,483 INFO [train.py:994] (1/2) Epoch 35, batch 450, loss[loss=0.146, simple_loss=0.2315, pruned_loss=0.03027, over 24436.00 frames. ], tot_loss[loss=0.1401, simple_loss=0.2204, pruned_loss=0.0299, over 4293808.30 frames. ], batch size: 258, lr: 1.19e-02, grad_scale: 32.0 +2024-01-15 21:39:47,904 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer1.max_abs, batch_count=97040.0, ans=10.0 +2024-01-15 21:39:49,219 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_skip_rate, batch_count=97040.0, ans=0.0 +2024-01-15 21:39:55,234 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=13.85 vs. 
limit=15.0 +2024-01-15 21:40:00,315 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=97073.33333333333, ans=0.1 +2024-01-15 21:40:27,088 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.433e+02 1.789e+02 2.051e+02 2.389e+02 3.427e+02, threshold=4.102e+02, percent-clipped=0.0 +2024-01-15 21:40:45,954 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.prob, batch_count=97173.33333333333, ans=0.125 +2024-01-15 21:40:48,596 INFO [train.py:994] (1/2) Epoch 35, batch 500, loss[loss=0.1407, simple_loss=0.2214, pruned_loss=0.03004, over 24476.00 frames. ], tot_loss[loss=0.1407, simple_loss=0.2209, pruned_loss=0.03029, over 4417058.13 frames. ], batch size: 181, lr: 1.19e-02, grad_scale: 32.0 +2024-01-15 21:40:54,883 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.attention_skip_rate, batch_count=97206.66666666667, ans=0.0 +2024-01-15 21:41:05,182 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=384, metric=2.76 vs. limit=15.0 +2024-01-15 21:41:07,143 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=97240.0, ans=0.1 +2024-01-15 21:41:19,776 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=5.20 vs. limit=15.0 +2024-01-15 21:41:28,142 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=97306.66666666667, ans=0.0 +2024-01-15 21:41:39,195 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.prob, batch_count=97340.0, ans=0.125 +2024-01-15 21:41:39,207 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer2.prob, batch_count=97340.0, ans=0.125 +2024-01-15 21:41:50,076 INFO [train.py:994] (1/2) Epoch 35, batch 550, loss[loss=0.1378, simple_loss=0.2182, pruned_loss=0.02874, over 24473.00 frames. ], tot_loss[loss=0.1402, simple_loss=0.2206, pruned_loss=0.02987, over 4517515.66 frames. ], batch size: 216, lr: 1.19e-02, grad_scale: 32.0 +2024-01-15 21:41:56,977 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff2_skip_rate, batch_count=97373.33333333333, ans=0.0 +2024-01-15 21:42:00,481 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=97373.33333333333, ans=0.125 +2024-01-15 21:42:26,405 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff2_skip_rate, batch_count=97473.33333333333, ans=0.0 +2024-01-15 21:42:30,845 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.463e+02 1.707e+02 1.808e+02 2.124e+02 2.901e+02, threshold=3.617e+02, percent-clipped=0.0 +2024-01-15 21:42:41,431 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass.skip_rate, batch_count=97506.66666666667, ans=0.07 +2024-01-15 21:42:52,615 INFO [train.py:994] (1/2) Epoch 35, batch 600, loss[loss=0.1328, simple_loss=0.2147, pruned_loss=0.02545, over 24413.00 frames. ], tot_loss[loss=0.1401, simple_loss=0.2205, pruned_loss=0.02981, over 4575711.74 frames. 
], batch size: 159, lr: 1.19e-02, grad_scale: 32.0 +2024-01-15 21:42:52,891 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer2.prob, batch_count=97540.0, ans=0.125 +2024-01-15 21:43:06,732 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=512, metric=3.99 vs. limit=15.0 +2024-01-15 21:43:13,315 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=5.66 vs. limit=15.0 +2024-01-15 21:43:14,523 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward2.out_whiten.whitening_limit, batch_count=97573.33333333333, ans=15.0 +2024-01-15 21:43:23,433 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module1.balancer2.prob, batch_count=97606.66666666667, ans=0.125 +2024-01-15 21:43:45,784 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=97673.33333333333, ans=0.1 +2024-01-15 21:43:52,280 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass.scale_min, batch_count=97673.33333333333, ans=0.2 +2024-01-15 21:43:54,498 INFO [train.py:994] (1/2) Epoch 35, batch 650, loss[loss=0.1383, simple_loss=0.2174, pruned_loss=0.02954, over 24472.00 frames. ], tot_loss[loss=0.1407, simple_loss=0.2212, pruned_loss=0.0301, over 4635004.22 frames. ], batch size: 170, lr: 1.19e-02, grad_scale: 32.0 +2024-01-15 21:44:01,327 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.out_combiner.scale_min, batch_count=97706.66666666667, ans=0.2 +2024-01-15 21:44:03,629 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=97706.66666666667, ans=0.1 +2024-01-15 21:44:08,439 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=97740.0, ans=0.1 +2024-01-15 21:44:17,424 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=97740.0, ans=0.125 +2024-01-15 21:44:23,241 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=97773.33333333333, ans=0.0 +2024-01-15 21:44:25,221 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.skip_rate, batch_count=97773.33333333333, ans=0.04949747468305833 +2024-01-15 21:44:34,518 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=97806.66666666667, ans=0.125 +2024-01-15 21:44:35,330 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.467e+02 1.709e+02 1.864e+02 2.143e+02 3.994e+02, threshold=3.728e+02, percent-clipped=1.0 +2024-01-15 21:44:47,916 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=97840.0, ans=0.1 +2024-01-15 21:44:53,408 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 21:44:56,759 INFO [train.py:994] (1/2) Epoch 35, batch 700, loss[loss=0.145, simple_loss=0.2231, pruned_loss=0.03339, over 24463.00 frames. 
], tot_loss[loss=0.1407, simple_loss=0.2213, pruned_loss=0.03007, over 4661979.74 frames. ], batch size: 222, lr: 1.19e-02, grad_scale: 32.0 +2024-01-15 21:44:59,359 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=97873.33333333333, ans=0.125 +2024-01-15 21:45:00,616 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.out_combiner.scale_min, batch_count=97873.33333333333, ans=0.2 +2024-01-15 21:45:04,205 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer2.prob, batch_count=97873.33333333333, ans=0.125 +2024-01-15 21:45:18,865 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.52 vs. limit=6.0 +2024-01-15 21:45:30,409 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=13.10 vs. limit=15.0 +2024-01-15 21:45:57,580 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer1.prob, batch_count=98040.0, ans=0.125 +2024-01-15 21:45:58,391 INFO [train.py:994] (1/2) Epoch 35, batch 750, loss[loss=0.1402, simple_loss=0.2224, pruned_loss=0.02901, over 24445.00 frames. ], tot_loss[loss=0.1409, simple_loss=0.2216, pruned_loss=0.03013, over 4693414.11 frames. ], batch size: 250, lr: 1.19e-02, grad_scale: 16.0 +2024-01-15 21:46:02,349 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.conv_module2.whiten, num_groups=1, num_channels=192, metric=2.99 vs. limit=15.0 +2024-01-15 21:46:18,752 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=98073.33333333333, ans=0.125 +2024-01-15 21:46:40,434 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.465e+02 1.691e+02 1.844e+02 2.221e+02 4.100e+02, threshold=3.688e+02, percent-clipped=1.0 +2024-01-15 21:46:46,028 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=98173.33333333333, ans=0.1 +2024-01-15 21:46:58,143 INFO [train.py:994] (1/2) Epoch 35, batch 800, loss[loss=0.1536, simple_loss=0.2319, pruned_loss=0.0377, over 24406.00 frames. ], tot_loss[loss=0.1408, simple_loss=0.2211, pruned_loss=0.03025, over 4705962.80 frames. ], batch size: 153, lr: 1.19e-02, grad_scale: 32.0 +2024-01-15 21:46:58,343 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer2.prob, batch_count=98206.66666666667, ans=0.125 +2024-01-15 21:47:11,397 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=512, metric=2.88 vs. limit=15.0 +2024-01-15 21:47:22,506 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.attention_skip_rate, batch_count=98273.33333333333, ans=0.0 +2024-01-15 21:48:08,377 INFO [train.py:994] (1/2) Epoch 36, batch 0, loss[loss=0.1419, simple_loss=0.2218, pruned_loss=0.031, over 24539.00 frames. ], tot_loss[loss=0.1419, simple_loss=0.2218, pruned_loss=0.031, over 24539.00 frames. 
], batch size: 236, lr: 1.17e-02, grad_scale: 32.0 +2024-01-15 21:48:08,378 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 21:48:26,581 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.1.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([4.0489, 3.5113, 3.6820, 3.7037], device='cuda:1') +2024-01-15 21:48:29,549 INFO [train.py:1026] (1/2) Epoch 36, validation: loss=0.1669, simple_loss=0.2489, pruned_loss=0.04246, over 1622729.00 frames. +2024-01-15 21:48:29,550 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 21:48:30,220 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=8.32 vs. limit=15.0 +2024-01-15 21:48:34,446 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=98350.0, ans=0.0 +2024-01-15 21:48:41,693 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.skip_rate, batch_count=98383.33333333333, ans=0.07 +2024-01-15 21:48:41,699 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.bypass_mid.scale_min, batch_count=98383.33333333333, ans=0.2 +2024-01-15 21:48:48,259 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=98383.33333333333, ans=0.125 +2024-01-15 21:48:54,796 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=98416.66666666667, ans=0.1 +2024-01-15 21:49:20,778 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.404e+02 1.762e+02 1.998e+02 2.319e+02 4.869e+02, threshold=3.996e+02, percent-clipped=3.0 +2024-01-15 21:49:31,326 INFO [train.py:994] (1/2) Epoch 36, batch 50, loss[loss=0.1405, simple_loss=0.2192, pruned_loss=0.03091, over 24453.00 frames. ], tot_loss[loss=0.1414, simple_loss=0.2212, pruned_loss=0.03081, over 1099261.58 frames. ], batch size: 216, lr: 1.17e-02, grad_scale: 32.0 +2024-01-15 21:49:48,212 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=98550.0, ans=0.1 +2024-01-15 21:50:02,336 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer2.prob, batch_count=98583.33333333333, ans=0.125 +2024-01-15 21:50:22,186 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.skip_rate, batch_count=98650.0, ans=0.07 +2024-01-15 21:50:23,652 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=4.98 vs. limit=10.0 +2024-01-15 21:50:27,229 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer1.prob, batch_count=98650.0, ans=0.125 +2024-01-15 21:50:32,953 INFO [train.py:994] (1/2) Epoch 36, batch 100, loss[loss=0.1458, simple_loss=0.2184, pruned_loss=0.03656, over 24360.00 frames. ], tot_loss[loss=0.1401, simple_loss=0.2207, pruned_loss=0.02976, over 1933811.35 frames. 
], batch size: 153, lr: 1.17e-02, grad_scale: 32.0 +2024-01-15 21:50:53,533 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=98716.66666666667, ans=0.0 +2024-01-15 21:50:53,603 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer1.prob, batch_count=98716.66666666667, ans=0.125 +2024-01-15 21:51:05,425 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=98750.0, ans=0.125 +2024-01-15 21:51:08,979 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer1.prob, batch_count=98783.33333333333, ans=0.125 +2024-01-15 21:51:17,799 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.24 vs. limit=6.0 +2024-01-15 21:51:19,389 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.out_proj.dropout_p, batch_count=98783.33333333333, ans=0.1 +2024-01-15 21:51:20,596 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff2_skip_rate, batch_count=98816.66666666667, ans=0.0 +2024-01-15 21:51:23,130 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.431e+02 1.714e+02 1.831e+02 2.083e+02 3.031e+02, threshold=3.661e+02, percent-clipped=0.0 +2024-01-15 21:51:25,133 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=98816.66666666667, ans=0.125 +2024-01-15 21:51:29,920 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=98816.66666666667, ans=0.125 +2024-01-15 21:51:34,213 INFO [train.py:994] (1/2) Epoch 36, batch 150, loss[loss=0.1422, simple_loss=0.2213, pruned_loss=0.03159, over 24494.00 frames. ], tot_loss[loss=0.1404, simple_loss=0.2215, pruned_loss=0.02967, over 2585532.32 frames. ], batch size: 210, lr: 1.17e-02, grad_scale: 32.0 +2024-01-15 21:51:41,601 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=98850.0, ans=0.0 +2024-01-15 21:51:47,942 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=98883.33333333333, ans=0.0 +2024-01-15 21:52:01,467 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_skip_rate, batch_count=98916.66666666667, ans=0.0 +2024-01-15 21:52:02,536 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff2_skip_rate, batch_count=98916.66666666667, ans=0.0 +2024-01-15 21:52:06,461 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.04 vs. limit=6.0 +2024-01-15 21:52:15,559 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=8.55 vs. limit=10.0 +2024-01-15 21:52:33,911 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff2_skip_rate, batch_count=99016.66666666667, ans=0.0 +2024-01-15 21:52:34,928 INFO [train.py:994] (1/2) Epoch 36, batch 200, loss[loss=0.1163, simple_loss=0.178, pruned_loss=0.0273, over 18782.00 frames. 
], tot_loss[loss=0.1398, simple_loss=0.2204, pruned_loss=0.02957, over 3074282.02 frames. ], batch size: 81, lr: 1.17e-02, grad_scale: 32.0 +2024-01-15 21:52:43,484 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer1.prob, batch_count=99016.66666666667, ans=0.125 +2024-01-15 21:53:25,430 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.492e+02 1.822e+02 2.260e+02 2.664e+02 3.639e+02, threshold=4.520e+02, percent-clipped=0.0 +2024-01-15 21:53:37,070 INFO [train.py:994] (1/2) Epoch 36, batch 250, loss[loss=0.1373, simple_loss=0.224, pruned_loss=0.02533, over 23884.00 frames. ], tot_loss[loss=0.1402, simple_loss=0.221, pruned_loss=0.02973, over 3460277.14 frames. ], batch size: 328, lr: 1.17e-02, grad_scale: 32.0 +2024-01-15 21:53:59,651 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.balancer.prob, batch_count=99216.66666666667, ans=0.125 +2024-01-15 21:53:59,685 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.out_combiner.scale_min, batch_count=99216.66666666667, ans=0.2 +2024-01-15 21:54:27,906 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 21:54:38,266 INFO [train.py:994] (1/2) Epoch 36, batch 300, loss[loss=0.1494, simple_loss=0.236, pruned_loss=0.0314, over 24489.00 frames. ], tot_loss[loss=0.1397, simple_loss=0.2205, pruned_loss=0.02947, over 3752990.02 frames. ], batch size: 216, lr: 1.17e-02, grad_scale: 32.0 +2024-01-15 21:54:44,011 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=99350.0, ans=0.0 +2024-01-15 21:54:58,951 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.out_combiner.scale_min, batch_count=99383.33333333333, ans=0.2 +2024-01-15 21:55:12,493 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer2.prob, batch_count=99416.66666666667, ans=0.125 +2024-01-15 21:55:25,016 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.ff3_skip_rate, batch_count=99450.0, ans=0.0 +2024-01-15 21:55:29,377 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.468e+02 1.712e+02 1.848e+02 2.107e+02 3.475e+02, threshold=3.696e+02, percent-clipped=0.0 +2024-01-15 21:55:40,651 INFO [train.py:994] (1/2) Epoch 36, batch 350, loss[loss=0.1382, simple_loss=0.2234, pruned_loss=0.02645, over 24214.00 frames. ], tot_loss[loss=0.1401, simple_loss=0.2212, pruned_loss=0.0295, over 3994367.26 frames. ], batch size: 311, lr: 1.16e-02, grad_scale: 32.0 +2024-01-15 21:55:44,978 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer2.prob, batch_count=99516.66666666667, ans=0.125 +2024-01-15 21:55:48,735 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.whiten, num_groups=1, num_channels=384, metric=2.62 vs. limit=12.0 +2024-01-15 21:56:12,421 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn1.whiten, num_groups=1, num_channels=192, metric=11.69 vs. 
limit=22.5 +2024-01-15 21:56:14,139 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.out_combiner.scale_min, batch_count=99583.33333333333, ans=0.2 +2024-01-15 21:56:30,006 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer1.prob, batch_count=99650.0, ans=0.125 +2024-01-15 21:56:37,033 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.self_attn2.whiten.whitening_limit, batch_count=99650.0, ans=22.5 +2024-01-15 21:56:42,308 INFO [train.py:994] (1/2) Epoch 36, batch 400, loss[loss=0.1286, simple_loss=0.208, pruned_loss=0.0246, over 23888.00 frames. ], tot_loss[loss=0.1396, simple_loss=0.2207, pruned_loss=0.02921, over 4184339.57 frames. ], batch size: 131, lr: 1.16e-02, grad_scale: 32.0 +2024-01-15 21:57:04,944 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer2.prob, batch_count=99716.66666666667, ans=0.125 +2024-01-15 21:57:06,239 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.self_attn_weights.pos_emb_skip_rate, batch_count=99750.0, ans=0.0 +2024-01-15 21:57:13,985 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=99750.0, ans=0.0 +2024-01-15 21:57:19,919 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=99783.33333333333, ans=0.1 +2024-01-15 21:57:31,335 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.scale_min, batch_count=99816.66666666667, ans=0.2 +2024-01-15 21:57:33,421 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.474e+02 1.721e+02 1.850e+02 2.143e+02 3.369e+02, threshold=3.701e+02, percent-clipped=0.0 +2024-01-15 21:57:44,571 INFO [train.py:994] (1/2) Epoch 36, batch 450, loss[loss=0.1474, simple_loss=0.2272, pruned_loss=0.03383, over 24534.00 frames. ], tot_loss[loss=0.1397, simple_loss=0.2208, pruned_loss=0.02932, over 4322596.78 frames. ], batch size: 165, lr: 1.16e-02, grad_scale: 32.0 +2024-01-15 21:58:12,021 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.min_positive, batch_count=99916.66666666667, ans=0.05 +2024-01-15 21:58:24,519 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=99950.0, ans=0.125 +2024-01-15 21:58:36,899 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=99983.33333333333, ans=0.0 +2024-01-15 21:58:46,931 INFO [train.py:994] (1/2) Epoch 36, batch 500, loss[loss=0.1337, simple_loss=0.21, pruned_loss=0.0287, over 24223.00 frames. ], tot_loss[loss=0.1395, simple_loss=0.2204, pruned_loss=0.02926, over 4435179.13 frames. ], batch size: 140, lr: 1.16e-02, grad_scale: 32.0 +2024-01-15 21:58:48,404 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_skip_rate, batch_count=100016.66666666667, ans=0.0 +2024-01-15 21:58:59,512 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.balancer2.prob, batch_count=100050.0, ans=0.125 +2024-01-15 21:59:00,941 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=9.63 vs. 
limit=15.0 +2024-01-15 21:59:16,241 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=100083.33333333333, ans=0.1 +2024-01-15 21:59:23,798 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.attention_skip_rate, batch_count=100116.66666666667, ans=0.0 +2024-01-15 21:59:33,364 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.attention_skip_rate, batch_count=100116.66666666667, ans=0.0 +2024-01-15 21:59:37,390 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.404e+02 1.694e+02 1.886e+02 2.106e+02 3.014e+02, threshold=3.772e+02, percent-clipped=0.0 +2024-01-15 21:59:39,277 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=8.17 vs. limit=15.0 +2024-01-15 21:59:48,486 INFO [train.py:994] (1/2) Epoch 36, batch 550, loss[loss=0.1494, simple_loss=0.2283, pruned_loss=0.03529, over 24481.00 frames. ], tot_loss[loss=0.1393, simple_loss=0.2199, pruned_loss=0.02929, over 4513265.22 frames. ], batch size: 170, lr: 1.16e-02, grad_scale: 32.0 +2024-01-15 22:00:19,575 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=8.90 vs. limit=15.0 +2024-01-15 22:00:48,860 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=4.96 vs. limit=12.0 +2024-01-15 22:00:49,607 INFO [train.py:994] (1/2) Epoch 36, batch 600, loss[loss=0.1474, simple_loss=0.2231, pruned_loss=0.03585, over 24569.00 frames. ], tot_loss[loss=0.1391, simple_loss=0.2199, pruned_loss=0.02917, over 4577343.56 frames. ], batch size: 176, lr: 1.16e-02, grad_scale: 32.0 +2024-01-15 22:00:51,047 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.scale_min, batch_count=100350.0, ans=0.2 +2024-01-15 22:00:56,363 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=100350.0, ans=0.125 +2024-01-15 22:01:08,686 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.attention_skip_rate, batch_count=100383.33333333333, ans=0.0 +2024-01-15 22:01:18,101 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_skip_rate, batch_count=100416.66666666667, ans=0.0 +2024-01-15 22:01:25,096 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 22:01:40,247 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.461e+02 1.713e+02 1.963e+02 2.375e+02 3.594e+02, threshold=3.927e+02, percent-clipped=0.0 +2024-01-15 22:01:48,162 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=100483.33333333333, ans=0.1 +2024-01-15 22:01:48,504 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=8.42 vs. limit=15.0 +2024-01-15 22:01:52,219 INFO [train.py:994] (1/2) Epoch 36, batch 650, loss[loss=0.1407, simple_loss=0.2194, pruned_loss=0.03098, over 24443.00 frames. ], tot_loss[loss=0.1388, simple_loss=0.2196, pruned_loss=0.02895, over 4624143.92 frames. 
], batch size: 258, lr: 1.16e-02, grad_scale: 32.0 +2024-01-15 22:01:52,575 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff2_skip_rate, batch_count=100516.66666666667, ans=0.0 +2024-01-15 22:02:06,525 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer2.prob, batch_count=100550.0, ans=0.125 +2024-01-15 22:02:20,876 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer2.prob, batch_count=100583.33333333333, ans=0.125 +2024-01-15 22:02:25,505 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=100583.33333333333, ans=0.1 +2024-01-15 22:02:33,895 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=100616.66666666667, ans=0.125 +2024-01-15 22:02:36,220 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward2.hidden_balancer.prob, batch_count=100616.66666666667, ans=0.125 +2024-01-15 22:02:43,241 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=100650.0, ans=0.1 +2024-01-15 22:02:53,863 INFO [train.py:994] (1/2) Epoch 36, batch 700, loss[loss=0.1268, simple_loss=0.2029, pruned_loss=0.02537, over 24164.00 frames. ], tot_loss[loss=0.1385, simple_loss=0.2194, pruned_loss=0.02881, over 4669821.56 frames. ], batch size: 140, lr: 1.16e-02, grad_scale: 32.0 +2024-01-15 22:02:55,704 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=2.87 vs. limit=6.0 +2024-01-15 22:03:07,082 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=8.31 vs. limit=10.0 +2024-01-15 22:03:08,726 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=10.99 vs. limit=15.0 +2024-01-15 22:03:10,832 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff3_skip_rate, batch_count=100716.66666666667, ans=0.0 +2024-01-15 22:03:18,314 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=4.28 vs. limit=12.0 +2024-01-15 22:03:28,548 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.0.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 22:03:40,899 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.whiten, num_groups=1, num_channels=512, metric=6.17 vs. limit=12.0 +2024-01-15 22:03:44,976 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.506e+02 1.713e+02 1.930e+02 2.201e+02 3.344e+02, threshold=3.860e+02, percent-clipped=0.0 +2024-01-15 22:03:51,253 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_skip_rate, batch_count=100816.66666666667, ans=0.0 +2024-01-15 22:03:56,277 INFO [train.py:994] (1/2) Epoch 36, batch 750, loss[loss=0.1421, simple_loss=0.2261, pruned_loss=0.0291, over 24444.00 frames. ], tot_loss[loss=0.1383, simple_loss=0.2191, pruned_loss=0.0288, over 4694797.77 frames. 
], batch size: 222, lr: 1.16e-02, grad_scale: 32.0 +2024-01-15 22:03:57,748 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer2.prob, batch_count=100850.0, ans=0.125 +2024-01-15 22:04:03,253 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_skip_rate, batch_count=100850.0, ans=0.0 +2024-01-15 22:04:17,238 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module2.balancer2.prob, batch_count=100883.33333333333, ans=0.125 +2024-01-15 22:04:25,236 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff3_skip_rate, batch_count=100916.66666666667, ans=0.0 +2024-01-15 22:04:26,384 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer1.prob, batch_count=100916.66666666667, ans=0.125 +2024-01-15 22:04:28,648 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=100916.66666666667, ans=0.125 +2024-01-15 22:04:38,724 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.scale_min, batch_count=100950.0, ans=0.2 +2024-01-15 22:04:55,445 INFO [train.py:994] (1/2) Epoch 36, batch 800, loss[loss=0.1395, simple_loss=0.2227, pruned_loss=0.02816, over 24388.00 frames. ], tot_loss[loss=0.1388, simple_loss=0.2195, pruned_loss=0.02906, over 4722338.59 frames. ], batch size: 275, lr: 1.16e-02, grad_scale: 32.0 +2024-01-15 22:05:00,221 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass.skip_rate, batch_count=101016.66666666667, ans=0.09899494936611666 +2024-01-15 22:05:04,022 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.min_positive, batch_count=101016.66666666667, ans=0.025 +2024-01-15 22:05:06,290 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=101050.0, ans=0.1 +2024-01-15 22:05:16,696 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=101050.0, ans=0.125 +2024-01-15 22:05:42,005 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.400e+02 1.756e+02 1.942e+02 2.533e+02 4.367e+02, threshold=3.885e+02, percent-clipped=2.0 +2024-01-15 22:06:06,400 INFO [train.py:994] (1/2) Epoch 37, batch 0, loss[loss=0.144, simple_loss=0.2307, pruned_loss=0.0287, over 23866.00 frames. ], tot_loss[loss=0.144, simple_loss=0.2307, pruned_loss=0.0287, over 23866.00 frames. ], batch size: 328, lr: 1.14e-02, grad_scale: 32.0 +2024-01-15 22:06:06,400 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 22:06:16,943 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.0.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.5597, 5.2472, 5.2582, 5.1881], device='cuda:1') +2024-01-15 22:06:27,581 INFO [train.py:1026] (1/2) Epoch 37, validation: loss=0.1661, simple_loss=0.2481, pruned_loss=0.042, over 1622729.00 frames. 
+2024-01-15 22:06:27,582 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 22:06:29,067 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass.scale_min, batch_count=101160.0, ans=0.2 +2024-01-15 22:06:31,249 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer2.prob, batch_count=101160.0, ans=0.125 +2024-01-15 22:06:45,358 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.out_combiner.scale_min, batch_count=101193.33333333333, ans=0.2 +2024-01-15 22:06:55,394 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.scale_min, batch_count=101226.66666666667, ans=0.2 +2024-01-15 22:07:01,389 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.hidden_balancer.prob, batch_count=101226.66666666667, ans=0.125 +2024-01-15 22:07:09,019 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=6.28 vs. limit=10.0 +2024-01-15 22:07:21,601 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer1.prob, batch_count=101293.33333333333, ans=0.125 +2024-01-15 22:07:25,150 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=101293.33333333333, ans=0.0 +2024-01-15 22:07:28,354 INFO [train.py:994] (1/2) Epoch 37, batch 50, loss[loss=0.132, simple_loss=0.2109, pruned_loss=0.02661, over 24361.00 frames. ], tot_loss[loss=0.1378, simple_loss=0.2182, pruned_loss=0.02868, over 1082963.23 frames. ], batch size: 298, lr: 1.14e-02, grad_scale: 32.0 +2024-01-15 22:07:43,358 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass.scale_min, batch_count=101360.0, ans=0.2 +2024-01-15 22:07:43,472 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer2.prob, batch_count=101360.0, ans=0.125 +2024-01-15 22:07:51,567 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=101360.0, ans=0.1 +2024-01-15 22:08:04,082 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer1.prob, batch_count=101393.33333333333, ans=0.125 +2024-01-15 22:08:27,720 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass_mid.scale_min, batch_count=101460.0, ans=0.2 +2024-01-15 22:08:28,551 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.396e+02 1.658e+02 1.797e+02 2.024e+02 3.025e+02, threshold=3.594e+02, percent-clipped=0.0 +2024-01-15 22:08:30,861 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=17.68 vs. limit=15.0 +2024-01-15 22:08:31,615 INFO [train.py:994] (1/2) Epoch 37, batch 100, loss[loss=0.15, simple_loss=0.227, pruned_loss=0.03651, over 24479.00 frames. ], tot_loss[loss=0.138, simple_loss=0.218, pruned_loss=0.02901, over 1898803.13 frames. 
], batch size: 170, lr: 1.14e-02, grad_scale: 32.0 +2024-01-15 22:08:34,326 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=101493.33333333333, ans=0.0 +2024-01-15 22:08:44,386 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_skip_rate, batch_count=101526.66666666667, ans=0.0 +2024-01-15 22:08:50,208 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=101526.66666666667, ans=0.0 +2024-01-15 22:08:52,713 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_skip_rate, batch_count=101526.66666666667, ans=0.0 +2024-01-15 22:08:56,036 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=3.55 vs. limit=10.0 +2024-01-15 22:09:05,273 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=16.90 vs. limit=15.0 +2024-01-15 22:09:30,791 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=13.80 vs. limit=15.0 +2024-01-15 22:09:32,666 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=101626.66666666667, ans=0.0 +2024-01-15 22:09:34,729 INFO [train.py:994] (1/2) Epoch 37, batch 150, loss[loss=0.1458, simple_loss=0.2308, pruned_loss=0.03038, over 24496.00 frames. ], tot_loss[loss=0.1383, simple_loss=0.2185, pruned_loss=0.02902, over 2544146.26 frames. ], batch size: 181, lr: 1.14e-02, grad_scale: 32.0 +2024-01-15 22:09:36,276 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.out_combiner.scale_min, batch_count=101660.0, ans=0.2 +2024-01-15 22:10:09,945 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass.scale_min, batch_count=101726.66666666667, ans=0.2 +2024-01-15 22:10:26,844 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_skip_rate, batch_count=101793.33333333333, ans=0.0 +2024-01-15 22:10:26,849 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.balancer2.prob, batch_count=101793.33333333333, ans=0.125 +2024-01-15 22:10:35,959 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.486e+02 1.745e+02 1.959e+02 2.425e+02 3.649e+02, threshold=3.918e+02, percent-clipped=1.0 +2024-01-15 22:10:37,151 INFO [train.py:994] (1/2) Epoch 37, batch 200, loss[loss=0.1442, simple_loss=0.2277, pruned_loss=0.03038, over 24508.00 frames. ], tot_loss[loss=0.1392, simple_loss=0.2198, pruned_loss=0.02929, over 3050106.48 frames. ], batch size: 193, lr: 1.14e-02, grad_scale: 16.0 +2024-01-15 22:10:41,750 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=101826.66666666667, ans=0.125 +2024-01-15 22:10:45,836 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass_mid.scale_min, batch_count=101826.66666666667, ans=0.2 +2024-01-15 22:10:49,873 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=8.59 vs. 
limit=10.0 +2024-01-15 22:10:55,273 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=101860.0, ans=0.1 +2024-01-15 22:11:12,042 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=384, metric=3.45 vs. limit=15.0 +2024-01-15 22:11:19,282 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=101926.66666666667, ans=0.1 +2024-01-15 22:11:34,815 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=4.82 vs. limit=15.0 +2024-01-15 22:11:40,784 INFO [train.py:994] (1/2) Epoch 37, batch 250, loss[loss=0.1427, simple_loss=0.2262, pruned_loss=0.02966, over 22416.00 frames. ], tot_loss[loss=0.1386, simple_loss=0.219, pruned_loss=0.02915, over 3428478.42 frames. ], batch size: 357, lr: 1.14e-02, grad_scale: 16.0 +2024-01-15 22:11:47,300 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.attention_skip_rate, batch_count=101993.33333333333, ans=0.0 +2024-01-15 22:12:28,872 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.ff2_skip_rate, batch_count=102093.33333333333, ans=0.0 +2024-01-15 22:12:42,616 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.381e+02 1.698e+02 1.912e+02 2.193e+02 3.408e+02, threshold=3.824e+02, percent-clipped=0.0 +2024-01-15 22:12:43,831 INFO [train.py:994] (1/2) Epoch 37, batch 300, loss[loss=0.1443, simple_loss=0.2242, pruned_loss=0.03223, over 24293.00 frames. ], tot_loss[loss=0.139, simple_loss=0.2196, pruned_loss=0.02919, over 3741385.64 frames. ], batch size: 285, lr: 1.14e-02, grad_scale: 16.0 +2024-01-15 22:12:54,316 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.2.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 22:13:09,353 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module2.balancer2.min_positive, batch_count=102226.66666666667, ans=0.05 +2024-01-15 22:13:17,247 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer1.prob, batch_count=102226.66666666667, ans=0.125 +2024-01-15 22:13:20,771 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.self_attn_weights.pos_emb_skip_rate, batch_count=102260.0, ans=0.0 +2024-01-15 22:13:22,233 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=256, metric=13.36 vs. limit=22.5 +2024-01-15 22:13:46,820 INFO [train.py:994] (1/2) Epoch 37, batch 350, loss[loss=0.1505, simple_loss=0.2296, pruned_loss=0.03568, over 24462.00 frames. ], tot_loss[loss=0.1386, simple_loss=0.2192, pruned_loss=0.02903, over 3982956.09 frames. 
], batch size: 181, lr: 1.13e-02, grad_scale: 16.0 +2024-01-15 22:14:23,047 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer2.prob, batch_count=102393.33333333333, ans=0.125 +2024-01-15 22:14:31,475 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.balancer1.prob, batch_count=102426.66666666667, ans=0.125 +2024-01-15 22:14:45,427 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff3_skip_rate, batch_count=102460.0, ans=0.0 +2024-01-15 22:14:49,285 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.454e+02 1.670e+02 1.836e+02 2.122e+02 2.808e+02, threshold=3.672e+02, percent-clipped=0.0 +2024-01-15 22:14:50,505 INFO [train.py:994] (1/2) Epoch 37, batch 400, loss[loss=0.1241, simple_loss=0.2047, pruned_loss=0.02168, over 24152.00 frames. ], tot_loss[loss=0.1382, simple_loss=0.2188, pruned_loss=0.02877, over 4166115.42 frames. ], batch size: 140, lr: 1.13e-02, grad_scale: 32.0 +2024-01-15 22:15:05,669 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=102526.66666666667, ans=0.0 +2024-01-15 22:15:29,472 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.out_combiner.scale_min, batch_count=102593.33333333333, ans=0.2 +2024-01-15 22:15:33,152 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.scale_min, batch_count=102593.33333333333, ans=0.2 +2024-01-15 22:15:40,777 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.nonlin_attention.balancer.max_positive, batch_count=102626.66666666667, ans=0.95 +2024-01-15 22:15:53,525 INFO [train.py:994] (1/2) Epoch 37, batch 450, loss[loss=0.1172, simple_loss=0.1948, pruned_loss=0.01986, over 23639.00 frames. ], tot_loss[loss=0.1382, simple_loss=0.219, pruned_loss=0.0287, over 4321031.66 frames. ], batch size: 119, lr: 1.13e-02, grad_scale: 32.0 +2024-01-15 22:16:01,495 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer1.min_positive, batch_count=102660.0, ans=0.025 +2024-01-15 22:16:08,409 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=4.58 vs. limit=10.0 +2024-01-15 22:16:10,840 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=8.69 vs. limit=15.0 +2024-01-15 22:16:15,898 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff3_skip_rate, batch_count=102693.33333333333, ans=0.0 +2024-01-15 22:16:26,653 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer1.prob, batch_count=102726.66666666667, ans=0.125 +2024-01-15 22:16:55,303 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.448e+02 1.753e+02 1.977e+02 2.299e+02 3.213e+02, threshold=3.954e+02, percent-clipped=0.0 +2024-01-15 22:16:56,531 INFO [train.py:994] (1/2) Epoch 37, batch 500, loss[loss=0.1364, simple_loss=0.2207, pruned_loss=0.02604, over 24355.00 frames. ], tot_loss[loss=0.1381, simple_loss=0.2189, pruned_loss=0.02859, over 4436046.78 frames. 
], batch size: 298, lr: 1.13e-02, grad_scale: 32.0 +2024-01-15 22:17:01,175 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer2.min_positive, batch_count=102826.66666666667, ans=0.05 +2024-01-15 22:17:01,604 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=10.71 vs. limit=15.0 +2024-01-15 22:17:05,290 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer1.max_abs, batch_count=102826.66666666667, ans=10.0 +2024-01-15 22:17:20,988 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=3.05 vs. limit=15.0 +2024-01-15 22:17:36,572 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=102926.66666666667, ans=0.1 +2024-01-15 22:18:00,235 INFO [train.py:994] (1/2) Epoch 37, batch 550, loss[loss=0.1382, simple_loss=0.2189, pruned_loss=0.02873, over 24355.00 frames. ], tot_loss[loss=0.1381, simple_loss=0.2191, pruned_loss=0.02861, over 4531305.67 frames. ], batch size: 275, lr: 1.13e-02, grad_scale: 32.0 +2024-01-15 22:18:01,895 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=10.11 vs. limit=15.0 +2024-01-15 22:18:06,323 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=102993.33333333333, ans=0.1 +2024-01-15 22:18:15,852 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=103026.66666666667, ans=0.1 +2024-01-15 22:18:18,336 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=103026.66666666667, ans=0.0 +2024-01-15 22:18:28,065 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.prob, batch_count=103060.0, ans=0.125 +2024-01-15 22:18:38,905 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.out_balancer.prob, batch_count=103093.33333333333, ans=0.125 +2024-01-15 22:18:51,076 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.balancer_ff3.min_abs, batch_count=103126.66666666667, ans=0.2 +2024-01-15 22:19:01,392 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.375e+02 1.771e+02 1.926e+02 2.307e+02 3.355e+02, threshold=3.851e+02, percent-clipped=0.0 +2024-01-15 22:19:02,686 INFO [train.py:994] (1/2) Epoch 37, batch 600, loss[loss=0.1488, simple_loss=0.2304, pruned_loss=0.03361, over 24544.00 frames. ], tot_loss[loss=0.1384, simple_loss=0.2194, pruned_loss=0.02874, over 4601572.95 frames. 
], batch size: 176, lr: 1.13e-02, grad_scale: 32.0 +2024-01-15 22:19:05,512 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass_mid.scale_min, batch_count=103160.0, ans=0.2 +2024-01-15 22:19:10,955 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.ff3_skip_rate, batch_count=103160.0, ans=0.0 +2024-01-15 22:19:18,177 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer2.prob, batch_count=103193.33333333333, ans=0.125 +2024-01-15 22:19:18,303 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=2.89 vs. limit=10.0 +2024-01-15 22:19:19,546 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=9.17 vs. limit=15.0 +2024-01-15 22:19:33,723 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer1.prob, batch_count=103226.66666666667, ans=0.125 +2024-01-15 22:20:06,193 INFO [train.py:994] (1/2) Epoch 37, batch 650, loss[loss=0.1486, simple_loss=0.2278, pruned_loss=0.03468, over 24384.00 frames. ], tot_loss[loss=0.1383, simple_loss=0.2191, pruned_loss=0.0287, over 4654807.33 frames. ], batch size: 153, lr: 1.13e-02, grad_scale: 32.0 +2024-01-15 22:20:12,069 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=384, metric=13.87 vs. limit=15.0 +2024-01-15 22:20:25,342 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn1.whiten, num_groups=1, num_channels=192, metric=14.04 vs. limit=22.5 +2024-01-15 22:21:07,947 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.441e+02 1.744e+02 1.856e+02 2.130e+02 3.630e+02, threshold=3.712e+02, percent-clipped=0.0 +2024-01-15 22:21:09,204 INFO [train.py:994] (1/2) Epoch 37, batch 700, loss[loss=0.139, simple_loss=0.2276, pruned_loss=0.02521, over 24342.00 frames. ], tot_loss[loss=0.1379, simple_loss=0.2186, pruned_loss=0.02856, over 4683669.57 frames. ], batch size: 298, lr: 1.13e-02, grad_scale: 32.0 +2024-01-15 22:21:27,978 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.bypass.skip_rate, batch_count=103526.66666666667, ans=0.09899494936611666 +2024-01-15 22:21:47,973 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=103593.33333333333, ans=0.0 +2024-01-15 22:21:54,610 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.ff3_skip_rate, batch_count=103593.33333333333, ans=0.0 +2024-01-15 22:22:01,909 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=10.42 vs. limit=15.0 +2024-01-15 22:22:02,785 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=103626.66666666667, ans=0.0 +2024-01-15 22:22:12,146 INFO [train.py:994] (1/2) Epoch 37, batch 750, loss[loss=0.1481, simple_loss=0.2301, pruned_loss=0.03304, over 24464.00 frames. ], tot_loss[loss=0.1376, simple_loss=0.2182, pruned_loss=0.02851, over 4688340.15 frames. 
], batch size: 216, lr: 1.13e-02, grad_scale: 32.0 +2024-01-15 22:22:13,146 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.nonlin_attention.balancer.prob, batch_count=103660.0, ans=0.125 +2024-01-15 22:22:28,346 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.bypass.scale_min, batch_count=103693.33333333333, ans=0.2 +2024-01-15 22:22:30,568 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.ff2_skip_rate, batch_count=103693.33333333333, ans=0.0 +2024-01-15 22:22:41,843 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.attention_skip_rate, batch_count=103726.66666666667, ans=0.0 +2024-01-15 22:22:46,437 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.0.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 22:23:01,425 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.scale_min, batch_count=103793.33333333333, ans=0.2 +2024-01-15 22:23:01,427 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=103793.33333333333, ans=0.1 +2024-01-15 22:23:11,432 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.420e+02 1.720e+02 2.004e+02 2.343e+02 4.090e+02, threshold=4.009e+02, percent-clipped=1.0 +2024-01-15 22:23:12,641 INFO [train.py:994] (1/2) Epoch 37, batch 800, loss[loss=0.1319, simple_loss=0.2156, pruned_loss=0.02415, over 24515.00 frames. ], tot_loss[loss=0.1382, simple_loss=0.2188, pruned_loss=0.02877, over 4721742.23 frames. ], batch size: 267, lr: 1.13e-02, grad_scale: 32.0 +2024-01-15 22:23:28,067 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=103860.0, ans=0.125 +2024-01-15 22:23:45,757 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer_ff3.min_abs, batch_count=103893.33333333333, ans=0.2 +2024-01-15 22:23:57,897 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=103960.0, ans=0.125 +2024-01-15 22:24:23,950 INFO [train.py:994] (1/2) Epoch 38, batch 0, loss[loss=0.1443, simple_loss=0.2235, pruned_loss=0.03257, over 24492.00 frames. ], tot_loss[loss=0.1443, simple_loss=0.2235, pruned_loss=0.03257, over 24492.00 frames. ], batch size: 181, lr: 1.11e-02, grad_scale: 32.0 +2024-01-15 22:24:23,950 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 22:24:44,665 INFO [train.py:1026] (1/2) Epoch 38, validation: loss=0.1668, simple_loss=0.2482, pruned_loss=0.04272, over 1622729.00 frames. +2024-01-15 22:24:44,666 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 22:24:45,468 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=384, metric=22.22 vs. limit=22.5 +2024-01-15 22:24:48,558 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=103970.0, ans=0.125 +2024-01-15 22:24:54,479 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer1.prob, batch_count=103970.0, ans=0.125 +2024-01-15 22:25:00,305 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.whiten, num_groups=1, num_channels=512, metric=4.77 vs. 
limit=12.0 +2024-01-15 22:25:14,733 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward3.hidden_balancer.prob, batch_count=104036.66666666667, ans=0.125 +2024-01-15 22:25:46,217 INFO [train.py:994] (1/2) Epoch 38, batch 50, loss[loss=0.1564, simple_loss=0.2296, pruned_loss=0.04161, over 24388.00 frames. ], tot_loss[loss=0.1374, simple_loss=0.218, pruned_loss=0.02838, over 1087553.02 frames. ], batch size: 159, lr: 1.11e-02, grad_scale: 32.0 +2024-01-15 22:25:53,869 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.467e+02 1.647e+02 1.866e+02 2.059e+02 3.431e+02, threshold=3.732e+02, percent-clipped=0.0 +2024-01-15 22:26:01,693 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=104170.0, ans=0.1 +2024-01-15 22:26:08,516 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=104170.0, ans=0.0 +2024-01-15 22:26:23,346 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=104236.66666666667, ans=0.125 +2024-01-15 22:26:32,195 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=104236.66666666667, ans=0.125 +2024-01-15 22:26:39,780 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=104270.0, ans=0.0 +2024-01-15 22:26:47,755 INFO [train.py:994] (1/2) Epoch 38, batch 100, loss[loss=0.1212, simple_loss=0.2009, pruned_loss=0.02075, over 24492.00 frames. ], tot_loss[loss=0.1352, simple_loss=0.2157, pruned_loss=0.02734, over 1908781.60 frames. ], batch size: 148, lr: 1.11e-02, grad_scale: 32.0 +2024-01-15 22:26:51,745 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 22:26:56,977 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 22:27:27,476 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=104403.33333333333, ans=0.1 +2024-01-15 22:27:28,176 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=104403.33333333333, ans=0.0 +2024-01-15 22:27:49,337 INFO [train.py:994] (1/2) Epoch 38, batch 150, loss[loss=0.1362, simple_loss=0.217, pruned_loss=0.02771, over 24541.00 frames. ], tot_loss[loss=0.137, simple_loss=0.2176, pruned_loss=0.02823, over 2547102.17 frames. ], batch size: 193, lr: 1.11e-02, grad_scale: 32.0 +2024-01-15 22:27:55,028 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.hidden_balancer.prob, batch_count=104470.0, ans=0.125 +2024-01-15 22:27:57,184 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.421e+02 1.711e+02 1.943e+02 2.161e+02 3.646e+02, threshold=3.887e+02, percent-clipped=0.0 +2024-01-15 22:28:04,910 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=14.09 vs. 
limit=22.5 +2024-01-15 22:28:19,846 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=104536.66666666667, ans=0.125 +2024-01-15 22:28:25,788 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.scale_min, batch_count=104570.0, ans=0.2 +2024-01-15 22:28:51,159 INFO [train.py:994] (1/2) Epoch 38, batch 200, loss[loss=0.138, simple_loss=0.2204, pruned_loss=0.02784, over 24503.00 frames. ], tot_loss[loss=0.1377, simple_loss=0.2186, pruned_loss=0.0284, over 3059519.86 frames. ], batch size: 229, lr: 1.11e-02, grad_scale: 32.0 +2024-01-15 22:29:09,332 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.feed_forward1.hidden_balancer.prob, batch_count=104670.0, ans=0.125 +2024-01-15 22:29:09,985 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=9.43 vs. limit=10.0 +2024-01-15 22:29:27,264 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer1.prob, batch_count=104736.66666666667, ans=0.125 +2024-01-15 22:29:32,734 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=104736.66666666667, ans=0.125 +2024-01-15 22:29:53,051 INFO [train.py:994] (1/2) Epoch 38, batch 250, loss[loss=0.1365, simple_loss=0.2156, pruned_loss=0.02868, over 24610.00 frames. ], tot_loss[loss=0.1378, simple_loss=0.2185, pruned_loss=0.02855, over 3452666.63 frames. ], batch size: 199, lr: 1.11e-02, grad_scale: 16.0 +2024-01-15 22:30:01,871 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.373e+02 1.688e+02 1.935e+02 2.329e+02 4.264e+02, threshold=3.870e+02, percent-clipped=1.0 +2024-01-15 22:30:17,047 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer_na.min_abs, batch_count=104870.0, ans=0.02 +2024-01-15 22:30:18,798 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=15.49 vs. limit=15.0 +2024-01-15 22:30:42,014 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer2.prob, batch_count=104936.66666666667, ans=0.125 +2024-01-15 22:30:55,914 INFO [train.py:994] (1/2) Epoch 38, batch 300, loss[loss=0.1383, simple_loss=0.2193, pruned_loss=0.02868, over 24523.00 frames. ], tot_loss[loss=0.1384, simple_loss=0.2192, pruned_loss=0.02883, over 3766661.23 frames. ], batch size: 236, lr: 1.11e-02, grad_scale: 16.0 +2024-01-15 22:31:19,854 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module1.balancer2.prob, batch_count=105036.66666666667, ans=0.125 +2024-01-15 22:31:42,767 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module1.balancer2.prob, batch_count=105070.0, ans=0.125 +2024-01-15 22:31:51,707 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_skip_rate, batch_count=105103.33333333333, ans=0.0 +2024-01-15 22:31:57,308 INFO [train.py:994] (1/2) Epoch 38, batch 350, loss[loss=0.1344, simple_loss=0.2204, pruned_loss=0.02426, over 23912.00 frames. ], tot_loss[loss=0.1383, simple_loss=0.2193, pruned_loss=0.02871, over 4008521.57 frames. 
], batch size: 328, lr: 1.11e-02, grad_scale: 16.0 +2024-01-15 22:32:06,171 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.427e+02 1.724e+02 1.882e+02 2.215e+02 3.681e+02, threshold=3.763e+02, percent-clipped=0.0 +2024-01-15 22:32:17,946 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=288, metric=3.57 vs. limit=10.0 +2024-01-15 22:32:21,220 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.2.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 22:32:21,229 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.attention_skip_rate, batch_count=105203.33333333333, ans=0.0 +2024-01-15 22:32:29,343 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.conv_module2.whiten, num_groups=1, num_channels=512, metric=8.86 vs. limit=15.0 +2024-01-15 22:32:30,206 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.balancer2.prob, batch_count=105203.33333333333, ans=0.125 +2024-01-15 22:32:33,768 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.prob, batch_count=105236.66666666667, ans=0.125 +2024-01-15 22:32:42,809 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.nonlin_attention.balancer.min_positive, batch_count=105236.66666666667, ans=0.05 +2024-01-15 22:32:51,806 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=8.36 vs. limit=15.0 +2024-01-15 22:32:59,800 INFO [train.py:994] (1/2) Epoch 38, batch 400, loss[loss=0.133, simple_loss=0.2176, pruned_loss=0.02425, over 24533.00 frames. ], tot_loss[loss=0.138, simple_loss=0.2189, pruned_loss=0.02852, over 4192725.50 frames. ], batch size: 236, lr: 1.10e-02, grad_scale: 32.0 +2024-01-15 22:33:06,557 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.attention_skip_rate, batch_count=105303.33333333333, ans=0.0 +2024-01-15 22:33:19,796 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_skip_rate, batch_count=105336.66666666667, ans=0.0 +2024-01-15 22:33:49,771 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=105436.66666666667, ans=0.125 +2024-01-15 22:33:49,949 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.self_attn2.whiten.whitening_limit, batch_count=105436.66666666667, ans=22.5 +2024-01-15 22:34:02,000 INFO [train.py:994] (1/2) Epoch 38, batch 450, loss[loss=0.145, simple_loss=0.2175, pruned_loss=0.03626, over 24374.00 frames. ], tot_loss[loss=0.1375, simple_loss=0.2185, pruned_loss=0.02829, over 4329590.79 frames. 
], batch size: 153, lr: 1.10e-02, grad_scale: 32.0 +2024-01-15 22:34:10,360 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.421e+02 1.663e+02 1.877e+02 2.191e+02 2.912e+02, threshold=3.754e+02, percent-clipped=0.0 +2024-01-15 22:34:22,415 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=105503.33333333333, ans=0.1 +2024-01-15 22:34:25,941 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.prob, batch_count=105536.66666666667, ans=0.125 +2024-01-15 22:34:32,264 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=8.43 vs. limit=15.0 +2024-01-15 22:34:48,779 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 22:34:57,286 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.scale_min, batch_count=105603.33333333333, ans=0.2 +2024-01-15 22:35:03,884 INFO [train.py:994] (1/2) Epoch 38, batch 500, loss[loss=0.1395, simple_loss=0.2226, pruned_loss=0.02819, over 24490.00 frames. ], tot_loss[loss=0.1374, simple_loss=0.2185, pruned_loss=0.02817, over 4434421.80 frames. ], batch size: 210, lr: 1.10e-02, grad_scale: 16.0 +2024-01-15 22:35:12,619 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=16.41 vs. limit=15.0 +2024-01-15 22:35:41,798 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer1.max_abs, batch_count=105736.66666666667, ans=10.0 +2024-01-15 22:35:56,753 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.bypass.skip_rate, batch_count=105770.0, ans=0.09899494936611666 +2024-01-15 22:36:06,551 INFO [train.py:994] (1/2) Epoch 38, batch 550, loss[loss=0.1444, simple_loss=0.2177, pruned_loss=0.03554, over 24369.00 frames. ], tot_loss[loss=0.1366, simple_loss=0.2176, pruned_loss=0.02781, over 4517010.56 frames. 
], batch size: 153, lr: 1.10e-02, grad_scale: 16.0 +2024-01-15 22:36:15,912 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.386e+02 1.718e+02 2.000e+02 2.477e+02 3.890e+02, threshold=4.000e+02, percent-clipped=2.0 +2024-01-15 22:36:29,420 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer1.max_abs, batch_count=105836.66666666667, ans=10.0 +2024-01-15 22:36:33,053 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=105870.0, ans=0.125 +2024-01-15 22:36:35,478 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass_mid.scale_min, batch_count=105870.0, ans=0.2 +2024-01-15 22:36:47,426 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer1.prob, batch_count=105903.33333333333, ans=0.125 +2024-01-15 22:36:49,186 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=105903.33333333333, ans=0.0 +2024-01-15 22:36:53,450 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.attention_skip_rate, batch_count=105903.33333333333, ans=0.0 +2024-01-15 22:37:08,517 INFO [train.py:994] (1/2) Epoch 38, batch 600, loss[loss=0.1314, simple_loss=0.2106, pruned_loss=0.02604, over 24338.00 frames. ], tot_loss[loss=0.1365, simple_loss=0.2174, pruned_loss=0.02777, over 4569528.39 frames. ], batch size: 298, lr: 1.10e-02, grad_scale: 16.0 +2024-01-15 22:37:24,346 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer2.prob, batch_count=106003.33333333333, ans=0.125 +2024-01-15 22:37:39,585 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff3_skip_rate, batch_count=106036.66666666667, ans=0.0 +2024-01-15 22:37:53,530 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=106070.0, ans=0.1 +2024-01-15 22:37:59,512 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.skip_rate, batch_count=106103.33333333333, ans=0.09899494936611666 +2024-01-15 22:38:03,129 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer2.prob, batch_count=106103.33333333333, ans=0.125 +2024-01-15 22:38:03,136 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer1.prob, batch_count=106103.33333333333, ans=0.125 +2024-01-15 22:38:11,077 INFO [train.py:994] (1/2) Epoch 38, batch 650, loss[loss=0.1446, simple_loss=0.2303, pruned_loss=0.0294, over 23883.00 frames. ], tot_loss[loss=0.1358, simple_loss=0.2165, pruned_loss=0.02759, over 4611489.47 frames. 
], batch size: 328, lr: 1.10e-02, grad_scale: 16.0 +2024-01-15 22:38:18,453 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 22:38:20,425 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.326e+02 1.653e+02 1.885e+02 2.174e+02 2.953e+02, threshold=3.771e+02, percent-clipped=0.0 +2024-01-15 22:38:38,526 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass_mid.scale_min, batch_count=106203.33333333333, ans=0.2 +2024-01-15 22:38:52,460 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer1.prob, batch_count=106236.66666666667, ans=0.125 +2024-01-15 22:39:11,936 INFO [train.py:994] (1/2) Epoch 38, batch 700, loss[loss=0.1334, simple_loss=0.2195, pruned_loss=0.02359, over 24332.00 frames. ], tot_loss[loss=0.1361, simple_loss=0.2169, pruned_loss=0.02765, over 4647295.30 frames. ], batch size: 298, lr: 1.10e-02, grad_scale: 16.0 +2024-01-15 22:39:39,306 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module1.balancer2.prob, batch_count=106370.0, ans=0.125 +2024-01-15 22:39:44,088 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff3_skip_rate, batch_count=106370.0, ans=0.0 +2024-01-15 22:40:06,502 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward2.hidden_balancer.prob, batch_count=106436.66666666667, ans=0.125 +2024-01-15 22:40:13,872 INFO [train.py:994] (1/2) Epoch 38, batch 750, loss[loss=0.1513, simple_loss=0.2293, pruned_loss=0.03666, over 24486.00 frames. ], tot_loss[loss=0.1364, simple_loss=0.2173, pruned_loss=0.02773, over 4697115.67 frames. ], batch size: 181, lr: 1.10e-02, grad_scale: 16.0 +2024-01-15 22:40:23,819 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.394e+02 1.724e+02 1.869e+02 2.193e+02 3.130e+02, threshold=3.738e+02, percent-clipped=0.0 +2024-01-15 22:40:27,634 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer1.prob, batch_count=106503.33333333333, ans=0.125 +2024-01-15 22:40:41,463 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=106536.66666666667, ans=0.0 +2024-01-15 22:40:48,198 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.nonlin_attention.balancer.prob, batch_count=106536.66666666667, ans=0.125 +2024-01-15 22:40:54,285 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=5.21 vs. limit=10.0 +2024-01-15 22:41:02,455 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass.scale_min, batch_count=106603.33333333333, ans=0.2 +2024-01-15 22:41:08,200 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass_mid.scale_min, batch_count=106603.33333333333, ans=0.2 +2024-01-15 22:41:11,112 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=192, metric=10.76 vs. limit=15.0 +2024-01-15 22:41:12,533 INFO [train.py:994] (1/2) Epoch 38, batch 800, loss[loss=0.1191, simple_loss=0.195, pruned_loss=0.02154, over 23990.00 frames. ], tot_loss[loss=0.1364, simple_loss=0.2171, pruned_loss=0.02786, over 4724257.91 frames. 
], batch size: 131, lr: 1.10e-02, grad_scale: 32.0 +2024-01-15 22:41:14,482 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=6.79 vs. limit=15.0 +2024-01-15 22:41:30,325 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=106670.0, ans=0.1 +2024-01-15 22:41:35,822 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=106670.0, ans=0.125 +2024-01-15 22:41:45,801 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 22:41:58,453 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=9.28 vs. limit=15.0 +2024-01-15 22:42:24,294 INFO [train.py:994] (1/2) Epoch 39, batch 0, loss[loss=0.1361, simple_loss=0.2179, pruned_loss=0.02715, over 24462.00 frames. ], tot_loss[loss=0.1361, simple_loss=0.2179, pruned_loss=0.02715, over 24462.00 frames. ], batch size: 216, lr: 1.08e-02, grad_scale: 32.0 +2024-01-15 22:42:24,294 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 22:42:44,901 INFO [zipformer.py:1858] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([1.8761, 3.2037, 3.3225, 2.5775], device='cuda:1') +2024-01-15 22:42:45,267 INFO [train.py:1026] (1/2) Epoch 39, validation: loss=0.1671, simple_loss=0.2478, pruned_loss=0.04323, over 1622729.00 frames. +2024-01-15 22:42:45,267 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 22:42:59,726 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass_mid.scale_min, batch_count=106813.33333333333, ans=0.2 +2024-01-15 22:43:03,615 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.446e+02 1.753e+02 1.942e+02 2.163e+02 3.117e+02, threshold=3.885e+02, percent-clipped=0.0 +2024-01-15 22:43:21,144 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.scale_min, batch_count=106880.0, ans=0.2 +2024-01-15 22:43:22,417 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_skip_rate, batch_count=106880.0, ans=0.0 +2024-01-15 22:43:30,610 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff3_skip_rate, batch_count=106880.0, ans=0.0 +2024-01-15 22:43:46,618 INFO [train.py:994] (1/2) Epoch 39, batch 50, loss[loss=0.1226, simple_loss=0.1998, pruned_loss=0.02276, over 24240.00 frames. ], tot_loss[loss=0.1335, simple_loss=0.2147, pruned_loss=0.0261, over 1088965.05 frames. ], batch size: 140, lr: 1.08e-02, grad_scale: 32.0 +2024-01-15 22:44:19,733 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.scale_min, batch_count=107013.33333333333, ans=0.2 +2024-01-15 22:44:43,066 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=8.94 vs. limit=15.0 +2024-01-15 22:44:49,779 INFO [train.py:994] (1/2) Epoch 39, batch 100, loss[loss=0.1461, simple_loss=0.2205, pruned_loss=0.03579, over 24420.00 frames. ], tot_loss[loss=0.1364, simple_loss=0.217, pruned_loss=0.0279, over 1913705.21 frames. 
], batch size: 159, lr: 1.08e-02, grad_scale: 32.0 +2024-01-15 22:45:00,562 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=107146.66666666667, ans=0.1 +2024-01-15 22:45:08,970 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.490e+02 1.665e+02 1.865e+02 2.261e+02 2.773e+02, threshold=3.730e+02, percent-clipped=0.0 +2024-01-15 22:45:19,216 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.ff3_skip_rate, batch_count=107180.0, ans=0.0 +2024-01-15 22:45:51,052 INFO [train.py:994] (1/2) Epoch 39, batch 150, loss[loss=0.1286, simple_loss=0.2131, pruned_loss=0.02207, over 24474.00 frames. ], tot_loss[loss=0.136, simple_loss=0.2171, pruned_loss=0.02741, over 2567591.26 frames. ], batch size: 267, lr: 1.08e-02, grad_scale: 16.0 +2024-01-15 22:45:51,287 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=107280.0, ans=0.0 +2024-01-15 22:45:59,269 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=107280.0, ans=0.0 +2024-01-15 22:46:01,596 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=107280.0, ans=0.0 +2024-01-15 22:46:03,428 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.attention_skip_rate, batch_count=107313.33333333333, ans=0.0 +2024-01-15 22:46:13,864 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=8.10 vs. limit=15.0 +2024-01-15 22:46:18,202 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=2.72 vs. limit=15.0 +2024-01-15 22:46:22,780 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff2_skip_rate, batch_count=107346.66666666667, ans=0.0 +2024-01-15 22:46:54,032 INFO [train.py:994] (1/2) Epoch 39, batch 200, loss[loss=0.1376, simple_loss=0.2224, pruned_loss=0.02638, over 24473.00 frames. ], tot_loss[loss=0.1364, simple_loss=0.2174, pruned_loss=0.02768, over 3053450.39 frames. ], batch size: 267, lr: 1.08e-02, grad_scale: 16.0 +2024-01-15 22:47:02,092 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 22:47:13,484 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.380e+02 1.667e+02 1.845e+02 2.071e+02 3.015e+02, threshold=3.690e+02, percent-clipped=0.0 +2024-01-15 22:47:16,822 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=107480.0, ans=0.1 +2024-01-15 22:47:38,724 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=107546.66666666667, ans=0.125 +2024-01-15 22:47:56,363 INFO [train.py:994] (1/2) Epoch 39, batch 250, loss[loss=0.1402, simple_loss=0.2248, pruned_loss=0.02774, over 22314.00 frames. ], tot_loss[loss=0.1366, simple_loss=0.2178, pruned_loss=0.02773, over 3441532.49 frames. ], batch size: 357, lr: 1.08e-02, grad_scale: 16.0 +2024-01-15 22:48:19,458 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.whiten, num_groups=1, num_channels=384, metric=2.95 vs. 
limit=12.0 +2024-01-15 22:48:21,535 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass.skip_rate, batch_count=107680.0, ans=0.07 +2024-01-15 22:48:32,571 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer1.prob, batch_count=107713.33333333333, ans=0.125 +2024-01-15 22:48:34,915 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=107713.33333333333, ans=0.125 +2024-01-15 22:48:39,720 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=107713.33333333333, ans=0.0 +2024-01-15 22:48:58,308 INFO [train.py:994] (1/2) Epoch 39, batch 300, loss[loss=0.147, simple_loss=0.2275, pruned_loss=0.03324, over 24477.00 frames. ], tot_loss[loss=0.1364, simple_loss=0.2175, pruned_loss=0.02763, over 3743542.24 frames. ], batch size: 187, lr: 1.08e-02, grad_scale: 16.0 +2024-01-15 22:49:17,961 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.422e+02 1.701e+02 1.871e+02 2.220e+02 3.630e+02, threshold=3.743e+02, percent-clipped=0.0 +2024-01-15 22:49:25,582 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.convnext.out_balancer.prob, batch_count=107846.66666666667, ans=0.125 +2024-01-15 22:49:42,855 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.balancer1.prob, batch_count=107880.0, ans=0.125 +2024-01-15 22:49:42,921 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.ff3_skip_rate, batch_count=107880.0, ans=0.0 +2024-01-15 22:49:44,372 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=384, metric=13.23 vs. limit=22.5 +2024-01-15 22:50:00,447 INFO [train.py:994] (1/2) Epoch 39, batch 350, loss[loss=0.1426, simple_loss=0.2297, pruned_loss=0.02773, over 22580.00 frames. ], tot_loss[loss=0.1368, simple_loss=0.218, pruned_loss=0.02777, over 3989489.72 frames. ], batch size: 357, lr: 1.08e-02, grad_scale: 16.0 +2024-01-15 22:50:04,494 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=384, metric=16.48 vs. limit=22.5 +2024-01-15 22:50:05,375 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_skip_rate, batch_count=107946.66666666667, ans=0.0 +2024-01-15 22:50:07,711 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer2.prob, batch_count=107946.66666666667, ans=0.125 +2024-01-15 22:50:11,970 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.scale_min, batch_count=107980.0, ans=0.2 +2024-01-15 22:50:13,227 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.scale_min, batch_count=107980.0, ans=0.2 +2024-01-15 22:50:14,860 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn2.whiten, num_groups=1, num_channels=192, metric=14.39 vs. 
limit=22.5 +2024-01-15 22:50:18,985 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=107980.0, ans=0.1 +2024-01-15 22:50:33,554 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.whiten.whitening_limit, batch_count=108013.33333333333, ans=15.0 +2024-01-15 22:50:55,748 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer1.min_positive, batch_count=108080.0, ans=0.025 +2024-01-15 22:51:02,423 INFO [train.py:994] (1/2) Epoch 39, batch 400, loss[loss=0.1332, simple_loss=0.2137, pruned_loss=0.02632, over 24493.00 frames. ], tot_loss[loss=0.1362, simple_loss=0.2171, pruned_loss=0.02767, over 4149463.36 frames. ], batch size: 216, lr: 1.08e-02, grad_scale: 32.0 +2024-01-15 22:51:04,051 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=108113.33333333333, ans=0.125 +2024-01-15 22:51:15,516 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=12.78 vs. limit=15.0 +2024-01-15 22:51:22,584 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.422e+02 1.708e+02 1.948e+02 2.231e+02 3.344e+02, threshold=3.895e+02, percent-clipped=0.0 +2024-01-15 22:51:50,000 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff3_skip_rate, batch_count=108213.33333333333, ans=0.0 +2024-01-15 22:52:04,611 INFO [train.py:994] (1/2) Epoch 39, batch 450, loss[loss=0.1476, simple_loss=0.2245, pruned_loss=0.03537, over 24521.00 frames. ], tot_loss[loss=0.136, simple_loss=0.2168, pruned_loss=0.02755, over 4298274.14 frames. ], batch size: 193, lr: 1.08e-02, grad_scale: 32.0 +2024-01-15 22:52:08,363 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=108280.0, ans=0.0 +2024-01-15 22:52:11,881 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer1.prob, batch_count=108280.0, ans=0.125 +2024-01-15 22:52:25,139 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=108313.33333333333, ans=0.125 +2024-01-15 22:52:26,255 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.nonlin_attention.balancer.prob, batch_count=108313.33333333333, ans=0.125 +2024-01-15 22:52:26,381 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff3_skip_rate, batch_count=108313.33333333333, ans=0.0 +2024-01-15 22:52:52,007 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=5.81 vs. limit=6.0 +2024-01-15 22:52:52,997 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=384, metric=5.97 vs. 
limit=10.0 +2024-01-15 22:53:04,298 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer1.prob, batch_count=108413.33333333333, ans=0.125 +2024-01-15 22:53:04,337 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=108413.33333333333, ans=0.1 +2024-01-15 22:53:07,083 INFO [train.py:994] (1/2) Epoch 39, batch 500, loss[loss=0.1309, simple_loss=0.2168, pruned_loss=0.02249, over 24501.00 frames. ], tot_loss[loss=0.1363, simple_loss=0.2172, pruned_loss=0.02771, over 4408053.35 frames. ], batch size: 267, lr: 1.08e-02, grad_scale: 32.0 +2024-01-15 22:53:07,440 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff2_skip_rate, batch_count=108446.66666666667, ans=0.0 +2024-01-15 22:53:26,415 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.459e+02 1.682e+02 1.844e+02 2.205e+02 3.574e+02, threshold=3.688e+02, percent-clipped=0.0 +2024-01-15 22:53:34,374 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_skip_rate, batch_count=108513.33333333333, ans=0.0 +2024-01-15 22:53:43,343 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=108546.66666666667, ans=0.125 +2024-01-15 22:53:43,861 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=9.05 vs. limit=15.0 +2024-01-15 22:54:04,288 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=108580.0, ans=0.1 +2024-01-15 22:54:08,753 INFO [train.py:994] (1/2) Epoch 39, batch 550, loss[loss=0.1333, simple_loss=0.2179, pruned_loss=0.0244, over 24460.00 frames. ], tot_loss[loss=0.1362, simple_loss=0.2171, pruned_loss=0.02761, over 4497675.31 frames. ], batch size: 267, lr: 1.08e-02, grad_scale: 32.0 +2024-01-15 22:54:09,016 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.bypass.skip_rate, batch_count=108613.33333333333, ans=0.07 +2024-01-15 22:54:11,862 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.self_attn1.whiten, num_groups=1, num_channels=384, metric=20.13 vs. limit=22.5 +2024-01-15 22:54:17,255 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.skip_rate, batch_count=108613.33333333333, ans=0.07 +2024-01-15 22:54:28,055 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff2_skip_rate, batch_count=108646.66666666667, ans=0.0 +2024-01-15 22:55:10,597 INFO [train.py:994] (1/2) Epoch 39, batch 600, loss[loss=0.1371, simple_loss=0.2147, pruned_loss=0.02978, over 24592.00 frames. ], tot_loss[loss=0.1361, simple_loss=0.2169, pruned_loss=0.02764, over 4555890.08 frames. ], batch size: 176, lr: 1.07e-02, grad_scale: 32.0 +2024-01-15 22:55:18,246 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=384, metric=2.86 vs. 
limit=15.0 +2024-01-15 22:55:28,382 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.attention_skip_rate, batch_count=108813.33333333333, ans=0.0 +2024-01-15 22:55:30,393 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.451e+02 1.623e+02 1.740e+02 1.924e+02 3.185e+02, threshold=3.479e+02, percent-clipped=0.0 +2024-01-15 22:55:39,420 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=384, metric=2.96 vs. limit=15.0 +2024-01-15 22:55:42,062 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer1.prob, batch_count=108846.66666666667, ans=0.125 +2024-01-15 22:55:45,109 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer2.prob, batch_count=108846.66666666667, ans=0.125 +2024-01-15 22:56:13,014 INFO [train.py:994] (1/2) Epoch 39, batch 650, loss[loss=0.1154, simple_loss=0.1955, pruned_loss=0.0176, over 23988.00 frames. ], tot_loss[loss=0.1356, simple_loss=0.2163, pruned_loss=0.02745, over 4606960.01 frames. ], batch size: 131, lr: 1.07e-02, grad_scale: 32.0 +2024-01-15 22:56:34,987 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer2.prob, batch_count=108980.0, ans=0.125 +2024-01-15 22:56:46,494 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=109013.33333333333, ans=0.125 +2024-01-15 22:56:47,800 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer2.prob, batch_count=109013.33333333333, ans=0.125 +2024-01-15 22:57:08,718 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.ff3_skip_rate, batch_count=109080.0, ans=0.0 +2024-01-15 22:57:14,413 INFO [train.py:994] (1/2) Epoch 39, batch 700, loss[loss=0.1471, simple_loss=0.2292, pruned_loss=0.03248, over 24504.00 frames. ], tot_loss[loss=0.1359, simple_loss=0.2169, pruned_loss=0.02751, over 4651302.28 frames. ], batch size: 165, lr: 1.07e-02, grad_scale: 32.0 +2024-01-15 22:57:34,442 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.385e+02 1.721e+02 2.012e+02 2.362e+02 3.911e+02, threshold=4.024e+02, percent-clipped=3.0 +2024-01-15 22:57:35,908 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.out_combiner.scale_min, batch_count=109146.66666666667, ans=0.2 +2024-01-15 22:57:38,338 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer1.prob, batch_count=109180.0, ans=0.125 +2024-01-15 22:58:06,848 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=109246.66666666667, ans=0.1 +2024-01-15 22:58:09,179 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=109246.66666666667, ans=0.125 +2024-01-15 22:58:16,418 INFO [train.py:994] (1/2) Epoch 39, batch 750, loss[loss=0.1503, simple_loss=0.2297, pruned_loss=0.03552, over 24477.00 frames. ], tot_loss[loss=0.1355, simple_loss=0.2163, pruned_loss=0.0274, over 4677292.51 frames. 
], batch size: 222, lr: 1.07e-02, grad_scale: 32.0 +2024-01-15 22:58:22,072 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=109280.0, ans=0.1 +2024-01-15 22:58:50,672 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=109346.66666666667, ans=0.0 +2024-01-15 22:59:05,285 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=16.65 vs. limit=22.5 +2024-01-15 22:59:07,222 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=109413.33333333333, ans=0.1 +2024-01-15 22:59:12,002 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=14.27 vs. limit=15.0 +2024-01-15 22:59:15,874 INFO [train.py:994] (1/2) Epoch 39, batch 800, loss[loss=0.1454, simple_loss=0.2253, pruned_loss=0.03272, over 24512.00 frames. ], tot_loss[loss=0.136, simple_loss=0.2167, pruned_loss=0.02767, over 4713901.08 frames. ], batch size: 165, lr: 1.07e-02, grad_scale: 32.0 +2024-01-15 22:59:21,006 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=109446.66666666667, ans=0.125 +2024-01-15 22:59:34,622 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.413e+02 1.634e+02 1.767e+02 1.970e+02 3.701e+02, threshold=3.534e+02, percent-clipped=0.0 +2024-01-15 22:59:53,944 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass.skip_rate, batch_count=109546.66666666667, ans=0.07 +2024-01-15 22:59:58,643 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.conv_module1.balancer2.prob, batch_count=109546.66666666667, ans=0.125 +2024-01-15 23:00:27,701 INFO [train.py:994] (1/2) Epoch 40, batch 0, loss[loss=0.1471, simple_loss=0.2267, pruned_loss=0.0338, over 24523.00 frames. ], tot_loss[loss=0.1471, simple_loss=0.2267, pruned_loss=0.0338, over 24523.00 frames. ], batch size: 243, lr: 1.06e-02, grad_scale: 32.0 +2024-01-15 23:00:27,701 INFO [train.py:1017] (1/2) Computing validation loss +2024-01-15 23:00:47,946 INFO [train.py:1026] (1/2) Epoch 40, validation: loss=0.1662, simple_loss=0.2482, pruned_loss=0.04211, over 1622729.00 frames. +2024-01-15 23:00:47,946 INFO [train.py:1027] (1/2) Maximum memory allocated so far is 15882MB +2024-01-15 23:00:50,521 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.8.prob, batch_count=109590.0, ans=0.125 +2024-01-15 23:01:10,915 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer1.prob, batch_count=109623.33333333333, ans=0.125 +2024-01-15 23:01:21,455 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.balancer2.prob, batch_count=109656.66666666667, ans=0.125 +2024-01-15 23:01:28,776 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=109690.0, ans=0.1 +2024-01-15 23:01:39,206 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff3_skip_rate, batch_count=109723.33333333333, ans=0.0 +2024-01-15 23:01:49,405 INFO [train.py:994] (1/2) Epoch 40, batch 50, loss[loss=0.1297, simple_loss=0.208, pruned_loss=0.0257, over 24506.00 frames. 
], tot_loss[loss=0.1326, simple_loss=0.2132, pruned_loss=0.02598, over 1076844.39 frames. ], batch size: 243, lr: 1.06e-02, grad_scale: 32.0 +2024-01-15 23:01:57,602 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass_mid.scale_min, batch_count=109756.66666666667, ans=0.2 +2024-01-15 23:02:03,912 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.nonlin_attention.whiten2, num_groups=1, num_channels=512, metric=5.52 vs. limit=15.0 +2024-01-15 23:02:04,611 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder_embed.conv.2.prob, batch_count=109790.0, ans=0.125 +2024-01-15 23:02:08,838 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=7.24 vs. limit=15.0 +2024-01-15 23:02:18,486 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.360e+02 1.725e+02 1.948e+02 2.244e+02 4.405e+02, threshold=3.895e+02, percent-clipped=1.0 +2024-01-15 23:02:24,519 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.min_positive, batch_count=109823.33333333333, ans=0.05 +2024-01-15 23:02:25,757 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.min_positive, batch_count=109856.66666666667, ans=0.025 +2024-01-15 23:02:51,522 INFO [train.py:994] (1/2) Epoch 40, batch 100, loss[loss=0.138, simple_loss=0.2211, pruned_loss=0.0275, over 24444.00 frames. ], tot_loss[loss=0.1353, simple_loss=0.2156, pruned_loss=0.02746, over 1910524.74 frames. ], batch size: 267, lr: 1.06e-02, grad_scale: 32.0 +2024-01-15 23:03:01,230 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff3_skip_rate, batch_count=109923.33333333333, ans=0.0 +2024-01-15 23:03:01,231 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.skip_rate, batch_count=109923.33333333333, ans=0.04949747468305833 +2024-01-15 23:03:09,196 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=10.89 vs. limit=15.0 +2024-01-15 23:03:10,548 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=109956.66666666667, ans=0.1 +2024-01-15 23:03:11,767 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.ff2_skip_rate, batch_count=109956.66666666667, ans=0.0 +2024-01-15 23:03:23,850 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=109990.0, ans=0.125 +2024-01-15 23:03:53,390 INFO [train.py:994] (1/2) Epoch 40, batch 150, loss[loss=0.1358, simple_loss=0.2142, pruned_loss=0.02869, over 24543.00 frames. ], tot_loss[loss=0.1357, simple_loss=0.2166, pruned_loss=0.02744, over 2563579.41 frames. ], batch size: 243, lr: 1.06e-02, grad_scale: 32.0 +2024-01-15 23:03:53,673 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=110090.0, ans=0.125 +2024-01-15 23:03:54,109 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.conv_module2.whiten, num_groups=1, num_channels=384, metric=2.82 vs. 
limit=15.0 +2024-01-15 23:03:54,775 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=110090.0, ans=0.1 +2024-01-15 23:04:11,062 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=6.10 vs. limit=10.0 +2024-01-15 23:04:17,643 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_whiten, num_groups=1, num_channels=512, metric=6.44 vs. limit=15.0 +2024-01-15 23:04:21,716 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.371e+02 1.612e+02 1.772e+02 1.915e+02 2.986e+02, threshold=3.544e+02, percent-clipped=0.0 +2024-01-15 23:04:31,987 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=110190.0, ans=0.125 +2024-01-15 23:04:56,636 INFO [train.py:994] (1/2) Epoch 40, batch 200, loss[loss=0.1489, simple_loss=0.2264, pruned_loss=0.03567, over 24448.00 frames. ], tot_loss[loss=0.1354, simple_loss=0.2163, pruned_loss=0.02724, over 3061588.68 frames. ], batch size: 170, lr: 1.05e-02, grad_scale: 32.0 +2024-01-15 23:05:01,680 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=110256.66666666667, ans=0.0 +2024-01-15 23:05:16,693 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=110290.0, ans=0.0 +2024-01-15 23:05:20,139 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=110323.33333333333, ans=0.0 +2024-01-15 23:05:58,331 INFO [train.py:994] (1/2) Epoch 40, batch 250, loss[loss=0.1407, simple_loss=0.2262, pruned_loss=0.02758, over 24514.00 frames. ], tot_loss[loss=0.136, simple_loss=0.2172, pruned_loss=0.02736, over 3452733.92 frames. ], batch size: 193, lr: 1.05e-02, grad_scale: 32.0 +2024-01-15 23:06:18,696 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.min_positive, batch_count=110456.66666666667, ans=0.025 +2024-01-15 23:06:21,120 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass.scale_min, batch_count=110456.66666666667, ans=0.2 +2024-01-15 23:06:24,299 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.32 vs. limit=6.0 +2024-01-15 23:06:25,140 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.2.feed_forward3.out_whiten, num_groups=1, num_channels=384, metric=10.89 vs. limit=15.0 +2024-01-15 23:06:26,698 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.275e+02 1.651e+02 1.792e+02 2.080e+02 3.773e+02, threshold=3.583e+02, percent-clipped=1.0 +2024-01-15 23:06:58,439 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=110556.66666666667, ans=0.1 +2024-01-15 23:07:00,540 INFO [train.py:994] (1/2) Epoch 40, batch 300, loss[loss=0.1423, simple_loss=0.2227, pruned_loss=0.03096, over 24463.00 frames. ], tot_loss[loss=0.1359, simple_loss=0.217, pruned_loss=0.02739, over 3758227.45 frames. 
], batch size: 181, lr: 1.05e-02, grad_scale: 32.0 +2024-01-15 23:07:02,541 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=9.49 vs. limit=15.0 +2024-01-15 23:07:10,881 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 23:07:12,083 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.prob, batch_count=110623.33333333333, ans=0.125 +2024-01-15 23:07:20,843 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 23:07:21,970 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff3_skip_rate, batch_count=110623.33333333333, ans=0.0 +2024-01-15 23:08:02,775 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=110756.66666666667, ans=0.0 +2024-01-15 23:08:03,651 INFO [train.py:994] (1/2) Epoch 40, batch 350, loss[loss=0.1375, simple_loss=0.2228, pruned_loss=0.02607, over 24239.00 frames. ], tot_loss[loss=0.1358, simple_loss=0.217, pruned_loss=0.02734, over 3991809.91 frames. ], batch size: 311, lr: 1.05e-02, grad_scale: 32.0 +2024-01-15 23:08:32,145 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.357e+02 1.654e+02 1.827e+02 2.159e+02 3.996e+02, threshold=3.655e+02, percent-clipped=2.0 +2024-01-15 23:08:41,381 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward3.hidden_balancer.prob, batch_count=110856.66666666667, ans=0.125 +2024-01-15 23:09:05,332 INFO [train.py:994] (1/2) Epoch 40, batch 400, loss[loss=0.1224, simple_loss=0.2039, pruned_loss=0.02046, over 24318.00 frames. ], tot_loss[loss=0.135, simple_loss=0.2159, pruned_loss=0.02703, over 4159909.38 frames. ], batch size: 147, lr: 1.05e-02, grad_scale: 32.0 +2024-01-15 23:09:05,588 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=110923.33333333333, ans=0.0 +2024-01-15 23:09:07,431 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.scale_min, batch_count=110923.33333333333, ans=0.2 +2024-01-15 23:09:31,388 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.self_attn_weights.pos_emb_skip_rate, batch_count=110990.0, ans=0.0 +2024-01-15 23:09:53,050 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module1.balancer1.prob, batch_count=111023.33333333333, ans=0.125 +2024-01-15 23:10:07,369 INFO [train.py:994] (1/2) Epoch 40, batch 450, loss[loss=0.1515, simple_loss=0.2323, pruned_loss=0.03537, over 24573.00 frames. ], tot_loss[loss=0.135, simple_loss=0.2161, pruned_loss=0.027, over 4305194.10 frames. ], batch size: 176, lr: 1.05e-02, grad_scale: 32.0 +2024-01-15 23:10:36,316 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.450e+02 1.698e+02 1.858e+02 2.185e+02 3.048e+02, threshold=3.715e+02, percent-clipped=0.0 +2024-01-15 23:10:39,185 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=111156.66666666667, ans=0.1 +2024-01-15 23:10:40,755 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=512, metric=10.28 vs. 
limit=15.0 +2024-01-15 23:10:41,792 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=256, metric=12.24 vs. limit=22.5 +2024-01-15 23:10:42,663 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.balancer1.prob, batch_count=111156.66666666667, ans=0.125 +2024-01-15 23:10:49,210 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=111190.0, ans=0.125 +2024-01-15 23:11:06,279 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.prob, batch_count=111223.33333333333, ans=0.125 +2024-01-15 23:11:07,465 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer1.prob, batch_count=111223.33333333333, ans=0.125 +2024-01-15 23:11:09,531 INFO [train.py:994] (1/2) Epoch 40, batch 500, loss[loss=0.1367, simple_loss=0.2193, pruned_loss=0.02702, over 24427.00 frames. ], tot_loss[loss=0.135, simple_loss=0.216, pruned_loss=0.027, over 4410683.52 frames. ], batch size: 258, lr: 1.05e-02, grad_scale: 32.0 +2024-01-15 23:11:14,102 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=2.56 vs. limit=12.0 +2024-01-15 23:11:16,047 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass_mid.scale_min, batch_count=111256.66666666667, ans=0.2 +2024-01-15 23:11:19,596 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.attention_skip_rate, batch_count=111256.66666666667, ans=0.0 +2024-01-15 23:11:21,923 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=111290.0, ans=0.1 +2024-01-15 23:11:30,747 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff2_skip_rate, batch_count=111290.0, ans=0.0 +2024-01-15 23:11:31,993 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=111290.0, ans=0.125 +2024-01-15 23:11:32,291 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.3.feed_forward2.out_whiten, num_groups=1, num_channels=512, metric=10.81 vs. limit=15.0 +2024-01-15 23:11:41,170 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module1.balancer1.prob, batch_count=111323.33333333333, ans=0.125 +2024-01-15 23:11:43,556 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass_mid.scale_min, batch_count=111323.33333333333, ans=0.2 +2024-01-15 23:11:44,717 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.attention_skip_rate, batch_count=111323.33333333333, ans=0.0 +2024-01-15 23:11:52,319 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.conv_module2.balancer2.prob, batch_count=111356.66666666667, ans=0.125 +2024-01-15 23:11:55,935 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff2_skip_rate, batch_count=111356.66666666667, ans=0.0 +2024-01-15 23:12:11,764 INFO [train.py:994] (1/2) Epoch 40, batch 550, loss[loss=0.1511, simple_loss=0.2276, pruned_loss=0.03729, over 24524.00 frames. ], tot_loss[loss=0.135, simple_loss=0.2162, pruned_loss=0.02687, over 4500775.07 frames. 
], batch size: 165, lr: 1.05e-02, grad_scale: 32.0 +2024-01-15 23:12:14,393 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=111423.33333333333, ans=0.1 +2024-01-15 23:12:27,660 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass_mid.scale_min, batch_count=111456.66666666667, ans=0.2 +2024-01-15 23:12:34,818 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.bypass_mid.scale_min, batch_count=111456.66666666667, ans=0.2 +2024-01-15 23:12:40,492 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.335e+02 1.659e+02 1.800e+02 2.093e+02 3.891e+02, threshold=3.601e+02, percent-clipped=1.0 +2024-01-15 23:12:50,962 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=111523.33333333333, ans=0.125 +2024-01-15 23:12:55,517 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=384, metric=3.47 vs. limit=15.0 +2024-01-15 23:12:56,525 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.ff3_skip_rate, batch_count=111523.33333333333, ans=0.0 +2024-01-15 23:13:14,779 INFO [train.py:994] (1/2) Epoch 40, batch 600, loss[loss=0.1322, simple_loss=0.2153, pruned_loss=0.02459, over 24316.00 frames. ], tot_loss[loss=0.1348, simple_loss=0.2157, pruned_loss=0.02697, over 4544713.19 frames. ], batch size: 285, lr: 1.05e-02, grad_scale: 32.0 +2024-01-15 23:13:23,927 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.prob, batch_count=111590.0, ans=0.125 +2024-01-15 23:13:36,250 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=8.43 vs. limit=15.0 +2024-01-15 23:13:52,249 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.balancer.max_positive, batch_count=111690.0, ans=0.95 +2024-01-15 23:14:10,373 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.scale_min, batch_count=111723.33333333333, ans=0.2 +2024-01-15 23:14:16,601 INFO [train.py:994] (1/2) Epoch 40, batch 650, loss[loss=0.1372, simple_loss=0.2255, pruned_loss=0.02443, over 23853.00 frames. ], tot_loss[loss=0.1347, simple_loss=0.2156, pruned_loss=0.02691, over 4600407.34 frames. 
], batch size: 328, lr: 1.05e-02, grad_scale: 32.0 +2024-01-15 23:14:31,615 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.ff2_skip_rate, batch_count=111790.0, ans=0.0 +2024-01-15 23:14:44,874 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.423e+02 1.625e+02 1.766e+02 2.027e+02 4.095e+02, threshold=3.531e+02, percent-clipped=1.0 +2024-01-15 23:14:47,550 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.scale_min, batch_count=111823.33333333333, ans=0.2 +2024-01-15 23:14:52,298 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.2.feed_forward3.hidden_balancer.prob, batch_count=111856.66666666667, ans=0.125 +2024-01-15 23:14:59,975 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=111856.66666666667, ans=0.1 +2024-01-15 23:15:03,137 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=111856.66666666667, ans=0.1 +2024-01-15 23:15:18,236 INFO [train.py:994] (1/2) Epoch 40, batch 700, loss[loss=0.1464, simple_loss=0.233, pruned_loss=0.02993, over 22298.00 frames. ], tot_loss[loss=0.1353, simple_loss=0.2162, pruned_loss=0.02718, over 4651224.57 frames. ], batch size: 357, lr: 1.05e-02, grad_scale: 32.0 +2024-01-15 23:15:42,011 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.2.conv_module2.balancer2.prob, batch_count=111990.0, ans=0.125 +2024-01-15 23:15:58,968 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff3_skip_rate, batch_count=112023.33333333333, ans=0.0 +2024-01-15 23:16:20,891 INFO [train.py:994] (1/2) Epoch 40, batch 750, loss[loss=0.1103, simple_loss=0.1774, pruned_loss=0.02164, over 19147.00 frames. ], tot_loss[loss=0.1351, simple_loss=0.2161, pruned_loss=0.02704, over 4676759.09 frames. ], batch size: 83, lr: 1.05e-02, grad_scale: 32.0 +2024-01-15 23:16:30,611 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=112090.0, ans=0.125 +2024-01-15 23:16:49,273 WARNING [optim.py:484] (1/2) Clipping_scale=2.0, grad-norm quartiles 1.344e+02 1.684e+02 1.887e+02 2.105e+02 4.185e+02, threshold=3.775e+02, percent-clipped=1.0 +2024-01-15 23:16:56,508 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.conv_module2.balancer2.prob, batch_count=112190.0, ans=0.125 +2024-01-15 23:17:06,558 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=112190.0, ans=0.0 +2024-01-15 23:17:20,980 INFO [train.py:994] (1/2) Epoch 40, batch 800, loss[loss=0.1321, simple_loss=0.2151, pruned_loss=0.02455, over 24164.00 frames. ], tot_loss[loss=0.1349, simple_loss=0.2158, pruned_loss=0.02698, over 4698059.83 frames. ], batch size: 140, lr: 1.05e-02, grad_scale: 32.0 +2024-01-15 23:17:22,784 INFO [scaling.py:1022] (1/2) Whitening: name=encoder.encoders.0.layers.1.conv_module2.whiten, num_groups=1, num_channels=192, metric=3.44 vs. 
limit=15.0 +2024-01-15 23:17:31,042 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.3.nonlin_attention.balancer.max_positive, batch_count=112290.0, ans=0.95 +2024-01-15 23:17:43,341 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.2.feed_forward1.out_proj.dropout_p, batch_count=112323.33333333333, ans=0.1 +2024-01-15 23:17:52,516 INFO [scaling.py:213] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=112323.33333333333, ans=0.1 +2024-01-15 23:18:00,602 INFO [scaling.py:1118] (1/2) WithLoss: name=encoder.encoders.0.layers.1.self_attn_weights, loss-sum=0.000e+00 +2024-01-15 23:18:09,042 INFO [train.py:1256] (1/2) Done!