diff --git "a/exp/log/log-train-2023-05-10-14-59-59-1" "b/exp/log/log-train-2023-05-10-14-59-59-1"
new file mode 100644
--- /dev/null
+++ "b/exp/log/log-train-2023-05-10-14-59-59-1"
@@ -0,0 +1,941 @@
+2023-05-10 14:59:59,103 INFO [train.py:1091] (1/2) Training started
+2023-05-10 14:59:59,103 INFO [train.py:1101] (1/2) Device: cuda:1
+2023-05-10 14:59:59,107 INFO [train.py:1110] (1/2) {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 50, 'reset_interval': 200, 'valid_interval': 3000, 'feature_dim': 80, 'subsampling_factor': 4, 'warm_step': 2000, 'env_info': {'k2-version': '1.23.4', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '7efe024b23078ffa0bcb5598afff14f356edae7c', 'k2-git-date': 'Mon Jan 30 20:22:57 2023', 'lhotse-version': '1.12.0.dev+git.891bad1.clean', 'torch-version': '1.10.0+cu102', 'torch-cuda-available': True, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'from_dan_scaled_adam_exp1119', 'icefall-git-sha1': '432b2fa3-dirty', 'icefall-git-date': 'Mon May 8 18:46:45 2023', 'icefall-path': '/ceph-zw/workspace/zipformer/icefall_dan_streaming', 'k2-path': '/ceph-zw/workspace/k2/k2/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-zw/workspace/share/lhotse/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-6-0423201309-7c68fd68fb-6cszs', 'IP address': '10.177.28.83'}, 'world_size': 2, 'master_port': 12348, 'tensorboard': True, 'num_epochs': 40, 'start_epoch': 31, 'start_batch': 0, 'exp_dir': PosixPath('pruned_transducer_stateless7/exp1119-smaller-md1500'), 'bpe_model': 'data/lang_bpe_500/bpe.model', 'base_lr': 0.04, 'lr_batches': 7500, 'lr_epochs': 3.5, 'lr_warmup_start': 0.5, 'ref_duration': 600, 'context_size': 2, 'prune_range': 5, 'lm_scale': 0.25, 'am_scale': 0.0, 'simple_loss_scale': 0.5, 'seed': 42, 'print_diagnostics': False, 'inf_check': False, 'save_every_n': 4000, 'keep_last_k': 30, 'average_period': 200, 'use_fp16': True, 'num_encoder_layers': '2,2,2,2,2,2', 'downsampling_factor': '1,2,4,8,4,2', 'feedforward_dim': '512,768,768,768,768,768', 'num_heads': '4,4,4,8,4,4', 'encoder_dim': '192,256,256,256,256,256', 'query_head_dim': '32', 'value_head_dim': '12', 'pos_head_dim': '4', 'pos_dim': 48, 'encoder_unmasked_dim': '192,192,192,192,192,192', 'cnn_module_kernel': '31,31,15,15,15,31', 'decoder_dim': 512, 'joiner_dim': 512, 'causal': False, 'chunk_size': '16,32,64,-1', 'left_context_frames': '64,128,256,-1', 'full_libri': True, 'manifest_dir': PosixPath('data/fbank'), 'max_duration': 1500, 'bucketing_sampler': True, 'num_buckets': 30, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': True, 'drop_last': True, 'return_cuts': True, 'num_workers': 2, 'enable_spec_aug': True, 'spec_aug_time_warp_factor': 80, 'enable_musan': True, 'input_strategy': 'PrecomputedFeatures', 'blank_id': 0, 'vocab_size': 500}
+2023-05-10 14:59:59,108 INFO [train.py:1112] (1/2) About to create model
+2023-05-10 14:59:59,568 INFO [train.py:1116] (1/2) Number of model parameters: 23285615
+2023-05-10 14:59:59,569 INFO [checkpoint.py:112] (1/2) Loading checkpoint from pruned_transducer_stateless7/exp1119-smaller-md1500/epoch-30.pt
+2023-05-10 15:00:06,341 INFO [train.py:1131] (1/2) Using DDP
+2023-05-10 15:00:06,754 INFO [train.py:1145] (1/2) Loading optimizer state dict
+2023-05-10 15:00:07,300 INFO [train.py:1153] (1/2) Loading scheduler state dict
+2023-05-10 15:00:07,300 INFO [asr_datamodule.py:409] (1/2) About to get train-clean-100 cuts
+2023-05-10 15:00:07,333 INFO [asr_datamodule.py:416] (1/2) About to get train-clean-360 cuts
+2023-05-10 15:00:07,335 INFO [asr_datamodule.py:423] (1/2) About to get train-other-500 cuts
+2023-05-10 15:00:07,336 INFO [asr_datamodule.py:225] (1/2) Enable MUSAN
+2023-05-10 15:00:07,337 INFO [asr_datamodule.py:226] (1/2) About to get Musan cuts
+2023-05-10 15:00:09,909 INFO [asr_datamodule.py:254] (1/2) Enable SpecAugment
+2023-05-10 15:00:09,910 INFO [asr_datamodule.py:255] (1/2) Time warp factor: 80
+2023-05-10 15:00:09,910 INFO [asr_datamodule.py:267] (1/2) Num frame mask: 10
+2023-05-10 15:00:09,910 INFO [asr_datamodule.py:280] (1/2) About to create train dataset
+2023-05-10 15:00:09,910 INFO [asr_datamodule.py:309] (1/2) Using DynamicBucketingSampler.
+2023-05-10 15:00:15,298 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp1.1 from training. Duration: 22.2954375
+2023-05-10 15:00:17,038 INFO [asr_datamodule.py:324] (1/2) About to create train dataloader
+2023-05-10 15:00:17,040 INFO [asr_datamodule.py:430] (1/2) About to get dev-clean cuts
+2023-05-10 15:00:17,041 INFO [asr_datamodule.py:437] (1/2) About to get dev-other cuts
+2023-05-10 15:00:17,042 INFO [asr_datamodule.py:355] (1/2) About to create dev dataset
+2023-05-10 15:00:17,482 INFO [asr_datamodule.py:374] (1/2) About to create dev dataloader
+2023-05-10 15:00:17,483 INFO [train.py:1329] (1/2) Sanity check -- see if any of the batches in epoch 1 would cause OOM.
+2023-05-10 15:00:23,192 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp1.1 from training. Duration: 22.2954375
+2023-05-10 15:00:30,789 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp1.1 from training. Duration: 22.2954375
+2023-05-10 15:00:36,207 WARNING [train.py:1182] (1/2) Exclude cut with ID 298-126791-0067-24026-0_sp0.9 from training. Duration: 21.438875
+2023-05-10 15:00:36,498 WARNING [train.py:1182] (1/2) Exclude cut with ID 5652-39938-0025-23684-0_sp0.9 from training. Duration: 22.2055625
+2023-05-10 15:00:46,069 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0 from training. Duration: 24.525
+2023-05-10 15:00:48,408 WARNING [train.py:1182] (1/2) Exclude cut with ID 3699-47246-0007-3408-0_sp0.9 from training. Duration: 20.26675
+2023-05-10 15:00:49,061 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp0.9 from training. Duration: 27.25
+2023-05-10 15:00:52,802 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64292-0017-15984-0 from training. Duration: 21.68
+2023-05-10 15:00:53,399 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0007-59342-0 from training. Duration: 21.6300625
+2023-05-10 15:00:55,258 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0007-59342-0_sp0.9 from training. Duration: 24.033375
+2023-05-10 15:00:58,179 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0 from training. Duration: 22.905
+2023-05-10 15:00:58,236 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0_sp1.1 from training. Duration: 23.4318125
+2023-05-10 15:01:04,737 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0_sp1.1 from training. Duration: 20.82275
+2023-05-10 15:01:04,810 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0_sp0.9 from training. Duration: 25.45
+2023-05-10 15:01:08,391 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0 from training. Duration: 25.775
+2023-05-10 15:01:09,478 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0071-62375-0_sp0.9 from training. Duration: 22.25
+2023-05-10 15:01:10,884 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0 from training. Duration: 26.205
+2023-05-10 15:01:11,763 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0_sp0.9 from training. Duration: 30.1555625
+2023-05-10 15:01:12,041 WARNING [train.py:1182] (1/2) Exclude cut with ID 1265-135635-0050-6781-0_sp0.9 from training. Duration: 21.8333125
+2023-05-10 15:01:12,490 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0_sp1.1 from training. Duration: 20.6545625
+2023-05-10 15:01:15,248 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0045-39920-0_sp0.9 from training. Duration: 20.52225
+2023-05-10 15:01:16,280 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0_sp0.9 from training. Duration: 29.1166875
+2023-05-10 15:01:19,815 WARNING [train.py:1182] (1/2) Exclude cut with ID 543-133211-0007-59831-0_sp0.9 from training. Duration: 21.388875
+2023-05-10 15:01:21,570 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0 from training. Duration: 22.72
+2023-05-10 15:01:21,637 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0031-94921-0_sp0.9 from training. Duration: 22.7444375
+2023-05-10 15:01:23,877 WARNING [train.py:1182] (1/2) Exclude cut with ID 4133-6541-0027-40495-0_sp1.1 from training. Duration: 0.9681875
+2023-05-10 15:01:24,063 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62851-0022-91297-0_sp0.9 from training. Duration: 22.3166875
+2023-05-10 15:01:24,390 WARNING [train.py:1182] (1/2) Exclude cut with ID 543-133212-0015-59917-0_sp0.9 from training. Duration: 21.8166875
+2023-05-10 15:01:30,184 WARNING [train.py:1182] (1/2) Exclude cut with ID 4957-30119-0041-23990-0_sp0.9 from training. Duration: 20.22775
+2023-05-10 15:01:33,071 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0_sp1.1 from training. Duration: 24.67275
+2023-05-10 15:01:34,516 WARNING [train.py:1182] (1/2) Exclude cut with ID 3082-165428-0081-50734-0_sp0.9 from training. Duration: 21.8055625
+2023-05-10 15:01:36,374 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0054-76830-0_sp0.9 from training. Duration: 22.6666875
+2023-05-10 15:01:40,085 WARNING [train.py:1182] (1/2) Exclude cut with ID 2411-132532-0017-82279-0_sp1.1 from training. Duration: 0.9681875
+2023-05-10 15:01:41,406 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0 from training. Duration: 22.485
+2023-05-10 15:01:43,304 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0_sp1.1 from training. Duration: 23.82275
+2023-05-10 15:01:44,015 WARNING [train.py:1182] (1/2) Exclude cut with ID 4860-13185-0032-76709-0 from training. Duration: 20.77
+2023-05-10 15:01:45,010 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64292-0017-15984-0_sp0.9 from training. Duration: 24.088875
+2023-05-10 15:01:46,465 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0_sp1.1 from training. Duration: 20.4409375
+2023-05-10 15:01:50,289 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0062-62366-0_sp0.9 from training. Duration: 22.511125
+2023-05-10 15:01:50,320 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0031-39906-0 from training. Duration: 20.675
+2023-05-10 15:01:55,937 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0_sp0.9 from training. Duration: 24.9833125
+2023-05-10 15:01:58,177 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0 from training. Duration: 27.14
+2023-05-10 15:01:58,979 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0 from training. Duration: 22.44
+2023-05-10 15:02:03,069 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0060-62364-0_sp0.9 from training. Duration: 21.361125
+2023-05-10 15:02:03,439 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0_sp1.1 from training. Duration: 27.0318125
+2023-05-10 15:02:04,555 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0_sp0.9 from training. Duration: 28.638875
+2023-05-10 15:02:05,320 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0054-76830-0 from training. Duration: 20.4
+2023-05-10 15:02:06,916 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0071-62375-0 from training. Duration: 20.025
+2023-05-10 15:02:06,928 WARNING [train.py:1182] (1/2) Exclude cut with ID 2364-131735-0112-64612-0_sp0.9 from training. Duration: 20.488875
+2023-05-10 15:02:07,220 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0 from training. Duration: 29.735
+2023-05-10 15:02:11,968 WARNING [train.py:1182] (1/2) Exclude cut with ID 7276-92427-0014-12983-0_sp0.9 from training. Duration: 21.3055625
+2023-05-10 15:02:12,038 WARNING [train.py:1182] (1/2) Exclude cut with ID 1025-75365-0008-79168-0_sp0.9 from training. Duration: 22.0666875
+2023-05-10 15:02:17,935 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0062-62366-0 from training. Duration: 20.26
+2023-05-10 15:02:18,703 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0030-9324-0_sp0.9 from training. Duration: 21.3444375
+2023-05-10 15:02:21,702 WARNING [train.py:1182] (1/2) Exclude cut with ID 497-129325-0061-62254-0_sp1.1 from training. Duration: 0.97725
+2023-05-10 15:02:24,397 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0031-39906-0_sp0.9 from training. Duration: 22.97225
+2023-05-10 15:02:25,949 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0047-39922-0_sp0.9 from training. Duration: 21.97775
+2023-05-10 15:02:26,623 WARNING [train.py:1182] (1/2) Exclude cut with ID 1112-1043-0006-89194-0_sp0.9 from training. Duration: 21.8333125
+2023-05-10 15:02:27,229 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0031-94921-0 from training. Duration: 20.47
+2023-05-10 15:02:31,220 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0037-39912-0_sp0.9 from training. Duration: 20.67225
+2023-05-10 15:02:32,158 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0_sp0.9 from training. Duration: 25.2444375
+2023-05-10 15:02:33,347 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0021-76797-0_sp0.9 from training. Duration: 21.1445
+2023-05-10 15:02:37,750 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0_sp0.9 from training. Duration: 33.038875
+2023-05-10 15:02:39,645 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64291-0000-16059-0_sp0.9 from training. Duration: 20.0944375
+2023-05-10 15:02:40,384 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0_sp1.1 from training. Duration: 20.4
+2023-05-10 15:02:40,777 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62851-0022-91297-0 from training. Duration: 20.085
+2023-05-10 15:02:41,301 WARNING [train.py:1182] (1/2) Exclude cut with ID 4860-13185-0032-76709-0_sp0.9 from training. Duration: 23.07775
+2023-05-10 15:02:44,179 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0_sp0.9 from training. Duration: 24.9333125
+2023-05-10 15:02:46,317 WARNING [train.py:1182] (1/2) Exclude cut with ID 5118-111612-0016-124680-0_sp0.9 from training. Duration: 20.388875
+2023-05-10 15:02:46,641 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0_sp1.1 from training. Duration: 20.3590625
+2023-05-10 15:02:50,673 WARNING [train.py:1182] (1/2) Exclude cut with ID 3557-8342-0013-54691-0_sp1.1 from training. Duration: 0.836375
+2023-05-10 15:02:52,741 WARNING [train.py:1182] (1/2) Exclude cut with ID 8565-290391-0049-67394-0_sp0.9 from training. Duration: 21.3166875
+2023-05-10 15:02:54,941 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0029-104863-0_sp0.9 from training. Duration: 22.1055625
+2023-05-10 15:02:55,443 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0_sp1.1 from training. Duration: 21.77725
+2023-05-10 15:02:56,398 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0_sp0.9 from training. Duration: 27.8166875
+2023-05-10 15:02:57,619 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0_sp1.1 from training. Duration: 22.5090625
+2023-05-10 15:02:57,930 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0 from training. Duration: 25.035
+2023-05-10 15:02:58,879 WARNING [train.py:1182] (1/2) Exclude cut with ID 774-127930-0014-10412-0_sp1.1 from training. Duration: 0.95
+2023-05-10 15:02:59,769 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0_sp0.9 from training. Duration: 0.92225
+2023-05-10 15:03:01,677 WARNING [train.py:1182] (1/2) Exclude cut with ID 4511-76322-0006-80011-0 from training. Duration: 21.97
+2023-05-10 15:03:02,575 WARNING [train.py:1182] (1/2) Exclude cut with ID 7492-105653-0055-62765-0_sp0.9 from training. Duration: 21.97225
+2023-05-10 15:03:02,609 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0_sp0.9 from training. Duration: 25.3333125
+2023-05-10 15:03:03,109 WARNING [train.py:1182] (1/2) Exclude cut with ID 5172-29468-0015-19128-0_sp0.9 from training. Duration: 21.5055625
+2023-05-10 15:03:03,565 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0_sp1.1 from training. Duration: 20.72725
+2023-05-10 15:03:05,162 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0_sp0.9 from training. Duration: 26.32775
+2023-05-10 15:03:07,346 WARNING [train.py:1182] (1/2) Exclude cut with ID 3867-173237-0077-144769-0 from training. Duration: 20.025
+2023-05-10 15:03:07,598 WARNING [train.py:1182] (1/2) Exclude cut with ID 6709-74022-0004-86860-0_sp1.1 from training. Duration: 0.9409375
+2023-05-10 15:03:07,608 WARNING [train.py:1182] (1/2) Exclude cut with ID 4757-1811-0023-62229-0_sp0.9 from training. Duration: 21.37775
+2023-05-10 15:03:08,673 WARNING [train.py:1182] (1/2) Exclude cut with ID 1250-135782-0004-25974-0_sp0.9 from training. Duration: 21.17225
+2023-05-10 15:03:08,684 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0_sp0.9 from training. Duration: 27.511125
+2023-05-10 15:03:10,270 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0 from training. Duration: 22.8
+2023-05-10 15:03:10,490 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0 from training. Duration: 22.585
+2023-05-10 15:03:12,544 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0001-146967-0_sp0.9 from training. Duration: 22.0166875
+2023-05-10 15:03:13,315 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0_sp1.1 from training. Duration: 24.395375
+2023-05-10 15:03:13,689 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0_sp0.9 from training. Duration: 27.47775
+2023-05-10 15:03:13,888 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0_sp0.9 from training. Duration: 24.8833125
+2023-05-10 15:03:14,025 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0 from training. Duration: 23.39
+2023-05-10 15:03:14,327 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0_sp0.9 from training. Duration: 28.72225
+2023-05-10 15:03:14,795 WARNING [train.py:1182] (1/2) Exclude cut with ID 585-294811-0110-133686-0_sp0.9 from training. Duration: 20.8944375
+2023-05-10 15:03:15,493 WARNING [train.py:1182] (1/2) Exclude cut with ID 5796-66357-0007-116447-0_sp0.9 from training. Duration: 23.8444375
+2023-05-10 15:03:16,754 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0 from training. Duration: 25.85
+2023-05-10 15:03:16,763 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0023-13010-0 from training. Duration: 21.39
+2023-05-10 15:03:17,304 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0 from training. Duration: 27.92
+2023-05-10 15:03:19,241 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0039-130165-0_sp0.9 from training. Duration: 20.661125
+2023-05-10 15:03:20,384 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0043-15874-0_sp0.9 from training. Duration: 20.07225
+2023-05-10 15:03:20,767 WARNING [train.py:1182] (1/2) Exclude cut with ID 1085-156170-0017-128270-0 from training. Duration: 21.01
+2023-05-10 15:03:24,736 WARNING [train.py:1182] (1/2) Exclude cut with ID 2195-150901-0045-59933-0 from training. Duration: 20.65
+2023-05-10 15:03:25,089 WARNING [train.py:1182] (1/2) Exclude cut with ID 5796-66357-0007-116447-0 from training. Duration: 21.46
+2023-05-10 15:03:27,307 WARNING [train.py:1182] (1/2) Exclude cut with ID 3557-8342-0013-54691-0 from training. Duration: 0.92
+2023-05-10 15:03:27,631 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0023-13010-0_sp0.9 from training. Duration: 23.7666875
+2023-05-10 15:03:29,446 WARNING [train.py:1182] (1/2) Exclude cut with ID 8544-281189-0060-101339-0_sp0.9 from training. Duration: 20.861125
+2023-05-10 15:03:29,904 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-65654-0031-41259-0_sp0.9 from training. Duration: 22.711125
+2023-05-10 15:03:32,551 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0_sp1.1 from training. Duration: 22.986375
+2023-05-10 15:03:33,360 WARNING [train.py:1182] (1/2) Exclude cut with ID 8040-260924-0003-80960-0_sp0.9 from training. Duration: 22.07225
+2023-05-10 15:03:33,589 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0045-26330-0_sp0.9 from training. Duration: 20.3055625
+2023-05-10 15:03:33,711 WARNING [train.py:1182] (1/2) Exclude cut with ID 6356-271890-0060-94317-0_sp0.9 from training. Duration: 20.72225
+2023-05-10 15:03:34,631 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0_sp1.1 from training. Duration: 22.4818125
+2023-05-10 15:03:35,888 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0_sp0.9 from training. Duration: 25.0944375
+2023-05-10 15:03:36,695 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0047-104881-0 from training. Duration: 21.515
+2023-05-10 15:03:37,025 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0_sp0.9 from training. Duration: 27.02225
+2023-05-10 15:03:37,254 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0010-62480-0_sp0.9 from training. Duration: 22.22225
+2023-05-10 15:03:37,608 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0085-44554-0_sp0.9 from training. Duration: 20.85
+2023-05-10 15:03:39,255 WARNING [train.py:1182] (1/2) Exclude cut with ID 4295-39940-0007-92567-0 from training. Duration: 21.54
+2023-05-10 15:03:39,455 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0_sp1.1 from training. Duration: 20.5318125
+2023-05-10 15:03:39,920 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0012-134311-0_sp0.9 from training. Duration: 21.9333125
+2023-05-10 15:03:42,868 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0025-130151-0_sp0.9 from training. Duration: 21.7944375
+2023-05-10 15:03:43,531 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0002-12989-0_sp0.9 from training. Duration: 22.4666875
+2023-05-10 15:03:43,879 WARNING [train.py:1182] (1/2) Exclude cut with ID 6121-9014-0076-24124-0 from training. Duration: 21.635
+2023-05-10 15:03:44,133 WARNING [train.py:1182] (1/2) Exclude cut with ID 6121-9014-0076-24124-0_sp0.9 from training. Duration: 24.038875
+2023-05-10 15:03:46,163 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0_sp1.1 from training. Duration: 21.786375
+2023-05-10 15:03:46,727 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0002-12989-0 from training. Duration: 20.22
+2023-05-10 15:03:53,156 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0 from training. Duration: 25.285
+2023-05-10 15:03:56,840 WARNING [train.py:1182] (1/2) Exclude cut with ID 811-130148-0001-63453-0_sp0.9 from training. Duration: 20.861125
+2023-05-10 15:03:57,897 WARNING [train.py:1182] (1/2) Exclude cut with ID 6010-56788-0055-90261-0 from training. Duration: 20.88
+2023-05-10 15:03:59,436 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0045-15876-0_sp0.9 from training. Duration: 23.4166875
+2023-05-10 15:04:03,685 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0006-134305-0 from training. Duration: 21.24
+2023-05-10 15:04:03,703 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0047-104881-0_sp0.9 from training. Duration: 23.9055625
+2023-05-10 15:04:05,307 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0_sp0.9 from training. Duration: 25.988875
+2023-05-10 15:04:06,357 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0001-134300-0_sp0.9 from training. Duration: 20.67225
+2023-05-10 15:04:08,650 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0038-41224-0 from training. Duration: 20.34
+2023-05-10 15:04:12,723 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0_sp0.9 from training. Duration: 25.061125
+2023-05-10 15:04:13,296 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0 from training. Duration: 0.83
+2023-05-10 15:04:14,611 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0 from training. Duration: 24.73
+2023-05-10 15:04:15,211 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0 from training. Duration: 23.965
+2023-05-10 15:04:15,628 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0030-146996-0_sp0.9 from training. Duration: 22.088875
+2023-05-10 15:04:16,436 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0006-134305-0_sp0.9 from training. Duration: 23.6
+2023-05-10 15:04:21,931 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0 from training. Duration: 23.795
+2023-05-10 15:04:22,729 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0_sp1.1 from training. Duration: 21.5409375
+2023-05-10 15:04:22,864 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0_sp0.9 from training. Duration: 24.97775
+2023-05-10 15:04:24,042 WARNING [train.py:1182] (1/2) Exclude cut with ID 1085-156170-0017-128270-0_sp0.9 from training. Duration: 23.3444375
+2023-05-10 15:04:25,371 WARNING [train.py:1182] (1/2) Exclude cut with ID 6010-56788-0055-90261-0_sp0.9 from training. Duration: 23.2
+2023-05-10 15:04:25,631 WARNING [train.py:1182] (1/2) Exclude cut with ID 5653-46179-0060-117930-0_sp0.9 from training. Duration: 21.17225
+2023-05-10 15:04:26,612 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0_sp0.9 from training. Duration: 24.6555625
+2023-05-10 15:04:29,893 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-65654-0031-41259-0 from training. Duration: 20.44
+2023-05-10 15:04:30,692 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0018-132285-0_sp0.9 from training. Duration: 23.45
+2023-05-10 15:04:32,214 WARNING [train.py:1182] (1/2) Exclude cut with ID 6945-60535-0076-12784-0_sp0.9 from training. Duration: 20.52225
+2023-05-10 15:04:32,611 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0 from training. Duration: 22.19
+2023-05-10 15:04:33,152 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0_sp1.1 from training. Duration: 25.3818125
+2023-05-10 15:04:33,994 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0_sp0.9 from training. Duration: 28.0944375
+2023-05-10 15:04:34,318 WARNING [train.py:1182] (1/2) Exclude cut with ID 2195-150901-0045-59933-0_sp0.9 from training. Duration: 22.9444375
+2023-05-10 15:04:34,722 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0_sp1.1 from training. Duration: 21.6318125
+2023-05-10 15:04:36,175 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0 from training. Duration: 23.695
+2023-05-10 15:04:37,359 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0 from training. Duration: 23.955
+2023-05-10 15:04:38,973 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0_sp0.9 from training. Duration: 26.438875
+2023-05-10 15:04:41,156 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0021-26306-0_sp0.9 from training. Duration: 21.2444375
+2023-05-10 15:04:41,201 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0_sp0.9 from training. Duration: 31.02225
+2023-05-10 15:04:42,290 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0 from training. Duration: 22.395
+2023-05-10 15:04:43,100 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0045-15876-0 from training. Duration: 21.075
+2023-05-10 15:04:43,341 WARNING [train.py:1182] (1/2) Exclude cut with ID 6482-98857-0025-147532-0_sp0.9 from training. Duration: 20.0055625
+2023-05-10 15:04:43,360 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0037-132304-0_sp0.9 from training. Duration: 22.05
+2023-05-10 15:04:43,371 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0 from training. Duration: 26.8349375
+2023-05-10 15:04:43,527 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0_sp1.1 from training. Duration: 22.1090625
+2023-05-10 15:04:43,854 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0_sp0.9 from training. Duration: 26.6166875
+2023-05-10 15:04:44,707 WARNING [train.py:1182] (1/2) Exclude cut with ID 2046-178027-0000-53705-0_sp0.9 from training. Duration: 20.3055625
+2023-05-10 15:04:45,853 WARNING [train.py:1182] (1/2) Exclude cut with ID 7205-50138-0008-5373-0_sp0.9 from training. Duration: 20.7
+2023-05-10 15:04:48,695 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0 from training. Duration: 22.48
+2023-05-10 15:04:49,486 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0_sp0.9 from training. Duration: 29.816625
+2023-05-10 15:04:50,421 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0_sp1.1 from training. Duration: 22.7590625
+2023-05-10 15:04:50,676 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0 from training. Duration: 22.555
+2023-05-10 15:04:51,636 WARNING [train.py:1182] (1/2) Exclude cut with ID 1250-135782-0005-25975-0_sp0.9 from training. Duration: 21.688875
+2023-05-10 15:04:53,889 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0038-41224-0_sp0.9 from training. Duration: 22.6
+2023-05-10 15:04:55,717 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0 from training. Duration: 24.32
+2023-05-10 15:04:58,556 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-276745-0093-13116-0_sp0.9 from training. Duration: 21.061125
+2023-05-10 15:04:59,848 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0024-15855-0_sp0.9 from training. Duration: 20.32225
+2023-05-10 15:05:00,467 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0_sp1.1 from training. Duration: 0.7545625
+2023-05-10 15:05:01,243 WARNING [train.py:1182] (1/2) Exclude cut with ID 4295-39940-0007-92567-0_sp0.9 from training. Duration: 23.9333125
+2023-05-10 15:05:02,821 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0_sp1.1 from training. Duration: 20.17275
+2023-05-10 15:05:03,119 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0_sp1.1 from training. Duration: 20.436375
+2023-05-10 15:05:07,678 WARNING [train.py:1182] (1/2) Exclude cut with ID 4234-40345-0022-142709-0_sp0.9 from training. Duration: 23.1055625
+2023-05-10 15:05:07,777 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0_sp1.1 from training. Duration: 23.5
+2023-05-10 15:05:08,297 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0_sp0.9 from training. Duration: 26.62775
+2023-05-10 15:05:08,917 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0018-132285-0 from training. Duration: 21.105
+2023-05-10 15:05:09,159 WARNING [train.py:1182] (1/2) Exclude cut with ID 4511-76322-0006-80011-0_sp0.9 from training. Duration: 24.411125
+2023-05-10 15:05:10,481 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0_sp1.1 from training. Duration: 21.263625
+2023-05-10 15:05:12,602 WARNING [train.py:1182] (1/2) Exclude cut with ID 4234-40345-0022-142709-0 from training. Duration: 20.795
+2023-05-10 15:05:13,096 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0 from training. Duration: 24.76
+2023-05-10 15:05:13,113 WARNING [train.py:1182] (1/2) Exclude cut with ID 3867-173237-0077-144769-0_sp0.9 from training. Duration: 22.25
+2023-05-10 15:05:14,371 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0_sp1.1 from training. Duration: 20.5045625
+2023-05-10 15:05:20,205 INFO [train.py:1357] (1/2) Maximum memory allocated so far is 17018MB
+2023-05-10 15:05:23,134 INFO [train.py:1357] (1/2) Maximum memory allocated so far is 17965MB
+2023-05-10 15:05:26,276 INFO [train.py:1357] (1/2) Maximum memory allocated so far is 17965MB
+2023-05-10 15:05:29,341 INFO [train.py:1357] (1/2) Maximum memory allocated so far is 17965MB
+2023-05-10 15:05:31,750 INFO [scaling.py:969] (1/2) Whitening: name=None, num_groups=1, num_channels=256, metric=13.50 vs. limit=7.5
+2023-05-10 15:05:32,417 INFO [train.py:1357] (1/2) Maximum memory allocated so far is 17965MB
+2023-05-10 15:05:35,465 INFO [train.py:1357] (1/2) Maximum memory allocated so far is 17965MB
+2023-05-10 15:05:35,478 INFO [train.py:1238] (1/2) Loading grad scaler state dict
+2023-05-10 15:05:50,558 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp1.1 from training. Duration: 22.2954375
+2023-05-10 15:05:56,791 INFO [train.py:1021] (1/2) Epoch 31, batch 0, loss[loss=0.1571, simple_loss=0.2437, pruned_loss=0.0352, over 36752.00 frames. ], tot_loss[loss=0.1571, simple_loss=0.2437, pruned_loss=0.0352, over 36752.00 frames. ], batch size: 89, lr: 3.57e-03, grad_scale: 32.0
+2023-05-10 15:05:56,792 INFO [train.py:1048] (1/2) Computing validation loss
+2023-05-10 15:06:07,475 INFO [train.py:1057] (1/2) Epoch 31, validation: loss=0.1535, simple_loss=0.2545, pruned_loss=0.02622, over 944034.00 frames.
+2023-05-10 15:06:07,475 INFO [train.py:1058] (1/2) Maximum memory allocated so far is 17965MB
+2023-05-10 15:06:46,057 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=6.35 vs. limit=15.0
+2023-05-10 15:07:02,573 WARNING [train.py:1182] (1/2) Exclude cut with ID 298-126791-0067-24026-0_sp0.9 from training. Duration: 21.438875
+2023-05-10 15:07:08,599 WARNING [train.py:1182] (1/2) Exclude cut with ID 5652-39938-0025-23684-0_sp0.9 from training. Duration: 22.2055625
+2023-05-10 15:07:16,352 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([4.7105, 4.0234, 4.2983, 4.2732], device='cuda:1')
+2023-05-10 15:07:25,367 INFO [train.py:1021] (1/2) Epoch 31, batch 50, loss[loss=0.1503, simple_loss=0.2411, pruned_loss=0.02969, over 37168.00 frames. ], tot_loss[loss=0.1656, simple_loss=0.2574, pruned_loss=0.03693, over 1605079.78 frames. ], batch size: 93, lr: 3.57e-03, grad_scale: 32.0
+2023-05-10 15:07:25,795 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff3_skip_rate, batch_count=545660.0, ans=0.0
+2023-05-10 15:07:45,988 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.min_abs, batch_count=545710.0, ans=0.5
+2023-05-10 15:07:51,050 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=545710.0, ans=0.0
+2023-05-10 15:08:13,433 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=545810.0, ans=0.0
+2023-05-10 15:08:16,560 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3785, 4.7317, 2.3124, 2.5457], device='cuda:1')
+2023-05-10 15:08:35,743 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.352e+02 3.107e+02 3.758e+02 4.562e+02 7.673e+02, threshold=7.517e+02, percent-clipped=0.0
+2023-05-10 15:08:39,181 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer2.prob, batch_count=545860.0, ans=0.125
+2023-05-10 15:08:41,820 INFO [train.py:1021] (1/2) Epoch 31, batch 100, loss[loss=0.1562, simple_loss=0.241, pruned_loss=0.03573, over 36771.00 frames. ], tot_loss[loss=0.165, simple_loss=0.2567, pruned_loss=0.0367, over 2840425.01 frames. ], batch size: 89, lr: 3.56e-03, grad_scale: 32.0
+2023-05-10 15:09:20,628 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer1.prob, batch_count=546010.0, ans=0.125
+2023-05-10 15:09:53,333 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([4.9361, 4.1131, 4.4959, 4.5152], device='cuda:1')
+2023-05-10 15:09:54,954 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.attention_skip_rate, batch_count=546110.0, ans=0.0
+2023-05-10 15:09:55,297 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=4.57 vs. limit=15.0
+2023-05-10 15:09:56,536 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.2910, 4.0907, 3.8199, 4.1024, 3.4512, 3.1063, 3.5098, 3.0488],
+ device='cuda:1')
+2023-05-10 15:09:59,066 INFO [train.py:1021] (1/2) Epoch 31, batch 150, loss[loss=0.1604, simple_loss=0.2488, pruned_loss=0.03597, over 37186.00 frames. ], tot_loss[loss=0.1648, simple_loss=0.2566, pruned_loss=0.03651, over 3811320.74 frames. ], batch size: 93, lr: 3.56e-03, grad_scale: 32.0
+2023-05-10 15:10:05,399 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.skip_rate, batch_count=546160.0, ans=0.04949747468305833
+2023-05-10 15:10:17,140 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0 from training. Duration: 24.525
+2023-05-10 15:10:41,198 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.skip_rate, batch_count=546260.0, ans=0.04949747468305833
+2023-05-10 15:10:45,827 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.skip_rate, batch_count=546310.0, ans=0.07
+2023-05-10 15:10:54,266 WARNING [train.py:1182] (1/2) Exclude cut with ID 3699-47246-0007-3408-0_sp0.9 from training. Duration: 20.26675
+2023-05-10 15:10:57,975 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=4.45 vs. limit=15.0
+2023-05-10 15:11:06,586 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer2.prob, batch_count=546360.0, ans=0.125
+2023-05-10 15:11:07,984 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp0.9 from training. Duration: 27.25
+2023-05-10 15:11:09,310 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.288e+02 3.040e+02 3.340e+02 4.209e+02 6.958e+02, threshold=6.680e+02, percent-clipped=1.0
+2023-05-10 15:11:12,710 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.const_attention_rate, batch_count=546360.0, ans=0.025
+2023-05-10 15:11:15,506 INFO [train.py:1021] (1/2) Epoch 31, batch 200, loss[loss=0.1509, simple_loss=0.2379, pruned_loss=0.03196, over 36925.00 frames. ], tot_loss[loss=0.1638, simple_loss=0.2558, pruned_loss=0.03593, over 4578741.65 frames. ], batch size: 91, lr: 3.56e-03, grad_scale: 32.0
+2023-05-10 15:11:25,767 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.skip_rate, batch_count=546410.0, ans=0.04949747468305833
+2023-05-10 15:11:49,173 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.5742, 3.8031, 4.2178, 3.7402], device='cuda:1')
+2023-05-10 15:12:05,824 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.balancer2.prob, batch_count=546560.0, ans=0.125
+2023-05-10 15:12:29,278 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64292-0017-15984-0 from training. Duration: 21.68
+2023-05-10 15:12:33,770 INFO [train.py:1021] (1/2) Epoch 31, batch 250, loss[loss=0.1732, simple_loss=0.2712, pruned_loss=0.03763, over 36383.00 frames. ], tot_loss[loss=0.1634, simple_loss=0.2553, pruned_loss=0.03578, over 5160790.94 frames. ], batch size: 126, lr: 3.56e-03, grad_scale: 32.0
+2023-05-10 15:12:39,751 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0007-59342-0 from training. Duration: 21.6300625
+2023-05-10 15:12:41,420 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=546660.0, ans=0.0
+2023-05-10 15:13:05,700 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0007-59342-0_sp0.9 from training. Duration: 24.033375
+2023-05-10 15:13:15,112 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=546760.0, ans=0.125
+2023-05-10 15:13:29,969 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=5.63 vs. limit=15.0
+2023-05-10 15:13:30,947 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff3_skip_rate, batch_count=546810.0, ans=0.0
+2023-05-10 15:13:32,475 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward3.hidden_balancer.prob, batch_count=546810.0, ans=0.125
+2023-05-10 15:13:43,965 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.396e+02 2.848e+02 3.242e+02 3.786e+02 5.651e+02, threshold=6.485e+02, percent-clipped=0.0
+2023-05-10 15:13:49,854 INFO [train.py:1021] (1/2) Epoch 31, batch 300, loss[loss=0.1529, simple_loss=0.2481, pruned_loss=0.02884, over 36933.00 frames. ], tot_loss[loss=0.1633, simple_loss=0.2554, pruned_loss=0.03558, over 5610479.90 frames. ], batch size: 100, lr: 3.56e-03, grad_scale: 32.0
+2023-05-10 15:14:01,313 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.skip_rate, batch_count=546910.0, ans=0.04949747468305833
+2023-05-10 15:14:05,771 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.0.layers.1.self_attn_weights, loss-sum=0.000e+00
+2023-05-10 15:14:07,128 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0 from training. Duration: 22.905
+2023-05-10 15:14:08,675 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0_sp1.1 from training. Duration: 23.4318125
+2023-05-10 15:14:16,411 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=546960.0, ans=0.1
+2023-05-10 15:14:38,470 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=5.30 vs. limit=12.0
+2023-05-10 15:14:40,990 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.1.self_attn_weights, attn_weights_entropy = tensor([6.2204, 5.3491, 5.5509, 6.0881], device='cuda:1')
+2023-05-10 15:14:41,256 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=12.06 vs. limit=15.0
+2023-05-10 15:14:53,771 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=547110.0, ans=0.0
+2023-05-10 15:15:04,980 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=11.27 vs. limit=15.0
+2023-05-10 15:15:05,896 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder_embed.conv.8.prob, batch_count=547160.0, ans=0.125
+2023-05-10 15:15:07,231 INFO [train.py:1021] (1/2) Epoch 31, batch 350, loss[loss=0.168, simple_loss=0.2588, pruned_loss=0.03861, over 37033.00 frames. ], tot_loss[loss=0.1637, simple_loss=0.2557, pruned_loss=0.03584, over 5951899.00 frames. ], batch size: 99, lr: 3.56e-03, grad_scale: 32.0
+2023-05-10 15:15:12,089 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass_mid.scale_min, batch_count=547160.0, ans=0.2
+2023-05-10 15:15:22,105 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=3.77 vs. limit=15.0
+2023-05-10 15:15:42,826 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.7493, 3.4858, 3.3160, 4.1927, 2.4019, 3.6425, 4.2509, 3.5678],
+ device='cuda:1')
+2023-05-10 15:15:49,121 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass.skip_rate, batch_count=547260.0, ans=0.035
+2023-05-10 15:16:12,535 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0_sp1.1 from training. Duration: 20.82275
+2023-05-10 15:16:14,135 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0_sp0.9 from training. Duration: 25.45
+2023-05-10 15:16:18,345 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.673e+02 3.280e+02 4.154e+02 5.203e+02 7.776e+02, threshold=8.309e+02, percent-clipped=9.0
+2023-05-10 15:16:24,511 INFO [train.py:1021] (1/2) Epoch 31, batch 400, loss[loss=0.1703, simple_loss=0.2692, pruned_loss=0.0357, over 36391.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.2565, pruned_loss=0.03601, over 6210437.86 frames. ], batch size: 126, lr: 3.56e-03, grad_scale: 32.0
+2023-05-10 15:16:26,370 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([4.3557, 3.6962, 3.9260, 3.9809], device='cuda:1')
+2023-05-10 15:16:49,435 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass_mid.scale_min, batch_count=547460.0, ans=0.2
+2023-05-10 15:17:03,185 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=547510.0, ans=0.1
+2023-05-10 15:17:06,779 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff3_skip_rate, batch_count=547510.0, ans=0.0
+2023-05-10 15:17:08,195 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=547510.0, ans=0.0
+2023-05-10 15:17:15,541 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0 from training. Duration: 25.775
+2023-05-10 15:17:38,679 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0071-62375-0_sp0.9 from training. Duration: 22.25
+2023-05-10 15:17:41,639 INFO [train.py:1021] (1/2) Epoch 31, batch 450, loss[loss=0.146, simple_loss=0.2293, pruned_loss=0.03132, over 37061.00 frames. ], tot_loss[loss=0.1652, simple_loss=0.2579, pruned_loss=0.03627, over 6449096.99 frames. ], batch size: 88, lr: 3.56e-03, grad_scale: 32.0
+2023-05-10 15:17:44,966 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer_ff2.min_abs, batch_count=547660.0, ans=0.1
+2023-05-10 15:17:52,564 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=547660.0, ans=0.1
+2023-05-10 15:18:08,157 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0 from training. Duration: 26.205
+2023-05-10 15:18:26,251 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0_sp0.9 from training. Duration: 30.1555625
+2023-05-10 15:18:31,463 WARNING [train.py:1182] (1/2) Exclude cut with ID 1265-135635-0050-6781-0_sp0.9 from training. Duration: 21.8333125
+2023-05-10 15:18:40,291 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0_sp1.1 from training. Duration: 20.6545625
+2023-05-10 15:18:47,805 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=547860.0, ans=0.125
+2023-05-10 15:18:53,485 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.465e+02 3.105e+02 3.561e+02 4.354e+02 6.456e+02, threshold=7.122e+02, percent-clipped=0.0
+2023-05-10 15:18:58,025 INFO [train.py:1021] (1/2) Epoch 31, batch 500, loss[loss=0.1527, simple_loss=0.2371, pruned_loss=0.03409, over 37034.00 frames. ], tot_loss[loss=0.1654, simple_loss=0.2581, pruned_loss=0.03639, over 6628546.41 frames. ], batch size: 88, lr: 3.56e-03, grad_scale: 16.0
+2023-05-10 15:19:26,471 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0045-39920-0_sp0.9 from training. Duration: 20.52225
+2023-05-10 15:19:45,759 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0_sp0.9 from training. Duration: 29.1166875
+2023-05-10 15:19:51,853 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00
+2023-05-10 15:20:14,650 INFO [train.py:1021] (1/2) Epoch 31, batch 550, loss[loss=0.1499, simple_loss=0.236, pruned_loss=0.03193, over 36171.00 frames. ], tot_loss[loss=0.1659, simple_loss=0.2584, pruned_loss=0.03669, over 6735713.80 frames. ], batch size: 80, lr: 3.56e-03, grad_scale: 16.0
+2023-05-10 15:20:27,031 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn2.whiten, num_groups=1, num_channels=192, metric=10.07 vs. limit=22.5
+2023-05-10 15:20:33,921 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer2.prob, batch_count=548210.0, ans=0.125
+2023-05-10 15:20:39,753 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=548210.0, ans=0.0
+2023-05-10 15:20:41,321 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer1.prob, batch_count=548210.0, ans=0.125
+2023-05-10 15:20:42,839 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_skip_rate, batch_count=548210.0, ans=0.0
+2023-05-10 15:20:44,363 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.0123, 3.0120, 4.2329, 3.0406], device='cuda:1')
+2023-05-10 15:20:47,675 WARNING [train.py:1182] (1/2) Exclude cut with ID 543-133211-0007-59831-0_sp0.9 from training. Duration: 21.388875
+2023-05-10 15:21:07,463 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer1.prob, batch_count=548310.0, ans=0.125
+2023-05-10 15:21:08,947 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer2.prob, batch_count=548310.0, ans=0.125
+2023-05-10 15:21:25,899 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0 from training. Duration: 22.72
+2023-05-10 15:21:27,152 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.540e+02 3.228e+02 4.169e+02 5.214e+02 9.380e+02, threshold=8.337e+02, percent-clipped=5.0
+2023-05-10 15:21:27,322 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0031-94921-0_sp0.9 from training. Duration: 22.7444375
+2023-05-10 15:21:28,988 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.const_attention_rate, batch_count=548360.0, ans=0.025
+2023-05-10 15:21:31,832 INFO [train.py:1021] (1/2) Epoch 31, batch 600, loss[loss=0.1825, simple_loss=0.2743, pruned_loss=0.0453, over 37042.00 frames. ], tot_loss[loss=0.1664, simple_loss=0.259, pruned_loss=0.03692, over 6849847.15 frames. ], batch size: 116, lr: 3.56e-03, grad_scale: 16.0
+2023-05-10 15:22:04,165 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.attention_skip_rate, batch_count=548510.0, ans=0.0
+2023-05-10 15:22:16,240 WARNING [train.py:1182] (1/2) Exclude cut with ID 4133-6541-0027-40495-0_sp1.1 from training. Duration: 0.9681875
+2023-05-10 15:22:19,400 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62851-0022-91297-0_sp0.9 from training. Duration: 22.3166875
+2023-05-10 15:22:19,668 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=548560.0, ans=0.1
+2023-05-10 15:22:25,521 WARNING [train.py:1182] (1/2) Exclude cut with ID 543-133212-0015-59917-0_sp0.9 from training. Duration: 21.8166875
+2023-05-10 15:22:45,813 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff2_skip_rate, batch_count=548610.0, ans=0.0
+2023-05-10 15:22:48,259 INFO [train.py:1021] (1/2) Epoch 31, batch 650, loss[loss=0.1794, simple_loss=0.2776, pruned_loss=0.04058, over 36434.00 frames. ], tot_loss[loss=0.1668, simple_loss=0.2595, pruned_loss=0.03703, over 6936648.61 frames. ], batch size: 126, lr: 3.56e-03, grad_scale: 16.0
+2023-05-10 15:22:50,138 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.1.self_attn_weights, attn_weights_entropy = tensor([6.1723, 5.4103, 5.4835, 6.0391], device='cuda:1')
+2023-05-10 15:23:25,608 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=4.41 vs. limit=15.0
+2023-05-10 15:23:32,789 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=548810.0, ans=0.125
+2023-05-10 15:24:00,838 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.447e+02 2.901e+02 3.227e+02 3.862e+02 5.542e+02, threshold=6.454e+02, percent-clipped=0.0
+2023-05-10 15:24:05,301 INFO [train.py:1021] (1/2) Epoch 31, batch 700, loss[loss=0.1753, simple_loss=0.273, pruned_loss=0.03882, over 35925.00 frames. ], tot_loss[loss=0.1674, simple_loss=0.2604, pruned_loss=0.0372, over 6984459.91 frames. ], batch size: 133, lr: 3.55e-03, grad_scale: 16.0
+2023-05-10 15:24:07,152 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=548910.0, ans=0.1
+2023-05-10 15:24:14,520 WARNING [train.py:1182] (1/2) Exclude cut with ID 4957-30119-0041-23990-0_sp0.9 from training. Duration: 20.22775
+2023-05-10 15:24:18,539 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.conv_module1.whiten, num_groups=1, num_channels=192, metric=8.83 vs. limit=15.0
+2023-05-10 15:24:26,585 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.scale_min, batch_count=548960.0, ans=0.2
+2023-05-10 15:24:31,780 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=5.34 vs. limit=10.0
+2023-05-10 15:25:00,740 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0_sp1.1 from training. Duration: 24.67275
+2023-05-10 15:25:03,884 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer1.prob, batch_count=549060.0, ans=0.125
+2023-05-10 15:25:21,536 INFO [train.py:1021] (1/2) Epoch 31, batch 750, loss[loss=0.1733, simple_loss=0.2696, pruned_loss=0.03848, over 36359.00 frames. ], tot_loss[loss=0.1671, simple_loss=0.2599, pruned_loss=0.03717, over 7029209.02 frames. ], batch size: 126, lr: 3.55e-03, grad_scale: 16.0
+2023-05-10 15:25:21,945 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([4.9812, 4.2132, 4.5228, 4.5619], device='cuda:1')
+2023-05-10 15:25:29,003 WARNING [train.py:1182] (1/2) Exclude cut with ID 3082-165428-0081-50734-0_sp0.9 from training. Duration: 21.8055625
+2023-05-10 15:25:51,518 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=549260.0, ans=0.0
+2023-05-10 15:25:51,579 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer2.prob, batch_count=549260.0, ans=0.125
+2023-05-10 15:25:56,023 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_skip_rate, batch_count=549260.0, ans=0.0
+2023-05-10 15:26:09,175 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0054-76830-0_sp0.9 from training. Duration: 22.6666875
+2023-05-10 15:26:09,503 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer2.min_positive, batch_count=549310.0, ans=0.05
+2023-05-10 15:26:14,325 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=7.14 vs. limit=15.0
+2023-05-10 15:26:33,626 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.413e+02 3.048e+02 3.343e+02 3.928e+02 7.448e+02, threshold=6.687e+02, percent-clipped=2.0
+2023-05-10 15:26:34,022 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.balancer2.prob, batch_count=549360.0, ans=0.125
+2023-05-10 15:26:35,481 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=549360.0, ans=0.1
+2023-05-10 15:26:38,124 INFO [train.py:1021] (1/2) Epoch 31, batch 800, loss[loss=0.1764, simple_loss=0.2693, pruned_loss=0.04171, over 36945.00 frames. ], tot_loss[loss=0.167, simple_loss=0.2599, pruned_loss=0.03708, over 7075923.57 frames. ], batch size: 108, lr: 3.55e-03, grad_scale: 32.0
+2023-05-10 15:26:40,780 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff2_skip_rate, batch_count=549410.0, ans=0.0
+2023-05-10 15:26:48,126 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([4.9882, 4.1962, 4.5227, 4.5330], device='cuda:1')
+2023-05-10 15:26:49,623 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer2.min_positive, batch_count=549410.0, ans=0.05
+2023-05-10 15:27:01,687 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_skip_rate, batch_count=549460.0, ans=0.0
+2023-05-10 15:27:09,284 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.scale_min, batch_count=549510.0, ans=0.2
+2023-05-10 15:27:09,913 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn1.whiten, num_groups=1, num_channels=192, metric=10.40 vs. limit=22.5
+2023-05-10 15:27:16,302 WARNING [train.py:1182] (1/2) Exclude cut with ID 2411-132532-0017-82279-0_sp1.1 from training. Duration: 0.9681875
+2023-05-10 15:27:22,737 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.const_attention_rate, batch_count=549560.0, ans=0.025
+2023-05-10 15:27:30,660 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer2.prob, batch_count=549560.0, ans=0.125
+2023-05-10 15:27:32,162 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass_mid.scale_min, batch_count=549560.0, ans=0.2
+2023-05-10 15:27:32,585 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=256, metric=7.78 vs. limit=22.5
+2023-05-10 15:27:43,460 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer1.prob, batch_count=549610.0, ans=0.125
+2023-05-10 15:27:44,616 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0 from training. Duration: 22.485
+2023-05-10 15:27:54,967 INFO [train.py:1021] (1/2) Epoch 31, batch 850, loss[loss=0.162, simple_loss=0.2602, pruned_loss=0.03188, over 37187.00 frames. ], tot_loss[loss=0.1664, simple_loss=0.2593, pruned_loss=0.03675, over 7131842.83 frames. ], batch size: 102, lr: 3.55e-03, grad_scale: 16.0
+2023-05-10 15:28:08,799 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff2_skip_rate, batch_count=549710.0, ans=0.0
+2023-05-10 15:28:11,986 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.7227, 3.7649, 3.9951, 3.5915], device='cuda:1')
+2023-05-10 15:28:26,877 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0_sp1.1 from training. Duration: 23.82275
+2023-05-10 15:28:39,429 WARNING [train.py:1182] (1/2) Exclude cut with ID 4860-13185-0032-76709-0 from training. Duration: 20.77
+2023-05-10 15:28:42,669 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.const_attention_rate, batch_count=549810.0, ans=0.025
+2023-05-10 15:28:42,687 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=549810.0, ans=0.0
+2023-05-10 15:28:46,214 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=4.02 vs. limit=15.0
+2023-05-10 15:28:46,300 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=3.76 vs. limit=10.0
+2023-05-10 15:28:48,547 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64292-0017-15984-0_sp0.9 from training. Duration: 24.088875
+2023-05-10 15:29:07,776 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.440e+02 3.060e+02 3.523e+02 4.302e+02 6.165e+02, threshold=7.046e+02, percent-clipped=0.0
+2023-05-10 15:29:10,742 INFO [train.py:1021] (1/2) Epoch 31, batch 900, loss[loss=0.1779, simple_loss=0.271, pruned_loss=0.0424, over 36819.00 frames. ], tot_loss[loss=0.1661, simple_loss=0.2591, pruned_loss=0.03659, over 7141447.67 frames. ], batch size: 122, lr: 3.55e-03, grad_scale: 16.0
+2023-05-10 15:29:18,961 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0_sp1.1 from training. Duration: 20.4409375
+2023-05-10 15:29:31,845 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.0206, 2.9379, 4.5751, 3.4585], device='cuda:1')
+2023-05-10 15:29:31,881 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass_mid.scale_min, batch_count=549960.0, ans=0.2
+2023-05-10 15:29:45,428 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=550010.0, ans=0.0
+2023-05-10 15:29:57,410 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.prob, batch_count=550060.0, ans=0.125
+2023-05-10 15:30:28,651 INFO [train.py:1021] (1/2) Epoch 31, batch 950, loss[loss=0.1438, simple_loss=0.2282, pruned_loss=0.02968, over 36948.00 frames. ], tot_loss[loss=0.166, simple_loss=0.259, pruned_loss=0.03651, over 7170765.43 frames. ], batch size: 86, lr: 3.55e-03, grad_scale: 16.0
+2023-05-10 15:30:29,603 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=5.43 vs. limit=15.0
+2023-05-10 15:30:40,492 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0062-62366-0_sp0.9 from training. Duration: 22.511125
+2023-05-10 15:30:40,530 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0031-39906-0 from training. Duration: 20.675
+2023-05-10 15:31:07,136 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=8.00 vs.
limit=22.5 +2023-05-10 15:31:26,983 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff3_skip_rate, batch_count=550310.0, ans=0.0 +2023-05-10 15:31:30,054 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer1.prob, batch_count=550360.0, ans=0.125 +2023-05-10 15:31:30,162 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=550360.0, ans=0.1 +2023-05-10 15:31:40,809 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer1.prob, batch_count=550360.0, ans=0.125 +2023-05-10 15:31:41,808 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.302e+02 3.080e+02 3.899e+02 4.855e+02 7.501e+02, threshold=7.799e+02, percent-clipped=2.0 +2023-05-10 15:31:44,802 INFO [train.py:1021] (1/2) Epoch 31, batch 1000, loss[loss=0.1624, simple_loss=0.2547, pruned_loss=0.03502, over 36874.00 frames. ], tot_loss[loss=0.1655, simple_loss=0.2586, pruned_loss=0.03619, over 7189504.46 frames. ], batch size: 96, lr: 3.55e-03, grad_scale: 16.0 +2023-05-10 15:31:49,793 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff3_skip_rate, batch_count=550410.0, ans=0.0 +2023-05-10 15:31:49,887 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff3_skip_rate, batch_count=550410.0, ans=0.0 +2023-05-10 15:32:08,582 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer2.prob, batch_count=550460.0, ans=0.125 +2023-05-10 15:32:25,500 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0_sp0.9 from training. Duration: 24.9833125 +2023-05-10 15:32:25,717 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=550510.0, ans=0.1 +2023-05-10 15:32:55,586 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0 from training. Duration: 27.14 +2023-05-10 15:33:02,141 INFO [train.py:1021] (1/2) Epoch 31, batch 1050, loss[loss=0.1699, simple_loss=0.2663, pruned_loss=0.03677, over 36735.00 frames. ], tot_loss[loss=0.1657, simple_loss=0.2589, pruned_loss=0.03626, over 7202175.61 frames. ], batch size: 118, lr: 3.55e-03, grad_scale: 16.0 +2023-05-10 15:33:02,586 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.skip_rate, batch_count=550660.0, ans=0.04949747468305833 +2023-05-10 15:33:11,908 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0 from training. Duration: 22.44 +2023-05-10 15:33:22,698 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.prob, batch_count=550710.0, ans=0.125 +2023-05-10 15:33:23,129 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.39 vs. 
limit=6.0 +2023-05-10 15:33:31,509 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer1.prob, batch_count=550760.0, ans=0.125 +2023-05-10 15:34:13,147 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=550860.0, ans=0.125 +2023-05-10 15:34:13,182 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=550860.0, ans=0.125 +2023-05-10 15:34:15,750 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.555e+02 3.227e+02 3.954e+02 5.012e+02 6.836e+02, threshold=7.908e+02, percent-clipped=0.0 +2023-05-10 15:34:18,911 INFO [train.py:1021] (1/2) Epoch 31, batch 1100, loss[loss=0.153, simple_loss=0.2357, pruned_loss=0.03515, over 35741.00 frames. ], tot_loss[loss=0.1654, simple_loss=0.2586, pruned_loss=0.03616, over 7202130.87 frames. ], batch size: 79, lr: 3.55e-03, grad_scale: 16.0 +2023-05-10 15:34:20,732 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=550910.0, ans=0.125 +2023-05-10 15:34:22,318 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer1.prob, batch_count=550910.0, ans=0.125 +2023-05-10 15:34:26,914 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.6626, 3.8480, 4.2061, 3.8676], device='cuda:1') +2023-05-10 15:34:34,004 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0060-62364-0_sp0.9 from training. Duration: 21.361125 +2023-05-10 15:34:39,818 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0_sp1.1 from training. Duration: 27.0318125 +2023-05-10 15:34:40,025 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.0.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 15:34:52,681 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0_sp0.9 from training. Duration: 28.638875 +2023-05-10 15:35:10,214 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0054-76830-0 from training. Duration: 20.4 +2023-05-10 15:35:35,558 INFO [train.py:1021] (1/2) Epoch 31, batch 1150, loss[loss=0.1535, simple_loss=0.2439, pruned_loss=0.03153, over 37062.00 frames. ], tot_loss[loss=0.1656, simple_loss=0.2587, pruned_loss=0.0363, over 7213802.94 frames. ], batch size: 94, lr: 3.55e-03, grad_scale: 16.0 +2023-05-10 15:35:42,291 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0071-62375-0 from training. Duration: 20.025 +2023-05-10 15:35:43,839 WARNING [train.py:1182] (1/2) Exclude cut with ID 2364-131735-0112-64612-0_sp0.9 from training. Duration: 20.488875 +2023-05-10 15:35:45,527 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.balancer1.prob, batch_count=551160.0, ans=0.125 +2023-05-10 15:35:50,460 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0 from training. Duration: 29.735 +2023-05-10 15:36:08,934 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=551260.0, ans=0.0 +2023-05-10 15:36:22,883 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.conv_module1.whiten, num_groups=1, num_channels=192, metric=8.45 vs. 
limit=15.0 +2023-05-10 15:36:29,759 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=551310.0, ans=0.0 +2023-05-10 15:36:50,023 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.506e+02 3.034e+02 3.462e+02 4.224e+02 6.851e+02, threshold=6.925e+02, percent-clipped=0.0 +2023-05-10 15:36:53,000 INFO [train.py:1021] (1/2) Epoch 31, batch 1200, loss[loss=0.1631, simple_loss=0.2609, pruned_loss=0.03262, over 36894.00 frames. ], tot_loss[loss=0.165, simple_loss=0.258, pruned_loss=0.036, over 7228017.32 frames. ], batch size: 105, lr: 3.55e-03, grad_scale: 32.0 +2023-05-10 15:36:53,342 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer2.prob, batch_count=551410.0, ans=0.125 +2023-05-10 15:36:58,039 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.max_abs, batch_count=551410.0, ans=10.0 +2023-05-10 15:37:08,875 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=1.83 vs. limit=6.0 +2023-05-10 15:37:11,749 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.const_attention_rate, batch_count=551460.0, ans=0.025 +2023-05-10 15:37:15,976 WARNING [train.py:1182] (1/2) Exclude cut with ID 7276-92427-0014-12983-0_sp0.9 from training. Duration: 21.3055625 +2023-05-10 15:37:17,398 WARNING [train.py:1182] (1/2) Exclude cut with ID 1025-75365-0008-79168-0_sp0.9 from training. Duration: 22.0666875 +2023-05-10 15:37:41,632 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer2.prob, batch_count=551560.0, ans=0.125 +2023-05-10 15:37:50,851 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=4.69 vs. limit=15.0 +2023-05-10 15:38:05,580 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 15:38:09,535 INFO [train.py:1021] (1/2) Epoch 31, batch 1250, loss[loss=0.155, simple_loss=0.2455, pruned_loss=0.0323, over 37122.00 frames. ], tot_loss[loss=0.1655, simple_loss=0.2585, pruned_loss=0.03623, over 7207136.50 frames. ], batch size: 98, lr: 3.55e-03, grad_scale: 16.0 +2023-05-10 15:38:16,083 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass_mid.scale_min, batch_count=551660.0, ans=0.2 +2023-05-10 15:38:17,475 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=551660.0, ans=0.0 +2023-05-10 15:38:17,601 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff2_skip_rate, batch_count=551660.0, ans=0.0 +2023-05-10 15:39:07,125 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=551810.0, ans=0.125 +2023-05-10 15:39:11,678 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0062-62366-0 from training. Duration: 20.26 +2023-05-10 15:39:25,714 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.426e+02 2.971e+02 3.292e+02 3.644e+02 6.816e+02, threshold=6.584e+02, percent-clipped=0.0 +2023-05-10 15:39:27,285 INFO [train.py:1021] (1/2) Epoch 31, batch 1300, loss[loss=0.1687, simple_loss=0.2652, pruned_loss=0.03607, over 37189.00 frames. 
], tot_loss[loss=0.1651, simple_loss=0.258, pruned_loss=0.03605, over 7234912.26 frames. ], batch size: 102, lr: 3.55e-03, grad_scale: 16.0 +2023-05-10 15:39:28,898 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0030-9324-0_sp0.9 from training. Duration: 21.3444375 +2023-05-10 15:39:33,238 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=6.31 vs. limit=15.0 +2023-05-10 15:39:49,510 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.5.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 15:39:55,616 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=551960.0, ans=0.1 +2023-05-10 15:40:04,829 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=552010.0, ans=0.0 +2023-05-10 15:40:31,178 WARNING [train.py:1182] (1/2) Exclude cut with ID 497-129325-0061-62254-0_sp1.1 from training. Duration: 0.97725 +2023-05-10 15:40:44,774 INFO [train.py:1021] (1/2) Epoch 31, batch 1350, loss[loss=0.1639, simple_loss=0.2596, pruned_loss=0.03408, over 37078.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.2572, pruned_loss=0.03567, over 7233201.17 frames. ], batch size: 103, lr: 3.54e-03, grad_scale: 16.0 +2023-05-10 15:41:01,800 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer2.min_abs, batch_count=552210.0, ans=0.5 +2023-05-10 15:41:03,284 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer1.max_abs, batch_count=552210.0, ans=10.0 +2023-05-10 15:41:03,466 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=4.09 vs. limit=15.0 +2023-05-10 15:41:14,901 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0031-39906-0_sp0.9 from training. Duration: 22.97225 +2023-05-10 15:41:26,838 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=552260.0, ans=0.125 +2023-05-10 15:41:29,983 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_skip_rate, batch_count=552310.0, ans=0.0 +2023-05-10 15:41:49,378 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0047-39922-0_sp0.9 from training. Duration: 21.97775 +2023-05-10 15:41:59,635 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.485e+02 2.971e+02 3.534e+02 4.225e+02 6.633e+02, threshold=7.069e+02, percent-clipped=1.0 +2023-05-10 15:42:01,138 INFO [train.py:1021] (1/2) Epoch 31, batch 1400, loss[loss=0.1788, simple_loss=0.2728, pruned_loss=0.04238, over 36833.00 frames. ], tot_loss[loss=0.1648, simple_loss=0.2579, pruned_loss=0.03588, over 7229552.35 frames. ], batch size: 113, lr: 3.54e-03, grad_scale: 16.0 +2023-05-10 15:42:01,278 WARNING [train.py:1182] (1/2) Exclude cut with ID 1112-1043-0006-89194-0_sp0.9 from training. Duration: 21.8333125 +2023-05-10 15:42:03,617 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.whiten, num_groups=1, num_channels=192, metric=4.86 vs. limit=12.0 +2023-05-10 15:42:11,786 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0031-94921-0 from training. 
Duration: 20.47 +2023-05-10 15:42:23,866 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff3_skip_rate, batch_count=552460.0, ans=0.0 +2023-05-10 15:42:39,067 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=552510.0, ans=0.1 +2023-05-10 15:42:41,940 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.4130, 5.2616, 4.5584, 4.9855], device='cuda:1') +2023-05-10 15:42:44,857 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.1730, 4.5069, 4.6982, 4.3965], device='cuda:1') +2023-05-10 15:43:18,781 INFO [train.py:1021] (1/2) Epoch 31, batch 1450, loss[loss=0.1774, simple_loss=0.2705, pruned_loss=0.04221, over 36820.00 frames. ], tot_loss[loss=0.1658, simple_loss=0.2589, pruned_loss=0.03636, over 7197108.75 frames. ], batch size: 113, lr: 3.54e-03, grad_scale: 8.0 +2023-05-10 15:43:23,364 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0037-39912-0_sp0.9 from training. Duration: 20.67225 +2023-05-10 15:43:28,179 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.1008, 4.4391, 3.0399, 2.9739], device='cuda:1') +2023-05-10 15:43:37,367 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([3.0097, 4.1020, 4.6655, 4.8181], device='cuda:1') +2023-05-10 15:43:41,956 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=552710.0, ans=0.1 +2023-05-10 15:43:43,224 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0_sp0.9 from training. Duration: 25.2444375 +2023-05-10 15:43:56,900 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=552760.0, ans=0.1 +2023-05-10 15:44:09,793 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0021-76797-0_sp0.9 from training. Duration: 21.1445 +2023-05-10 15:44:11,503 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=552810.0, ans=0.1 +2023-05-10 15:44:19,468 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=7.94 vs. limit=22.5 +2023-05-10 15:44:35,231 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.433e+02 3.008e+02 3.523e+02 4.786e+02 7.344e+02, threshold=7.045e+02, percent-clipped=1.0 +2023-05-10 15:44:35,265 INFO [train.py:1021] (1/2) Epoch 31, batch 1500, loss[loss=0.1596, simple_loss=0.2512, pruned_loss=0.03395, over 37035.00 frames. ], tot_loss[loss=0.1654, simple_loss=0.2583, pruned_loss=0.03622, over 7190451.93 frames. 
], batch size: 99, lr: 3.54e-03, grad_scale: 8.0 +2023-05-10 15:44:55,100 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=552960.0, ans=0.1 +2023-05-10 15:44:55,128 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.attention_skip_rate, batch_count=552960.0, ans=0.0 +2023-05-10 15:45:02,282 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.attention_skip_rate, batch_count=552960.0, ans=0.0 +2023-05-10 15:45:04,095 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer1.prob, batch_count=552960.0, ans=0.125 +2023-05-10 15:45:17,605 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_skip_rate, batch_count=553010.0, ans=0.0 +2023-05-10 15:45:26,575 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=553060.0, ans=0.1 +2023-05-10 15:45:29,308 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0_sp0.9 from training. Duration: 33.038875 +2023-05-10 15:45:37,360 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=10.54 vs. limit=15.0 +2023-05-10 15:45:41,881 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=4.18 vs. limit=15.0 +2023-05-10 15:45:51,082 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=553160.0, ans=0.1 +2023-05-10 15:45:52,080 INFO [train.py:1021] (1/2) Epoch 31, batch 1550, loss[loss=0.1667, simple_loss=0.2663, pruned_loss=0.03349, over 32532.00 frames. ], tot_loss[loss=0.1656, simple_loss=0.2587, pruned_loss=0.0362, over 7202860.18 frames. ], batch size: 170, lr: 3.54e-03, grad_scale: 8.0 +2023-05-10 15:46:00,563 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3171, 4.0320, 2.1472, 2.4528], device='cuda:1') +2023-05-10 15:46:09,200 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64291-0000-16059-0_sp0.9 from training. Duration: 20.0944375 +2023-05-10 15:46:19,944 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.skip_rate, batch_count=553210.0, ans=0.04949747468305833 +2023-05-10 15:46:21,596 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.scale_min, batch_count=553260.0, ans=0.2 +2023-05-10 15:46:25,748 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0_sp1.1 from training. Duration: 20.4 +2023-05-10 15:46:33,167 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62851-0022-91297-0 from training. Duration: 20.085 +2023-05-10 15:46:38,206 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3369, 4.0236, 2.1358, 2.4560], device='cuda:1') +2023-05-10 15:46:43,726 WARNING [train.py:1182] (1/2) Exclude cut with ID 4860-13185-0032-76709-0_sp0.9 from training. 
Duration: 23.07775 +2023-05-10 15:46:53,713 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.ff3_skip_rate, batch_count=553360.0, ans=0.0 +2023-05-10 15:47:03,621 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=553360.0, ans=0.0 +2023-05-10 15:47:09,363 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.542e+02 2.986e+02 3.648e+02 4.500e+02 6.936e+02, threshold=7.296e+02, percent-clipped=0.0 +2023-05-10 15:47:09,397 INFO [train.py:1021] (1/2) Epoch 31, batch 1600, loss[loss=0.1678, simple_loss=0.2691, pruned_loss=0.03321, over 37084.00 frames. ], tot_loss[loss=0.1655, simple_loss=0.2588, pruned_loss=0.03615, over 7194321.63 frames. ], batch size: 110, lr: 3.54e-03, grad_scale: 16.0 +2023-05-10 15:47:27,337 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=8.72 vs. limit=10.0 +2023-05-10 15:47:35,316 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0_sp0.9 from training. Duration: 24.9333125 +2023-05-10 15:47:50,033 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=553510.0, ans=0.0 +2023-05-10 15:47:50,107 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.5.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 15:48:06,913 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.const_attention_rate, batch_count=553560.0, ans=0.025 +2023-05-10 15:48:20,060 WARNING [train.py:1182] (1/2) Exclude cut with ID 5118-111612-0016-124680-0_sp0.9 from training. Duration: 20.388875 +2023-05-10 15:48:20,326 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.7021, 4.9488, 5.2027, 4.8928], device='cuda:1') +2023-05-10 15:48:26,090 INFO [train.py:1021] (1/2) Epoch 31, batch 1650, loss[loss=0.1689, simple_loss=0.2642, pruned_loss=0.03674, over 36866.00 frames. ], tot_loss[loss=0.1655, simple_loss=0.2586, pruned_loss=0.03617, over 7202638.14 frames. ], batch size: 113, lr: 3.54e-03, grad_scale: 16.0 +2023-05-10 15:48:26,187 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0_sp1.1 from training. Duration: 20.3590625 +2023-05-10 15:48:32,299 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward2.hidden_balancer.prob, batch_count=553660.0, ans=0.125 +2023-05-10 15:48:48,663 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=553710.0, ans=0.125 +2023-05-10 15:48:57,642 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer2.min_positive, batch_count=553760.0, ans=0.05 +2023-05-10 15:49:08,182 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff2_skip_rate, batch_count=553760.0, ans=0.0 +2023-05-10 15:49:14,196 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=553810.0, ans=0.125 +2023-05-10 15:49:37,093 WARNING [train.py:1182] (1/2) Exclude cut with ID 3557-8342-0013-54691-0_sp1.1 from training. 
Duration: 0.836375 +2023-05-10 15:49:42,340 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=553910.0, ans=0.125 +2023-05-10 15:49:43,447 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.604e+02 3.201e+02 3.994e+02 5.631e+02 7.886e+02, threshold=7.988e+02, percent-clipped=6.0 +2023-05-10 15:49:43,477 INFO [train.py:1021] (1/2) Epoch 31, batch 1700, loss[loss=0.177, simple_loss=0.2675, pruned_loss=0.0432, over 36807.00 frames. ], tot_loss[loss=0.1657, simple_loss=0.2584, pruned_loss=0.0365, over 7189034.75 frames. ], batch size: 113, lr: 3.54e-03, grad_scale: 16.0 +2023-05-10 15:49:43,793 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=553910.0, ans=0.125 +2023-05-10 15:49:45,245 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.skip_rate, batch_count=553910.0, ans=0.07 +2023-05-10 15:50:05,146 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=5.70 vs. limit=15.0 +2023-05-10 15:50:06,282 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer2.prob, batch_count=553960.0, ans=0.125 +2023-05-10 15:50:06,414 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=553960.0, ans=0.125 +2023-05-10 15:50:23,615 WARNING [train.py:1182] (1/2) Exclude cut with ID 8565-290391-0049-67394-0_sp0.9 from training. Duration: 21.3166875 +2023-05-10 15:50:55,910 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0029-104863-0_sp0.9 from training. Duration: 22.1055625 +2023-05-10 15:51:00,258 INFO [train.py:1021] (1/2) Epoch 31, batch 1750, loss[loss=0.1764, simple_loss=0.2677, pruned_loss=0.0425, over 37075.00 frames. ], tot_loss[loss=0.1666, simple_loss=0.2584, pruned_loss=0.03733, over 7189152.31 frames. ], batch size: 116, lr: 3.54e-03, grad_scale: 16.0 +2023-05-10 15:51:07,885 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0_sp1.1 from training. Duration: 21.77725 +2023-05-10 15:51:14,365 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.0924, 4.2280, 4.5884, 4.6657], device='cuda:1') +2023-05-10 15:51:29,915 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0_sp0.9 from training. Duration: 27.8166875 +2023-05-10 15:51:53,016 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0_sp1.1 from training. Duration: 22.5090625 +2023-05-10 15:52:00,563 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0 from training. Duration: 25.035 +2023-05-10 15:52:03,852 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=554360.0, ans=0.1 +2023-05-10 15:52:07,545 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer2.prob, batch_count=554360.0, ans=0.125 +2023-05-10 15:52:16,803 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=6.34 vs. 
limit=15.0 +2023-05-10 15:52:17,446 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.880e+02 3.405e+02 3.760e+02 4.228e+02 6.080e+02, threshold=7.520e+02, percent-clipped=0.0 +2023-05-10 15:52:17,478 INFO [train.py:1021] (1/2) Epoch 31, batch 1800, loss[loss=0.1582, simple_loss=0.2387, pruned_loss=0.03889, over 36971.00 frames. ], tot_loss[loss=0.1678, simple_loss=0.259, pruned_loss=0.03831, over 7147236.34 frames. ], batch size: 86, lr: 3.54e-03, grad_scale: 16.0 +2023-05-10 15:52:19,096 WARNING [train.py:1182] (1/2) Exclude cut with ID 774-127930-0014-10412-0_sp1.1 from training. Duration: 0.95 +2023-05-10 15:52:33,400 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer1.prob, batch_count=554460.0, ans=0.125 +2023-05-10 15:52:37,510 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0_sp0.9 from training. Duration: 0.92225 +2023-05-10 15:52:42,340 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.0.self_attn_weights, attn_weights_entropy = tensor([6.2949, 5.6301, 5.4743, 6.0497], device='cuda:1') +2023-05-10 15:53:05,298 WARNING [train.py:1182] (1/2) Exclude cut with ID 4511-76322-0006-80011-0 from training. Duration: 21.97 +2023-05-10 15:53:22,976 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=3.83 vs. limit=12.0 +2023-05-10 15:53:25,366 WARNING [train.py:1182] (1/2) Exclude cut with ID 7492-105653-0055-62765-0_sp0.9 from training. Duration: 21.97225 +2023-05-10 15:53:25,628 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer_ff2.min_abs, batch_count=554610.0, ans=0.1 +2023-05-10 15:53:26,812 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0_sp0.9 from training. Duration: 25.3333125 +2023-05-10 15:53:34,225 INFO [train.py:1021] (1/2) Epoch 31, batch 1850, loss[loss=0.1522, simple_loss=0.2327, pruned_loss=0.03588, over 35922.00 frames. ], tot_loss[loss=0.1684, simple_loss=0.2588, pruned_loss=0.03903, over 7147467.11 frames. ], batch size: 79, lr: 3.54e-03, grad_scale: 16.0 +2023-05-10 15:53:35,865 WARNING [train.py:1182] (1/2) Exclude cut with ID 5172-29468-0015-19128-0_sp0.9 from training. Duration: 21.5055625 +2023-05-10 15:53:40,560 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=554660.0, ans=0.125 +2023-05-10 15:53:46,153 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0_sp1.1 from training. Duration: 20.72725 +2023-05-10 15:53:50,844 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer2.prob, batch_count=554710.0, ans=0.125 +2023-05-10 15:54:22,255 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=554810.0, ans=0.0 +2023-05-10 15:54:23,461 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0_sp0.9 from training. 
Duration: 26.32775 +2023-05-10 15:54:25,212 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer1.prob, batch_count=554810.0, ans=0.125 +2023-05-10 15:54:41,787 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass_mid.scale_min, batch_count=554860.0, ans=0.2 +2023-05-10 15:54:41,808 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=554860.0, ans=0.1 +2023-05-10 15:54:50,320 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=554910.0, ans=0.125 +2023-05-10 15:54:51,174 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.812e+02 3.440e+02 3.852e+02 4.214e+02 5.941e+02, threshold=7.704e+02, percent-clipped=0.0 +2023-05-10 15:54:51,214 INFO [train.py:1021] (1/2) Epoch 31, batch 1900, loss[loss=0.1772, simple_loss=0.2655, pruned_loss=0.04446, over 34278.00 frames. ], tot_loss[loss=0.1689, simple_loss=0.2584, pruned_loss=0.03967, over 7149962.62 frames. ], batch size: 144, lr: 3.54e-03, grad_scale: 16.0 +2023-05-10 15:54:52,044 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=9.54 vs. limit=15.0 +2023-05-10 15:54:55,616 WARNING [train.py:1182] (1/2) Exclude cut with ID 3867-173237-0077-144769-0 from training. Duration: 20.025 +2023-05-10 15:55:00,338 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=554910.0, ans=0.1 +2023-05-10 15:55:01,509 WARNING [train.py:1182] (1/2) Exclude cut with ID 6709-74022-0004-86860-0_sp1.1 from training. Duration: 0.9409375 +2023-05-10 15:55:01,518 WARNING [train.py:1182] (1/2) Exclude cut with ID 4757-1811-0023-62229-0_sp0.9 from training. Duration: 21.37775 +2023-05-10 15:55:20,689 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([3.0293, 3.3088, 3.7363, 3.7097], device='cuda:1') +2023-05-10 15:55:21,864 WARNING [train.py:1182] (1/2) Exclude cut with ID 1250-135782-0004-25974-0_sp0.9 from training. Duration: 21.17225 +2023-05-10 15:55:21,884 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0_sp0.9 from training. Duration: 27.511125 +2023-05-10 15:55:51,193 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=555110.0, ans=0.0 +2023-05-10 15:55:51,355 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=555110.0, ans=0.0 +2023-05-10 15:55:52,811 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.attention_skip_rate, batch_count=555110.0, ans=0.0 +2023-05-10 15:55:55,950 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.4048, 4.7234, 2.5265, 2.5948], device='cuda:1') +2023-05-10 15:55:57,008 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0 from training. Duration: 22.8 +2023-05-10 15:56:01,569 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0 from training. Duration: 22.585 +2023-05-10 15:56:08,040 INFO [train.py:1021] (1/2) Epoch 31, batch 1950, loss[loss=0.1595, simple_loss=0.2396, pruned_loss=0.03967, over 36785.00 frames. 
], tot_loss[loss=0.1699, simple_loss=0.2591, pruned_loss=0.04039, over 7143359.12 frames. ], batch size: 89, lr: 3.53e-03, grad_scale: 16.0 +2023-05-10 15:56:19,044 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_skip_rate, batch_count=555160.0, ans=0.0 +2023-05-10 15:56:31,053 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer1.prob, batch_count=555210.0, ans=0.125 +2023-05-10 15:56:32,971 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0001-146967-0_sp0.9 from training. Duration: 22.0166875 +2023-05-10 15:56:48,097 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0_sp1.1 from training. Duration: 24.395375 +2023-05-10 15:56:54,154 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0_sp0.9 from training. Duration: 27.47775 +2023-05-10 15:56:59,160 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0_sp0.9 from training. Duration: 24.8833125 +2023-05-10 15:56:59,480 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 15:56:59,570 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 15:57:02,250 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0 from training. Duration: 23.39 +2023-05-10 15:57:06,967 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.balancer.max_positive, batch_count=555310.0, ans=0.95 +2023-05-10 15:57:09,661 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0_sp0.9 from training. Duration: 28.72225 +2023-05-10 15:57:09,945 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.1.self_attn_weights, attn_weights_entropy = tensor([6.2782, 5.4496, 5.6079, 6.1879], device='cuda:1') +2023-05-10 15:57:09,971 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=555360.0, ans=0.0 +2023-05-10 15:57:13,029 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer2.prob, batch_count=555360.0, ans=0.125 +2023-05-10 15:57:18,592 WARNING [train.py:1182] (1/2) Exclude cut with ID 585-294811-0110-133686-0_sp0.9 from training. Duration: 20.8944375 +2023-05-10 15:57:25,051 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.744e+02 3.555e+02 4.043e+02 4.521e+02 6.056e+02, threshold=8.087e+02, percent-clipped=0.0 +2023-05-10 15:57:25,083 INFO [train.py:1021] (1/2) Epoch 31, batch 2000, loss[loss=0.186, simple_loss=0.2718, pruned_loss=0.05005, over 36695.00 frames. ], tot_loss[loss=0.171, simple_loss=0.2595, pruned_loss=0.04119, over 7115623.69 frames. ], batch size: 118, lr: 3.53e-03, grad_scale: 32.0 +2023-05-10 15:57:34,475 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer2.prob, batch_count=555410.0, ans=0.125 +2023-05-10 15:57:35,724 WARNING [train.py:1182] (1/2) Exclude cut with ID 5796-66357-0007-116447-0_sp0.9 from training. Duration: 23.8444375 +2023-05-10 15:57:45,028 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.const_attention_rate, batch_count=555460.0, ans=0.025 +2023-05-10 15:57:58,656 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0 from training. 
Duration: 25.85 +2023-05-10 15:58:00,136 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0023-13010-0 from training. Duration: 21.39 +2023-05-10 15:58:10,605 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0 from training. Duration: 27.92 +2023-05-10 15:58:38,372 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0039-130165-0_sp0.9 from training. Duration: 20.661125 +2023-05-10 15:58:39,153 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=6.72 vs. limit=15.0 +2023-05-10 15:58:41,244 INFO [train.py:1021] (1/2) Epoch 31, batch 2050, loss[loss=0.1559, simple_loss=0.2358, pruned_loss=0.03805, over 36841.00 frames. ], tot_loss[loss=0.1719, simple_loss=0.2601, pruned_loss=0.04181, over 7099480.12 frames. ], batch size: 84, lr: 3.53e-03, grad_scale: 32.0 +2023-05-10 15:58:43,280 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.const_attention_rate, batch_count=555660.0, ans=0.025 +2023-05-10 15:58:49,790 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=555660.0, ans=0.1 +2023-05-10 15:58:54,369 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer2.prob, batch_count=555660.0, ans=0.125 +2023-05-10 15:59:02,974 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0043-15874-0_sp0.9 from training. Duration: 20.07225 +2023-05-10 15:59:09,603 WARNING [train.py:1182] (1/2) Exclude cut with ID 1085-156170-0017-128270-0 from training. Duration: 21.01 +2023-05-10 15:59:29,555 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.attention_skip_rate, batch_count=555810.0, ans=0.0 +2023-05-10 15:59:58,076 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.786e+02 3.584e+02 4.318e+02 5.188e+02 8.901e+02, threshold=8.636e+02, percent-clipped=3.0 +2023-05-10 15:59:58,117 INFO [train.py:1021] (1/2) Epoch 31, batch 2100, loss[loss=0.1702, simple_loss=0.2599, pruned_loss=0.04025, over 36944.00 frames. ], tot_loss[loss=0.1719, simple_loss=0.2597, pruned_loss=0.04202, over 7110895.30 frames. ], batch size: 108, lr: 3.53e-03, grad_scale: 32.0 +2023-05-10 16:00:07,893 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer1.prob, batch_count=555910.0, ans=0.125 +2023-05-10 16:00:22,763 WARNING [train.py:1182] (1/2) Exclude cut with ID 2195-150901-0045-59933-0 from training. Duration: 20.65 +2023-05-10 16:00:32,191 WARNING [train.py:1182] (1/2) Exclude cut with ID 5796-66357-0007-116447-0 from training. Duration: 21.46 +2023-05-10 16:00:55,127 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer1.prob, batch_count=556060.0, ans=0.125 +2023-05-10 16:01:14,497 INFO [train.py:1021] (1/2) Epoch 31, batch 2150, loss[loss=0.1633, simple_loss=0.2489, pruned_loss=0.0388, over 37188.00 frames. ], tot_loss[loss=0.1721, simple_loss=0.2597, pruned_loss=0.04222, over 7132663.14 frames. ], batch size: 93, lr: 3.53e-03, grad_scale: 32.0 +2023-05-10 16:01:20,607 WARNING [train.py:1182] (1/2) Exclude cut with ID 3557-8342-0013-54691-0 from training. Duration: 0.92 +2023-05-10 16:01:28,566 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0023-13010-0_sp0.9 from training. 
Duration: 23.7666875 +2023-05-10 16:01:55,142 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.scale_min, batch_count=556260.0, ans=0.2 +2023-05-10 16:02:02,479 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.2673, 4.0585, 3.7742, 4.0697, 3.4150, 3.1801, 3.5906, 3.0791], + device='cuda:1') +2023-05-10 16:02:06,719 WARNING [train.py:1182] (1/2) Exclude cut with ID 8544-281189-0060-101339-0_sp0.9 from training. Duration: 20.861125 +2023-05-10 16:02:17,217 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-65654-0031-41259-0_sp0.9 from training. Duration: 22.711125 +2023-05-10 16:02:21,897 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=556360.0, ans=0.1 +2023-05-10 16:02:31,091 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.802e+02 3.512e+02 3.989e+02 4.976e+02 6.402e+02, threshold=7.979e+02, percent-clipped=0.0 +2023-05-10 16:02:31,130 INFO [train.py:1021] (1/2) Epoch 31, batch 2200, loss[loss=0.166, simple_loss=0.2591, pruned_loss=0.03641, over 37121.00 frames. ], tot_loss[loss=0.1726, simple_loss=0.2602, pruned_loss=0.04253, over 7122933.25 frames. ], batch size: 107, lr: 3.53e-03, grad_scale: 32.0 +2023-05-10 16:02:46,201 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn1.whiten, num_groups=1, num_channels=192, metric=10.68 vs. limit=22.5 +2023-05-10 16:02:57,442 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=556460.0, ans=0.125 +2023-05-10 16:03:01,654 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0_sp1.1 from training. Duration: 22.986375 +2023-05-10 16:03:02,005 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=556510.0, ans=0.125 +2023-05-10 16:03:20,331 WARNING [train.py:1182] (1/2) Exclude cut with ID 8040-260924-0003-80960-0_sp0.9 from training. Duration: 22.07225 +2023-05-10 16:03:22,052 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=556560.0, ans=0.0 +2023-05-10 16:03:24,805 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0045-26330-0_sp0.9 from training. Duration: 20.3055625 +2023-05-10 16:03:26,293 WARNING [train.py:1182] (1/2) Exclude cut with ID 6356-271890-0060-94317-0_sp0.9 from training. Duration: 20.72225 +2023-05-10 16:03:29,417 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=556560.0, ans=0.1 +2023-05-10 16:03:29,533 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([1.6568, 2.5501, 4.2749, 2.8342], device='cuda:1') +2023-05-10 16:03:37,710 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.attention_skip_rate, batch_count=556610.0, ans=0.0 +2023-05-10 16:03:46,836 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.0007, 4.3564, 2.6904, 2.7128], device='cuda:1') +2023-05-10 16:03:47,943 INFO [train.py:1021] (1/2) Epoch 31, batch 2250, loss[loss=0.1728, simple_loss=0.2591, pruned_loss=0.0432, over 36930.00 frames. ], tot_loss[loss=0.1719, simple_loss=0.2591, pruned_loss=0.04236, over 7124819.97 frames. 
], batch size: 100, lr: 3.53e-03, grad_scale: 32.0 +2023-05-10 16:03:48,027 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0_sp1.1 from training. Duration: 22.4818125 +2023-05-10 16:03:52,001 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.whiten, num_groups=1, num_channels=192, metric=4.24 vs. limit=12.0 +2023-05-10 16:04:06,228 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.attention_skip_rate, batch_count=556710.0, ans=0.0 +2023-05-10 16:04:10,883 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0_sp0.9 from training. Duration: 25.0944375 +2023-05-10 16:04:13,817 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0047-104881-0 from training. Duration: 21.515 +2023-05-10 16:04:16,078 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=7.02 vs. limit=15.0 +2023-05-10 16:04:21,379 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0_sp0.9 from training. Duration: 27.02225 +2023-05-10 16:04:23,531 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=3.56 vs. limit=10.0 +2023-05-10 16:04:26,260 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer2.prob, batch_count=556760.0, ans=0.125 +2023-05-10 16:04:27,441 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0010-62480-0_sp0.9 from training. Duration: 22.22225 +2023-05-10 16:04:35,556 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0085-44554-0_sp0.9 from training. Duration: 20.85 +2023-05-10 16:05:01,540 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=556860.0, ans=0.0 +2023-05-10 16:05:04,687 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.870e+02 3.533e+02 3.914e+02 4.507e+02 7.182e+02, threshold=7.827e+02, percent-clipped=0.0 +2023-05-10 16:05:04,727 INFO [train.py:1021] (1/2) Epoch 31, batch 2300, loss[loss=0.1475, simple_loss=0.2304, pruned_loss=0.0323, over 36807.00 frames. ], tot_loss[loss=0.1725, simple_loss=0.2596, pruned_loss=0.04271, over 7096676.18 frames. ], batch size: 84, lr: 3.53e-03, grad_scale: 32.0 +2023-05-10 16:05:07,715 WARNING [train.py:1182] (1/2) Exclude cut with ID 4295-39940-0007-92567-0 from training. Duration: 21.54 +2023-05-10 16:05:13,577 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0_sp1.1 from training. Duration: 20.5318125 +2023-05-10 16:05:24,705 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0012-134311-0_sp0.9 from training. Duration: 21.9333125 +2023-05-10 16:05:28,041 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.1519, 4.4125, 4.6748, 4.7490], device='cuda:1') +2023-05-10 16:05:33,034 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=2.15 vs. 
limit=6.0
+2023-05-10 16:05:43,326 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer1.prob, batch_count=557010.0, ans=0.125
+2023-05-10 16:05:56,634 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer2.prob, batch_count=557060.0, ans=0.125
+2023-05-10 16:06:08,107 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.3736, 3.3507, 3.1983, 4.0508, 2.2885, 3.4873, 4.0731, 3.5126],
+       device='cuda:1')
+2023-05-10 16:06:12,454 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0025-130151-0_sp0.9 from training. Duration: 21.7944375
+2023-05-10 16:06:14,666 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder_embed.convnext.hidden_balancer.prob, batch_count=557110.0, ans=0.125
+2023-05-10 16:06:22,009 INFO [train.py:1021] (1/2) Epoch 31, batch 2350, loss[loss=0.1558, simple_loss=0.2418, pruned_loss=0.03492, over 37040.00 frames. ], tot_loss[loss=0.1722, simple_loss=0.2591, pruned_loss=0.04267, over 7096950.96 frames. ], batch size: 94, lr: 3.53e-03, grad_scale: 32.0
+2023-05-10 16:06:26,565 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0002-12989-0_sp0.9 from training. Duration: 22.4666875
+2023-05-10 16:06:30,448 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=256, metric=6.22 vs. limit=22.5
+2023-05-10 16:06:32,888 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer2.prob, batch_count=557160.0, ans=0.125
+2023-05-10 16:06:32,941 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=557160.0, ans=0.1
+2023-05-10 16:06:33,072 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.9449, 3.9521, 4.4988, 4.7004], device='cuda:1')
+2023-05-10 16:06:34,221 WARNING [train.py:1182] (1/2) Exclude cut with ID 6121-9014-0076-24124-0 from training. Duration: 21.635
+2023-05-10 16:06:35,921 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=557210.0, ans=0.1
+2023-05-10 16:06:40,108 WARNING [train.py:1182] (1/2) Exclude cut with ID 6121-9014-0076-24124-0_sp0.9 from training. Duration: 24.038875
+2023-05-10 16:07:06,622 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass_mid.scale_min, batch_count=557310.0, ans=0.2
+2023-05-10 16:07:26,384 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0_sp1.1 from training. Duration: 21.786375
+2023-05-10 16:07:38,105 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.873e+02 3.512e+02 3.763e+02 4.233e+02 6.275e+02, threshold=7.527e+02, percent-clipped=0.0
+2023-05-10 16:07:38,145 INFO [train.py:1021] (1/2) Epoch 31, batch 2400, loss[loss=0.1534, simple_loss=0.2374, pruned_loss=0.0347, over 36954.00 frames. ], tot_loss[loss=0.1724, simple_loss=0.2594, pruned_loss=0.04271, over 7117365.61 frames. ], batch size: 95, lr: 3.53e-03, grad_scale: 32.0
+2023-05-10 16:07:38,266 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0002-12989-0 from training. Duration: 20.22
+2023-05-10 16:07:41,635 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer2.prob, batch_count=557410.0, ans=0.125
+2023-05-10 16:07:49,573 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=557410.0, ans=0.0
+2023-05-10 16:08:12,858 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.0.self_attn_weights, attn_weights_entropy = tensor([6.2697, 5.6024, 5.4094, 6.0164], device='cuda:1')
+2023-05-10 16:08:28,084 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.ff3_skip_rate, batch_count=557560.0, ans=0.0
+2023-05-10 16:08:29,996 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=2.00 vs. limit=6.0
+2023-05-10 16:08:38,671 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=557610.0, ans=0.125
+2023-05-10 16:08:55,209 INFO [train.py:1021] (1/2) Epoch 31, batch 2450, loss[loss=0.1874, simple_loss=0.2774, pruned_loss=0.04873, over 37051.00 frames. ], tot_loss[loss=0.1718, simple_loss=0.2589, pruned_loss=0.04234, over 7147982.46 frames. ], batch size: 116, lr: 3.53e-03, grad_scale: 32.0
+2023-05-10 16:09:14,148 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=557710.0, ans=0.125
+2023-05-10 16:09:44,540 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0 from training. Duration: 25.285
+2023-05-10 16:09:44,753 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.const_attention_rate, batch_count=557810.0, ans=0.025
+2023-05-10 16:10:08,231 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=14.01 vs. limit=15.0
+2023-05-10 16:10:11,889 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.849e+02 3.602e+02 4.214e+02 4.879e+02 7.729e+02, threshold=8.429e+02, percent-clipped=1.0
+2023-05-10 16:10:11,920 INFO [train.py:1021] (1/2) Epoch 31, batch 2500, loss[loss=0.1556, simple_loss=0.2363, pruned_loss=0.03746, over 36825.00 frames. ], tot_loss[loss=0.1719, simple_loss=0.2589, pruned_loss=0.04247, over 7143377.04 frames. ], batch size: 89, lr: 3.53e-03, grad_scale: 32.0
+2023-05-10 16:10:29,601 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=557960.0, ans=0.1
+2023-05-10 16:10:54,070 WARNING [train.py:1182] (1/2) Exclude cut with ID 811-130148-0001-63453-0_sp0.9 from training. Duration: 20.861125
+2023-05-10 16:11:09,631 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=558060.0, ans=0.125
+2023-05-10 16:11:15,274 WARNING [train.py:1182] (1/2) Exclude cut with ID 6010-56788-0055-90261-0 from training. Duration: 20.88
+2023-05-10 16:11:29,441 INFO [train.py:1021] (1/2) Epoch 31, batch 2550, loss[loss=0.1659, simple_loss=0.2478, pruned_loss=0.04202, over 37087.00 frames. ], tot_loss[loss=0.1721, simple_loss=0.2586, pruned_loss=0.04275, over 7111576.18 frames. ], batch size: 94, lr: 3.53e-03, grad_scale: 32.0
+2023-05-10 16:11:39,129 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer2.min_positive, batch_count=558160.0, ans=0.05
+2023-05-10 16:11:46,380 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0045-15876-0_sp0.9 from training. Duration: 23.4166875
+2023-05-10 16:12:06,586 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=558260.0, ans=0.1
+2023-05-10 16:12:09,416 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.scale_min, batch_count=558260.0, ans=0.2
+2023-05-10 16:12:18,358 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer1.prob, batch_count=558310.0, ans=0.125
+2023-05-10 16:12:29,458 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=558360.0, ans=0.125
+2023-05-10 16:12:42,388 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.ff3_skip_rate, batch_count=558360.0, ans=0.0
+2023-05-10 16:12:45,151 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.702e+02 3.568e+02 3.932e+02 4.556e+02 7.220e+02, threshold=7.865e+02, percent-clipped=0.0
+2023-05-10 16:12:45,185 INFO [train.py:1021] (1/2) Epoch 31, batch 2600, loss[loss=0.1558, simple_loss=0.2431, pruned_loss=0.03429, over 37185.00 frames. ], tot_loss[loss=0.1729, simple_loss=0.2593, pruned_loss=0.04319, over 7054500.16 frames. ], batch size: 93, lr: 3.52e-03, grad_scale: 32.0
+2023-05-10 16:12:47,198 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.out_combiner.scale_min, batch_count=558410.0, ans=0.2
+2023-05-10 16:13:00,928 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0006-134305-0 from training. Duration: 21.24
+2023-05-10 16:13:00,945 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0047-104881-0_sp0.9 from training. Duration: 23.9055625
+2023-05-10 16:13:01,362 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.5.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00
+2023-05-10 16:13:36,082 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0_sp0.9 from training. Duration: 25.988875
+2023-05-10 16:13:42,561 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.const_attention_rate, batch_count=558560.0, ans=0.025
+2023-05-10 16:13:43,801 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0001-134300-0_sp0.9 from training. Duration: 20.67225
+2023-05-10 16:13:45,638 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.0927, 4.4260, 2.8750, 3.2977], device='cuda:1')
+2023-05-10 16:14:02,157 INFO [train.py:1021] (1/2) Epoch 31, batch 2650, loss[loss=0.1971, simple_loss=0.2808, pruned_loss=0.05671, over 36306.00 frames. ], tot_loss[loss=0.1729, simple_loss=0.2594, pruned_loss=0.04317, over 7063140.32 frames. ], batch size: 126, lr: 3.52e-03, grad_scale: 32.0
+2023-05-10 16:14:11,373 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=192, metric=9.92 vs. limit=15.0
+2023-05-10 16:14:34,337 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0038-41224-0 from training. Duration: 20.34
+2023-05-10 16:15:03,788 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.skip_rate, batch_count=558860.0, ans=0.07
+2023-05-10 16:15:18,094 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.921e+02 3.403e+02 3.615e+02 4.137e+02 6.796e+02, threshold=7.230e+02, percent-clipped=0.0
+2023-05-10 16:15:18,132 INFO [train.py:1021] (1/2) Epoch 31, batch 2700, loss[loss=0.1618, simple_loss=0.2448, pruned_loss=0.03942, over 37000.00 frames. ], tot_loss[loss=0.1723, simple_loss=0.2594, pruned_loss=0.04261, over 7111219.27 frames. ], batch size: 99, lr: 3.52e-03, grad_scale: 32.0
+2023-05-10 16:15:27,613 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer1.max_abs, batch_count=558910.0, ans=10.0
+2023-05-10 16:15:38,765 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=558960.0, ans=0.0
+2023-05-10 16:15:41,904 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.const_attention_rate, batch_count=558960.0, ans=0.025
+2023-05-10 16:15:44,858 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder_embed.convnext.layerdrop_rate, batch_count=558960.0, ans=0.015
+2023-05-10 16:15:54,768 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0_sp0.9 from training. Duration: 25.061125
+2023-05-10 16:15:56,545 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer1.prob, batch_count=559010.0, ans=0.125
+2023-05-10 16:15:58,189 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.6233, 3.9523, 4.2145, 3.9144], device='cuda:1')
+2023-05-10 16:16:00,961 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=559010.0, ans=0.0
+2023-05-10 16:16:05,233 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0 from training. Duration: 0.83
+2023-05-10 16:16:12,929 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass_mid.scale_min, batch_count=559060.0, ans=0.2
+2023-05-10 16:16:16,184 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.0816, 2.4747, 3.4010, 2.6126], device='cuda:1')
+2023-05-10 16:16:25,458 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer1.prob, batch_count=559110.0, ans=0.125
+2023-05-10 16:16:33,248 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0 from training. Duration: 24.73
+2023-05-10 16:16:36,193 INFO [train.py:1021] (1/2) Epoch 31, batch 2750, loss[loss=0.1811, simple_loss=0.2715, pruned_loss=0.0453, over 37075.00 frames. ], tot_loss[loss=0.1722, simple_loss=0.2592, pruned_loss=0.04263, over 7131062.99 frames. ], batch size: 110, lr: 3.52e-03, grad_scale: 32.0
+2023-05-10 16:16:45,212 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0 from training. Duration: 23.965
+2023-05-10 16:16:56,179 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0030-146996-0_sp0.9 from training. Duration: 22.088875
+2023-05-10 16:17:11,225 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0006-134305-0_sp0.9 from training. Duration: 23.6
+2023-05-10 16:17:29,879 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff2_skip_rate, batch_count=559310.0, ans=0.0
+2023-05-10 16:17:52,696 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.949e+02 3.591e+02 4.328e+02 5.388e+02 7.598e+02, threshold=8.657e+02, percent-clipped=4.0
+2023-05-10 16:17:52,729 INFO [train.py:1021] (1/2) Epoch 31, batch 2800, loss[loss=0.1666, simple_loss=0.2499, pruned_loss=0.04164, over 37060.00 frames. ], tot_loss[loss=0.1722, simple_loss=0.2591, pruned_loss=0.04259, over 7132828.35 frames. ], batch size: 94, lr: 3.52e-03, grad_scale: 32.0
+2023-05-10 16:17:56,153 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer2.prob, batch_count=559410.0, ans=0.125
+2023-05-10 16:18:12,715 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.prob, batch_count=559460.0, ans=0.125
+2023-05-10 16:18:45,650 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=559560.0, ans=0.1
+2023-05-10 16:19:00,643 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0 from training. Duration: 23.795
+2023-05-10 16:19:09,260 INFO [train.py:1021] (1/2) Epoch 31, batch 2850, loss[loss=0.1874, simple_loss=0.2796, pruned_loss=0.04764, over 35973.00 frames. ], tot_loss[loss=0.1722, simple_loss=0.2591, pruned_loss=0.04261, over 7150946.98 frames. ], batch size: 133, lr: 3.52e-03, grad_scale: 16.0
+2023-05-10 16:19:19,142 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0_sp1.1 from training. Duration: 21.5409375
+2023-05-10 16:19:22,153 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0_sp0.9 from training. Duration: 24.97775
+2023-05-10 16:19:22,332 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=559660.0, ans=0.0
+2023-05-10 16:19:34,766 WARNING [train.py:1182] (1/2) Exclude cut with ID 1085-156170-0017-128270-0_sp0.9 from training. Duration: 23.3444375
+2023-05-10 16:19:56,090 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=559810.0, ans=0.1
+2023-05-10 16:20:03,426 WARNING [train.py:1182] (1/2) Exclude cut with ID 6010-56788-0055-90261-0_sp0.9 from training. Duration: 23.2
+2023-05-10 16:20:05,450 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=1.93 vs. limit=6.0
+2023-05-10 16:20:09,376 WARNING [train.py:1182] (1/2) Exclude cut with ID 5653-46179-0060-117930-0_sp0.9 from training. Duration: 21.17225
+2023-05-10 16:20:26,949 INFO [train.py:1021] (1/2) Epoch 31, batch 2900, loss[loss=0.1577, simple_loss=0.239, pruned_loss=0.03817, over 36842.00 frames. ], tot_loss[loss=0.1717, simple_loss=0.2587, pruned_loss=0.0424, over 7127190.28 frames. ], batch size: 84, lr: 3.52e-03, grad_scale: 16.0
+2023-05-10 16:20:28,314 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.953e+02 3.836e+02 4.434e+02 5.575e+02 8.595e+02, threshold=8.868e+02, percent-clipped=0.0
+2023-05-10 16:20:33,135 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0_sp0.9 from training. Duration: 24.6555625
+2023-05-10 16:21:19,402 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=560060.0, ans=0.125
+2023-05-10 16:21:31,204 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-65654-0031-41259-0 from training. Duration: 20.44
+2023-05-10 16:21:44,566 INFO [train.py:1021] (1/2) Epoch 31, batch 2950, loss[loss=0.1713, simple_loss=0.2617, pruned_loss=0.0404, over 37096.00 frames. ], tot_loss[loss=0.1718, simple_loss=0.2585, pruned_loss=0.0425, over 7123292.09 frames. ], batch size: 103, lr: 3.52e-03, grad_scale: 16.0
+2023-05-10 16:21:46,133 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0018-132285-0_sp0.9 from training. Duration: 23.45
+2023-05-10 16:21:47,974 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=560160.0, ans=0.1
+2023-05-10 16:22:19,225 WARNING [train.py:1182] (1/2) Exclude cut with ID 6945-60535-0076-12784-0_sp0.9 from training. Duration: 20.52225
+2023-05-10 16:22:28,082 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0 from training. Duration: 22.19
+2023-05-10 16:22:38,674 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0_sp1.1 from training. Duration: 25.3818125
+2023-05-10 16:22:45,260 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.skip_rate, batch_count=560360.0, ans=0.04949747468305833
+2023-05-10 16:22:58,783 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0_sp0.9 from training. Duration: 28.0944375
+2023-05-10 16:23:01,928 INFO [train.py:1021] (1/2) Epoch 31, batch 3000, loss[loss=0.1676, simple_loss=0.2527, pruned_loss=0.04124, over 36977.00 frames. ], tot_loss[loss=0.1714, simple_loss=0.2583, pruned_loss=0.04223, over 7171117.40 frames. ], batch size: 91, lr: 3.52e-03, grad_scale: 16.0
+2023-05-10 16:23:01,928 INFO [train.py:1048] (1/2) Computing validation loss
+2023-05-10 16:23:12,860 INFO [train.py:1057] (1/2) Epoch 31, validation: loss=0.1522, simple_loss=0.2533, pruned_loss=0.02555, over 944034.00 frames.
+2023-05-10 16:23:12,860 INFO [train.py:1058] (1/2) Maximum memory allocated so far is 18662MB
+2023-05-10 16:23:14,341 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.810e+02 3.515e+02 3.984e+02 4.552e+02 6.373e+02, threshold=7.968e+02, percent-clipped=0.0
+2023-05-10 16:23:14,838 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer2.prob, batch_count=560410.0, ans=0.125
+2023-05-10 16:23:15,923 WARNING [train.py:1182] (1/2) Exclude cut with ID 2195-150901-0045-59933-0_sp0.9 from training. Duration: 22.9444375
+2023-05-10 16:23:20,797 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00
+2023-05-10 16:23:23,439 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0_sp1.1 from training. Duration: 21.6318125
+2023-05-10 16:23:39,339 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0 from training. Duration: 23.695
+2023-05-10 16:23:56,529 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_skip_rate, batch_count=560510.0, ans=0.0
+2023-05-10 16:23:59,576 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([4.9595, 4.2672, 3.0619, 2.8513], device='cuda:1')
+2023-05-10 16:24:05,415 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer1.prob, batch_count=560560.0, ans=0.125
+2023-05-10 16:24:08,167 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0 from training. Duration: 23.955
+2023-05-10 16:24:17,480 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=560610.0, ans=0.125
+2023-05-10 16:24:27,129 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=256, metric=12.45 vs. limit=22.5
+2023-05-10 16:24:29,297 INFO [train.py:1021] (1/2) Epoch 31, batch 3050, loss[loss=0.196, simple_loss=0.2829, pruned_loss=0.05454, over 37028.00 frames. ], tot_loss[loss=0.1713, simple_loss=0.2583, pruned_loss=0.04215, over 7166828.42 frames. ], batch size: 116, lr: 3.52e-03, grad_scale: 16.0
+2023-05-10 16:24:29,677 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer2.min_positive, batch_count=560660.0, ans=0.05
+2023-05-10 16:24:31,906 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff3_skip_rate, batch_count=560660.0, ans=0.0
+2023-05-10 16:24:42,979 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=3.83 vs. limit=12.0
+2023-05-10 16:24:43,686 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0_sp0.9 from training. Duration: 26.438875
+2023-05-10 16:25:07,994 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=560760.0, ans=0.0
+2023-05-10 16:25:28,381 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=560810.0, ans=0.125
+2023-05-10 16:25:29,578 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0021-26306-0_sp0.9 from training. Duration: 21.2444375
+2023-05-10 16:25:29,606 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0_sp0.9 from training. Duration: 31.02225
+2023-05-10 16:25:29,905 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.scale_min, batch_count=560860.0, ans=0.2
+2023-05-10 16:25:42,298 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0 from training. Duration: 22.395
+2023-05-10 16:25:46,704 INFO [train.py:1021] (1/2) Epoch 31, batch 3100, loss[loss=0.1636, simple_loss=0.2501, pruned_loss=0.03851, over 37148.00 frames. ], tot_loss[loss=0.1718, simple_loss=0.259, pruned_loss=0.0423, over 7127890.30 frames. ], batch size: 98, lr: 3.52e-03, grad_scale: 16.0
+2023-05-10 16:25:48,275 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.890e+02 3.495e+02 3.799e+02 4.441e+02 8.344e+02, threshold=7.597e+02, percent-clipped=1.0
+2023-05-10 16:25:57,306 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0045-15876-0 from training. Duration: 21.075
+2023-05-10 16:26:03,307 WARNING [train.py:1182] (1/2) Exclude cut with ID 6482-98857-0025-147532-0_sp0.9 from training. Duration: 20.0055625
+2023-05-10 16:26:03,319 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0037-132304-0_sp0.9 from training. Duration: 22.05
+2023-05-10 16:26:03,346 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0 from training. Duration: 26.8349375
+2023-05-10 16:26:06,366 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0_sp1.1 from training. Duration: 22.1090625
+2023-05-10 16:26:15,520 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0_sp0.9 from training. Duration: 26.6166875
+2023-05-10 16:26:34,902 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=561060.0, ans=0.125
+2023-05-10 16:26:36,184 WARNING [train.py:1182] (1/2) Exclude cut with ID 2046-178027-0000-53705-0_sp0.9 from training. Duration: 20.3055625
+2023-05-10 16:26:46,987 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.const_attention_rate, batch_count=561110.0, ans=0.025
+2023-05-10 16:27:00,428 WARNING [train.py:1182] (1/2) Exclude cut with ID 7205-50138-0008-5373-0_sp0.9 from training. Duration: 20.7
+2023-05-10 16:27:03,353 INFO [train.py:1021] (1/2) Epoch 31, batch 3150, loss[loss=0.1587, simple_loss=0.2348, pruned_loss=0.0413, over 36855.00 frames. ], tot_loss[loss=0.1712, simple_loss=0.258, pruned_loss=0.04221, over 7120907.65 frames. ], batch size: 84, lr: 3.52e-03, grad_scale: 16.0
+2023-05-10 16:27:10,960 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=561160.0, ans=0.125
+2023-05-10 16:27:15,797 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=561160.0, ans=0.1
+2023-05-10 16:27:44,592 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0 from training. Duration: 22.48
+2023-05-10 16:27:56,845 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=561310.0, ans=0.125
+2023-05-10 16:28:00,986 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0_sp0.9 from training. Duration: 29.816625
+2023-05-10 16:28:17,472 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=561360.0, ans=0.125
+2023-05-10 16:28:19,952 INFO [train.py:1021] (1/2) Epoch 31, batch 3200, loss[loss=0.1887, simple_loss=0.2799, pruned_loss=0.04872, over 36756.00 frames. ], tot_loss[loss=0.1712, simple_loss=0.2584, pruned_loss=0.04202, over 7146104.65 frames. ], batch size: 122, lr: 3.52e-03, grad_scale: 32.0
+2023-05-10 16:28:21,418 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.789e+02 3.475e+02 3.864e+02 4.298e+02 6.134e+02, threshold=7.729e+02, percent-clipped=0.0
+2023-05-10 16:28:23,068 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0_sp1.1 from training. Duration: 22.7590625
+2023-05-10 16:28:29,106 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0 from training. Duration: 22.555
+2023-05-10 16:28:38,326 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff2_skip_rate, batch_count=561460.0, ans=0.0
+2023-05-10 16:28:48,563 WARNING [train.py:1182] (1/2) Exclude cut with ID 1250-135782-0005-25975-0_sp0.9 from training. Duration: 21.688875
+2023-05-10 16:28:49,251 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=6.30 vs. limit=15.0
+2023-05-10 16:29:09,290 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.attention_skip_rate, batch_count=561560.0, ans=0.0
+2023-05-10 16:29:12,328 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.hidden_balancer.prob, batch_count=561560.0, ans=0.125
+2023-05-10 16:29:23,174 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=561610.0, ans=0.0
+2023-05-10 16:29:24,520 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.out_combiner.scale_min, batch_count=561610.0, ans=0.2
+2023-05-10 16:29:25,782 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0038-41224-0_sp0.9 from training. Duration: 22.6
+2023-05-10 16:29:26,076 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.prob, batch_count=561610.0, ans=0.125
+2023-05-10 16:29:36,292 INFO [train.py:1021] (1/2) Epoch 31, batch 3250, loss[loss=0.1886, simple_loss=0.2786, pruned_loss=0.04936, over 32292.00 frames. ], tot_loss[loss=0.1717, simple_loss=0.2589, pruned_loss=0.04222, over 7136297.11 frames. ], batch size: 170, lr: 3.51e-03, grad_scale: 32.0
+2023-05-10 16:30:05,999 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0 from training. Duration: 24.32
+2023-05-10 16:30:52,252 INFO [train.py:1021] (1/2) Epoch 31, batch 3300, loss[loss=0.1669, simple_loss=0.2618, pruned_loss=0.03601, over 37002.00 frames. ], tot_loss[loss=0.1722, simple_loss=0.2596, pruned_loss=0.04239, over 7121867.83 frames. ], batch size: 104, lr: 3.51e-03, grad_scale: 32.0
+2023-05-10 16:30:53,661 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.787e+02 3.413e+02 3.831e+02 4.452e+02 6.471e+02, threshold=7.662e+02, percent-clipped=0.0
+2023-05-10 16:31:04,360 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-276745-0093-13116-0_sp0.9 from training. Duration: 21.061125
+2023-05-10 16:31:06,007 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=561960.0, ans=0.125
+2023-05-10 16:31:09,387 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=4.12 vs. limit=15.0
+2023-05-10 16:31:15,268 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.skip_rate, batch_count=561960.0, ans=0.09899494936611666
+2023-05-10 16:31:19,477 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0024-15855-0_sp0.9 from training. Duration: 20.32225
+2023-05-10 16:31:19,749 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=561960.0, ans=0.125
+2023-05-10 16:31:32,726 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0_sp1.1 from training. Duration: 0.7545625
+2023-05-10 16:31:36,217 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.scale_min, batch_count=562060.0, ans=0.2
+2023-05-10 16:31:41,329 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=562060.0, ans=0.125
+2023-05-10 16:31:49,191 WARNING [train.py:1182] (1/2) Exclude cut with ID 4295-39940-0007-92567-0_sp0.9 from training. Duration: 23.9333125
+2023-05-10 16:31:53,851 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=562110.0, ans=0.1
+2023-05-10 16:32:08,436 INFO [train.py:1021] (1/2) Epoch 31, batch 3350, loss[loss=0.1675, simple_loss=0.258, pruned_loss=0.03849, over 37012.00 frames. ], tot_loss[loss=0.1715, simple_loss=0.2589, pruned_loss=0.04204, over 7139368.69 frames. ], batch size: 104, lr: 3.51e-03, grad_scale: 32.0
+2023-05-10 16:32:19,557 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=7.10 vs. limit=15.0
+2023-05-10 16:32:22,225 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer2.prob, batch_count=562210.0, ans=0.125
+2023-05-10 16:32:24,168 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=9.95 vs. limit=22.5
+2023-05-10 16:32:24,956 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0_sp1.1 from training. Duration: 20.17275
+2023-05-10 16:32:28,141 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=562210.0, ans=0.0
+2023-05-10 16:32:29,518 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0_sp1.1 from training. Duration: 20.436375
+2023-05-10 16:32:29,838 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=562210.0, ans=0.0
+2023-05-10 16:32:48,324 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer1.prob, batch_count=562260.0, ans=0.125
+2023-05-10 16:33:03,171 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=562310.0, ans=0.125
+2023-05-10 16:33:24,003 INFO [train.py:1021] (1/2) Epoch 31, batch 3400, loss[loss=0.1772, simple_loss=0.2678, pruned_loss=0.04326, over 36404.00 frames. ], tot_loss[loss=0.1717, simple_loss=0.2593, pruned_loss=0.04208, over 7123969.31 frames. ], batch size: 126, lr: 3.51e-03, grad_scale: 32.0
+2023-05-10 16:33:26,170 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.848e+02 3.549e+02 3.917e+02 4.340e+02 6.268e+02, threshold=7.834e+02, percent-clipped=0.0
+2023-05-10 16:33:26,523 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=562410.0, ans=0.125
+2023-05-10 16:33:45,149 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer2.prob, batch_count=562460.0, ans=0.125
+2023-05-10 16:33:53,848 WARNING [train.py:1182] (1/2) Exclude cut with ID 4234-40345-0022-142709-0_sp0.9 from training. Duration: 23.1055625
+2023-05-10 16:33:55,458 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0_sp1.1 from training. Duration: 23.5
+2023-05-10 16:34:07,270 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0_sp0.9 from training. Duration: 26.62775
+2023-05-10 16:34:07,485 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.skip_rate, batch_count=562510.0, ans=0.04949747468305833
+2023-05-10 16:34:09,008 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=562560.0, ans=0.0
+2023-05-10 16:34:10,368 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer1.prob, batch_count=562560.0, ans=0.125
+2023-05-10 16:34:21,401 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0018-132285-0 from training. Duration: 21.105
+2023-05-10 16:34:27,912 WARNING [train.py:1182] (1/2) Exclude cut with ID 4511-76322-0006-80011-0_sp0.9 from training. Duration: 24.411125
+2023-05-10 16:34:41,243 INFO [train.py:1021] (1/2) Epoch 31, batch 3450, loss[loss=0.1538, simple_loss=0.2411, pruned_loss=0.0332, over 37025.00 frames. ], tot_loss[loss=0.1713, simple_loss=0.2587, pruned_loss=0.04198, over 7113888.53 frames. ], batch size: 99, lr: 3.51e-03, grad_scale: 32.0
+2023-05-10 16:34:53,485 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0_sp1.1 from training. Duration: 21.263625
+2023-05-10 16:35:00,122 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=256, metric=19.54 vs. limit=22.5
+2023-05-10 16:35:28,904 WARNING [train.py:1182] (1/2) Exclude cut with ID 4234-40345-0022-142709-0 from training. Duration: 20.795
+2023-05-10 16:35:39,362 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0 from training. Duration: 24.76
+2023-05-10 16:35:40,804 WARNING [train.py:1182] (1/2) Exclude cut with ID 3867-173237-0077-144769-0_sp0.9 from training. Duration: 22.25
+2023-05-10 16:35:45,686 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer1.prob, batch_count=562860.0, ans=0.125
+2023-05-10 16:35:51,602 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([4.9977, 4.3526, 3.0578, 3.1836], device='cuda:1')
+2023-05-10 16:35:55,252 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=5.72 vs. limit=15.0
+2023-05-10 16:35:57,265 INFO [train.py:1021] (1/2) Epoch 31, batch 3500, loss[loss=0.1673, simple_loss=0.2581, pruned_loss=0.0383, over 35817.00 frames. ], tot_loss[loss=0.1709, simple_loss=0.2585, pruned_loss=0.04164, over 7138850.85 frames. ], batch size: 133, lr: 3.51e-03, grad_scale: 32.0
+2023-05-10 16:35:58,770 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.956e+02 3.547e+02 4.086e+02 4.716e+02 7.284e+02, threshold=8.173e+02, percent-clipped=0.0
+2023-05-10 16:36:07,290 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0_sp1.1 from training. Duration: 20.5045625
+2023-05-10 16:36:29,202 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=563010.0, ans=0.1
+2023-05-10 16:36:35,313 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([4.9986, 4.1751, 4.5325, 4.6120], device='cuda:1')
+2023-05-10 16:36:46,623 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass.skip_rate, batch_count=563060.0, ans=0.04949747468305833
+2023-05-10 16:36:55,475 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.4332, 3.4987, 3.8167, 3.4089], device='cuda:1')
+2023-05-10 16:37:12,362 INFO [train.py:1021] (1/2) Epoch 31, batch 3550, loss[loss=0.1735, simple_loss=0.2616, pruned_loss=0.04272, over 37177.00 frames. ], tot_loss[loss=0.1705, simple_loss=0.258, pruned_loss=0.04153, over 7137658.53 frames. ], batch size: 102, lr: 3.51e-03, grad_scale: 32.0
+2023-05-10 16:37:13,645 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=5.17 vs. limit=15.0
+2023-05-10 16:37:35,425 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.skip_rate, batch_count=563210.0, ans=0.09899494936611666
+2023-05-10 16:37:41,245 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=563260.0, ans=0.1
+2023-05-10 16:37:51,633 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=4.91 vs. limit=15.0
+2023-05-10 16:38:17,358 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.5.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00
+2023-05-10 16:38:25,643 INFO [train.py:1021] (1/2) Epoch 31, batch 3600, loss[loss=0.1542, simple_loss=0.2346, pruned_loss=0.03692, over 36968.00 frames. ], tot_loss[loss=0.1706, simple_loss=0.2579, pruned_loss=0.04161, over 7144222.04 frames. ], batch size: 91, lr: 3.51e-03, grad_scale: 32.0
+2023-05-10 16:38:26,996 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.781e+02 3.444e+02 3.876e+02 4.601e+02 6.604e+02, threshold=7.752e+02, percent-clipped=0.0
+2023-05-10 16:38:34,187 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.const_attention_rate, batch_count=563410.0, ans=0.025
+2023-05-10 16:39:00,920 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=563510.0, ans=0.125
+2023-05-10 16:39:03,561 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=563510.0, ans=0.0
+2023-05-10 16:39:31,974 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp1.1 from training. Duration: 22.2954375
+2023-05-10 16:39:36,304 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer2.prob, batch_count=563590.0, ans=0.125
+2023-05-10 16:39:38,984 INFO [train.py:1021] (1/2) Epoch 32, batch 0, loss[loss=0.1669, simple_loss=0.2643, pruned_loss=0.03481, over 37065.00 frames. ], tot_loss[loss=0.1669, simple_loss=0.2643, pruned_loss=0.03481, over 37065.00 frames. ], batch size: 103, lr: 3.45e-03, grad_scale: 32.0
+2023-05-10 16:39:38,984 INFO [train.py:1048] (1/2) Computing validation loss
+2023-05-10 16:39:49,763 INFO [train.py:1057] (1/2) Epoch 32, validation: loss=0.1529, simple_loss=0.2541, pruned_loss=0.02586, over 944034.00 frames.
+2023-05-10 16:39:49,764 INFO [train.py:1058] (1/2) Maximum memory allocated so far is 18662MB
+2023-05-10 16:40:11,243 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=563640.0, ans=0.1
+2023-05-10 16:40:23,408 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=563690.0, ans=0.125
+2023-05-10 16:40:27,785 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00
+2023-05-10 16:40:43,789 WARNING [train.py:1182] (1/2) Exclude cut with ID 298-126791-0067-24026-0_sp0.9 from training. Duration: 21.438875
+2023-05-10 16:40:49,821 WARNING [train.py:1182] (1/2) Exclude cut with ID 5652-39938-0025-23684-0_sp0.9 from training. Duration: 22.2055625
+2023-05-10 16:40:55,903 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer2.prob, batch_count=563790.0, ans=0.125
+2023-05-10 16:41:06,144 INFO [train.py:1021] (1/2) Epoch 32, batch 50, loss[loss=0.1839, simple_loss=0.2764, pruned_loss=0.04573, over 35881.00 frames. ], tot_loss[loss=0.1682, simple_loss=0.2614, pruned_loss=0.03751, over 1629616.15 frames. ], batch size: 133, lr: 3.45e-03, grad_scale: 32.0
+2023-05-10 16:41:15,713 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.skip_rate, batch_count=563840.0, ans=0.09899494936611666
+2023-05-10 16:41:25,110 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=256, metric=6.44 vs. limit=22.5
+2023-05-10 16:41:28,767 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.520e+02 3.172e+02 3.790e+02 4.255e+02 6.631e+02, threshold=7.580e+02, percent-clipped=0.0
+2023-05-10 16:41:59,116 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer2.prob, batch_count=563990.0, ans=0.125
+2023-05-10 16:42:22,854 INFO [train.py:1021] (1/2) Epoch 32, batch 100, loss[loss=0.165, simple_loss=0.2612, pruned_loss=0.03434, over 36908.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.257, pruned_loss=0.03576, over 2893394.69 frames. ], batch size: 105, lr: 3.45e-03, grad_scale: 32.0
+2023-05-10 16:42:23,197 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=564090.0, ans=0.1
+2023-05-10 16:42:48,305 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer2.prob, batch_count=564140.0, ans=0.125
+2023-05-10 16:43:25,454 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=4.13 vs. limit=10.0
+2023-05-10 16:43:33,008 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer1.prob, batch_count=564290.0, ans=0.125
+2023-05-10 16:43:39,874 INFO [train.py:1021] (1/2) Epoch 32, batch 150, loss[loss=0.1627, simple_loss=0.2612, pruned_loss=0.03213, over 36931.00 frames. ], tot_loss[loss=0.1636, simple_loss=0.2556, pruned_loss=0.03577, over 3856214.11 frames. ], batch size: 108, lr: 3.45e-03, grad_scale: 32.0
+2023-05-10 16:43:44,729 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=564340.0, ans=0.1
+2023-05-10 16:43:50,544 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=564340.0, ans=0.1
+2023-05-10 16:43:53,656 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer1.prob, batch_count=564390.0, ans=0.125
+2023-05-10 16:43:59,554 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00
+2023-05-10 16:44:01,995 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.446e+02 2.946e+02 3.285e+02 4.107e+02 5.787e+02, threshold=6.569e+02, percent-clipped=0.0
+2023-05-10 16:44:03,725 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0 from training. Duration: 24.525
+2023-05-10 16:44:21,148 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer2.min_abs, batch_count=564440.0, ans=0.5
+2023-05-10 16:44:41,141 WARNING [train.py:1182] (1/2) Exclude cut with ID 3699-47246-0007-3408-0_sp0.9 from training. Duration: 20.26675
+2023-05-10 16:44:56,033 INFO [train.py:1021] (1/2) Epoch 32, batch 200, loss[loss=0.1632, simple_loss=0.2553, pruned_loss=0.0355, over 37167.00 frames. ], tot_loss[loss=0.1632, simple_loss=0.2555, pruned_loss=0.0354, over 4607264.89 frames. ], batch size: 102, lr: 3.45e-03, grad_scale: 32.0
+2023-05-10 16:44:56,163 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp0.9 from training. Duration: 27.25
+2023-05-10 16:45:45,545 INFO [checkpoint.py:75] (1/2) Saving checkpoint to pruned_transducer_stateless7/exp1119-smaller-md1500/bad-model-1.pt
+2023-05-10 16:45:46,559 INFO [train.py:1307] (1/2) Saving batch to pruned_transducer_stateless7/exp1119-smaller-md1500/batch-7aeff54e-808c-46a6-1f49-2fba47a1fca7.pt
+2023-05-10 16:45:46,804 INFO [train.py:1313] (1/2) features shape: torch.Size([89, 1670, 80])
+2023-05-10 16:45:46,814 INFO [train.py:1317] (1/2) num tokens: 6945