diff --git "a/exp/log/log-train-2023-05-10-16-57-56-1" "b/exp/log/log-train-2023-05-10-16-57-56-1" new file mode 100644--- /dev/null +++ "b/exp/log/log-train-2023-05-10-16-57-56-1" @@ -0,0 +1,2856 @@ +2023-05-10 16:57:56,325 INFO [train.py:1091] (1/2) Training started +2023-05-10 16:57:56,325 INFO [train.py:1101] (1/2) Device: cuda:1 +2023-05-10 16:57:56,329 INFO [train.py:1110] (1/2) {'best_train_loss': inf, 'best_valid_loss': inf, 'best_train_epoch': -1, 'best_valid_epoch': -1, 'batch_idx_train': 0, 'log_interval': 50, 'reset_interval': 200, 'valid_interval': 3000, 'feature_dim': 80, 'subsampling_factor': 4, 'warm_step': 2000, 'env_info': {'k2-version': '1.23.4', 'k2-build-type': 'Release', 'k2-with-cuda': True, 'k2-git-sha1': '7efe024b23078ffa0bcb5598afff14f356edae7c', 'k2-git-date': 'Mon Jan 30 20:22:57 2023', 'lhotse-version': '1.12.0.dev+git.891bad1.clean', 'torch-version': '1.10.0+cu102', 'torch-cuda-available': True, 'torch-cuda-version': '10.2', 'python-version': '3.8', 'icefall-git-branch': 'from_dan_scaled_adam_exp1119', 'icefall-git-sha1': '432b2fa3-dirty', 'icefall-git-date': 'Mon May 8 18:46:45 2023', 'icefall-path': '/ceph-zw/workspace/zipformer/icefall_dan_streaming', 'k2-path': '/ceph-zw/workspace/k2/k2/k2/python/k2/__init__.py', 'lhotse-path': '/ceph-zw/workspace/share/lhotse/lhotse/__init__.py', 'hostname': 'de-74279-k2-train-6-0423201309-7c68fd68fb-6cszs', 'IP address': '10.177.28.83'}, 'world_size': 2, 'master_port': 12348, 'tensorboard': True, 'num_epochs': 35, 'start_epoch': 32, 'start_batch': 0, 'exp_dir': PosixPath('pruned_transducer_stateless7/exp1119-smaller-md1500'), 'bpe_model': 'data/lang_bpe_500/bpe.model', 'base_lr': 0.04, 'lr_batches': 7500, 'lr_epochs': 3.5, 'lr_warmup_start': 0.5, 'ref_duration': 600, 'context_size': 2, 'prune_range': 5, 'lm_scale': 0.25, 'am_scale': 0.0, 'simple_loss_scale': 0.5, 'seed': 42, 'print_diagnostics': False, 'inf_check': False, 'save_every_n': 4000, 'keep_last_k': 30, 'average_period': 200, 'use_fp16': True, 'num_encoder_layers': '2,2,2,2,2,2', 'downsampling_factor': '1,2,4,8,4,2', 'feedforward_dim': '512,768,768,768,768,768', 'num_heads': '4,4,4,8,4,4', 'encoder_dim': '192,256,256,256,256,256', 'query_head_dim': '32', 'value_head_dim': '12', 'pos_head_dim': '4', 'pos_dim': 48, 'encoder_unmasked_dim': '192,192,192,192,192,192', 'cnn_module_kernel': '31,31,15,15,15,31', 'decoder_dim': 512, 'joiner_dim': 512, 'causal': False, 'chunk_size': '16,32,64,-1', 'left_context_frames': '64,128,256,-1', 'full_libri': True, 'manifest_dir': PosixPath('data/fbank'), 'max_duration': 1500, 'bucketing_sampler': True, 'num_buckets': 30, 'concatenate_cuts': False, 'duration_factor': 1.0, 'gap': 1.0, 'on_the_fly_feats': False, 'shuffle': True, 'drop_last': True, 'return_cuts': True, 'num_workers': 2, 'enable_spec_aug': True, 'spec_aug_time_warp_factor': 80, 'enable_musan': True, 'input_strategy': 'PrecomputedFeatures', 'blank_id': 0, 'vocab_size': 500} +2023-05-10 16:57:56,329 INFO [train.py:1112] (1/2) About to create model +2023-05-10 16:57:56,788 INFO [train.py:1116] (1/2) Number of model parameters: 23285615 +2023-05-10 16:57:56,788 INFO [checkpoint.py:112] (1/2) Loading checkpoint from pruned_transducer_stateless7/exp1119-smaller-md1500/epoch-31.pt +2023-05-10 16:58:03,413 INFO [train.py:1131] (1/2) Using DDP +2023-05-10 16:58:04,092 INFO [train.py:1145] (1/2) Loading optimizer state dict +2023-05-10 16:58:04,333 INFO [train.py:1153] (1/2) Loading scheduler state dict +2023-05-10 16:58:04,333 INFO [asr_datamodule.py:409] (1/2) About to get train-clean-100 cuts +2023-05-10 16:58:04,354 INFO [asr_datamodule.py:416] (1/2) About to get train-clean-360 cuts +2023-05-10 16:58:04,356 INFO [asr_datamodule.py:423] (1/2) About to get train-other-500 cuts +2023-05-10 16:58:04,357 INFO [asr_datamodule.py:225] (1/2) Enable MUSAN +2023-05-10 16:58:04,357 INFO [asr_datamodule.py:226] (1/2) About to get Musan cuts +2023-05-10 16:58:06,653 INFO [asr_datamodule.py:254] (1/2) Enable SpecAugment +2023-05-10 16:58:06,653 INFO [asr_datamodule.py:255] (1/2) Time warp factor: 80 +2023-05-10 16:58:06,653 INFO [asr_datamodule.py:267] (1/2) Num frame mask: 10 +2023-05-10 16:58:06,654 INFO [asr_datamodule.py:280] (1/2) About to create train dataset +2023-05-10 16:58:06,654 INFO [asr_datamodule.py:309] (1/2) Using DynamicBucketingSampler. +2023-05-10 16:58:11,934 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp1.1 from training. Duration: 22.2954375 +2023-05-10 16:58:13,592 INFO [asr_datamodule.py:324] (1/2) About to create train dataloader +2023-05-10 16:58:13,594 INFO [asr_datamodule.py:430] (1/2) About to get dev-clean cuts +2023-05-10 16:58:13,595 INFO [asr_datamodule.py:437] (1/2) About to get dev-other cuts +2023-05-10 16:58:13,596 INFO [asr_datamodule.py:355] (1/2) About to create dev dataset +2023-05-10 16:58:13,928 INFO [asr_datamodule.py:374] (1/2) About to create dev dataloader +2023-05-10 16:58:13,929 INFO [train.py:1329] (1/2) Sanity check -- see if any of the batches in epoch 1 would cause OOM. +2023-05-10 16:58:19,282 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp1.1 from training. Duration: 22.2954375 +2023-05-10 16:58:26,390 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp1.1 from training. Duration: 22.2954375 +2023-05-10 16:58:31,667 WARNING [train.py:1182] (1/2) Exclude cut with ID 298-126791-0067-24026-0_sp0.9 from training. Duration: 21.438875 +2023-05-10 16:58:31,924 WARNING [train.py:1182] (1/2) Exclude cut with ID 5652-39938-0025-23684-0_sp0.9 from training. Duration: 22.2055625 +2023-05-10 16:58:42,090 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0 from training. Duration: 24.525 +2023-05-10 16:58:44,544 WARNING [train.py:1182] (1/2) Exclude cut with ID 3699-47246-0007-3408-0_sp0.9 from training. Duration: 20.26675 +2023-05-10 16:58:45,218 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp0.9 from training. Duration: 27.25 +2023-05-10 16:58:49,017 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64292-0017-15984-0 from training. Duration: 21.68 +2023-05-10 16:58:49,708 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0007-59342-0 from training. Duration: 21.6300625 +2023-05-10 16:58:51,607 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0007-59342-0_sp0.9 from training. Duration: 24.033375 +2023-05-10 16:58:54,773 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0 from training. Duration: 22.905 +2023-05-10 16:58:54,852 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0_sp1.1 from training. Duration: 23.4318125 +2023-05-10 16:59:01,611 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0_sp1.1 from training. Duration: 20.82275 +2023-05-10 16:59:01,694 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0_sp0.9 from training. Duration: 25.45 +2023-05-10 16:59:05,550 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0 from training. Duration: 25.775 +2023-05-10 16:59:06,660 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0071-62375-0_sp0.9 from training. Duration: 22.25 +2023-05-10 16:59:08,027 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0 from training. Duration: 26.205 +2023-05-10 16:59:08,882 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0_sp0.9 from training. Duration: 30.1555625 +2023-05-10 16:59:09,151 WARNING [train.py:1182] (1/2) Exclude cut with ID 1265-135635-0050-6781-0_sp0.9 from training. Duration: 21.8333125 +2023-05-10 16:59:09,599 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0_sp1.1 from training. Duration: 20.6545625 +2023-05-10 16:59:12,352 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0045-39920-0_sp0.9 from training. Duration: 20.52225 +2023-05-10 16:59:13,380 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0_sp0.9 from training. Duration: 29.1166875 +2023-05-10 16:59:16,868 WARNING [train.py:1182] (1/2) Exclude cut with ID 543-133211-0007-59831-0_sp0.9 from training. Duration: 21.388875 +2023-05-10 16:59:18,681 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0 from training. Duration: 22.72 +2023-05-10 16:59:18,756 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0031-94921-0_sp0.9 from training. Duration: 22.7444375 +2023-05-10 16:59:20,994 WARNING [train.py:1182] (1/2) Exclude cut with ID 4133-6541-0027-40495-0_sp1.1 from training. Duration: 0.9681875 +2023-05-10 16:59:21,179 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62851-0022-91297-0_sp0.9 from training. Duration: 22.3166875 +2023-05-10 16:59:21,460 WARNING [train.py:1182] (1/2) Exclude cut with ID 543-133212-0015-59917-0_sp0.9 from training. Duration: 21.8166875 +2023-05-10 16:59:27,289 WARNING [train.py:1182] (1/2) Exclude cut with ID 4957-30119-0041-23990-0_sp0.9 from training. Duration: 20.22775 +2023-05-10 16:59:30,202 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0_sp1.1 from training. Duration: 24.67275 +2023-05-10 16:59:31,662 WARNING [train.py:1182] (1/2) Exclude cut with ID 3082-165428-0081-50734-0_sp0.9 from training. Duration: 21.8055625 +2023-05-10 16:59:33,526 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0054-76830-0_sp0.9 from training. Duration: 22.6666875 +2023-05-10 16:59:37,345 WARNING [train.py:1182] (1/2) Exclude cut with ID 2411-132532-0017-82279-0_sp1.1 from training. Duration: 0.9681875 +2023-05-10 16:59:38,693 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0 from training. Duration: 22.485 +2023-05-10 16:59:40,568 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0_sp1.1 from training. Duration: 23.82275 +2023-05-10 16:59:41,251 WARNING [train.py:1182] (1/2) Exclude cut with ID 4860-13185-0032-76709-0 from training. Duration: 20.77 +2023-05-10 16:59:42,254 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64292-0017-15984-0_sp0.9 from training. Duration: 24.088875 +2023-05-10 16:59:43,748 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0_sp1.1 from training. Duration: 20.4409375 +2023-05-10 16:59:47,653 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0062-62366-0_sp0.9 from training. Duration: 22.511125 +2023-05-10 16:59:47,686 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0031-39906-0 from training. Duration: 20.675 +2023-05-10 16:59:53,532 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0_sp0.9 from training. Duration: 24.9833125 +2023-05-10 16:59:55,738 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0 from training. Duration: 27.14 +2023-05-10 16:59:56,520 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0 from training. Duration: 22.44 +2023-05-10 17:00:00,413 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0060-62364-0_sp0.9 from training. Duration: 21.361125 +2023-05-10 17:00:00,739 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0_sp1.1 from training. Duration: 27.0318125 +2023-05-10 17:00:01,832 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0_sp0.9 from training. Duration: 28.638875 +2023-05-10 17:00:02,708 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0054-76830-0 from training. Duration: 20.4 +2023-05-10 17:00:04,280 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0071-62375-0 from training. Duration: 20.025 +2023-05-10 17:00:04,292 WARNING [train.py:1182] (1/2) Exclude cut with ID 2364-131735-0112-64612-0_sp0.9 from training. Duration: 20.488875 +2023-05-10 17:00:04,571 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0 from training. Duration: 29.735 +2023-05-10 17:00:09,308 WARNING [train.py:1182] (1/2) Exclude cut with ID 7276-92427-0014-12983-0_sp0.9 from training. Duration: 21.3055625 +2023-05-10 17:00:09,378 WARNING [train.py:1182] (1/2) Exclude cut with ID 1025-75365-0008-79168-0_sp0.9 from training. Duration: 22.0666875 +2023-05-10 17:00:15,309 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0062-62366-0 from training. Duration: 20.26 +2023-05-10 17:00:16,024 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0030-9324-0_sp0.9 from training. Duration: 21.3444375 +2023-05-10 17:00:18,925 WARNING [train.py:1182] (1/2) Exclude cut with ID 497-129325-0061-62254-0_sp1.1 from training. Duration: 0.97725 +2023-05-10 17:00:21,767 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0031-39906-0_sp0.9 from training. Duration: 22.97225 +2023-05-10 17:00:23,383 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0047-39922-0_sp0.9 from training. Duration: 21.97775 +2023-05-10 17:00:24,047 WARNING [train.py:1182] (1/2) Exclude cut with ID 1112-1043-0006-89194-0_sp0.9 from training. Duration: 21.8333125 +2023-05-10 17:00:24,576 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0031-94921-0 from training. Duration: 20.47 +2023-05-10 17:00:28,539 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0037-39912-0_sp0.9 from training. Duration: 20.67225 +2023-05-10 17:00:29,486 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0_sp0.9 from training. Duration: 25.2444375 +2023-05-10 17:00:30,775 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0021-76797-0_sp0.9 from training. Duration: 21.1445 +2023-05-10 17:00:35,286 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0_sp0.9 from training. Duration: 33.038875 +2023-05-10 17:00:37,203 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64291-0000-16059-0_sp0.9 from training. Duration: 20.0944375 +2023-05-10 17:00:37,958 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0_sp1.1 from training. Duration: 20.4 +2023-05-10 17:00:38,354 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62851-0022-91297-0 from training. Duration: 20.085 +2023-05-10 17:00:38,915 WARNING [train.py:1182] (1/2) Exclude cut with ID 4860-13185-0032-76709-0_sp0.9 from training. Duration: 23.07775 +2023-05-10 17:00:42,096 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0_sp0.9 from training. Duration: 24.9333125 +2023-05-10 17:00:44,393 WARNING [train.py:1182] (1/2) Exclude cut with ID 5118-111612-0016-124680-0_sp0.9 from training. Duration: 20.388875 +2023-05-10 17:00:44,738 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0_sp1.1 from training. Duration: 20.3590625 +2023-05-10 17:00:48,832 WARNING [train.py:1182] (1/2) Exclude cut with ID 3557-8342-0013-54691-0_sp1.1 from training. Duration: 0.836375 +2023-05-10 17:00:51,081 WARNING [train.py:1182] (1/2) Exclude cut with ID 8565-290391-0049-67394-0_sp0.9 from training. Duration: 21.3166875 +2023-05-10 17:00:53,261 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0029-104863-0_sp0.9 from training. Duration: 22.1055625 +2023-05-10 17:00:53,792 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0_sp1.1 from training. Duration: 21.77725 +2023-05-10 17:00:54,759 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0_sp0.9 from training. Duration: 27.8166875 +2023-05-10 17:00:55,988 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0_sp1.1 from training. Duration: 22.5090625 +2023-05-10 17:00:56,287 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0 from training. Duration: 25.035 +2023-05-10 17:00:57,133 WARNING [train.py:1182] (1/2) Exclude cut with ID 774-127930-0014-10412-0_sp1.1 from training. Duration: 0.95 +2023-05-10 17:00:58,082 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0_sp0.9 from training. Duration: 0.92225 +2023-05-10 17:00:59,997 WARNING [train.py:1182] (1/2) Exclude cut with ID 4511-76322-0006-80011-0 from training. Duration: 21.97 +2023-05-10 17:01:00,918 WARNING [train.py:1182] (1/2) Exclude cut with ID 7492-105653-0055-62765-0_sp0.9 from training. Duration: 21.97225 +2023-05-10 17:01:00,957 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0_sp0.9 from training. Duration: 25.3333125 +2023-05-10 17:01:01,469 WARNING [train.py:1182] (1/2) Exclude cut with ID 5172-29468-0015-19128-0_sp0.9 from training. Duration: 21.5055625 +2023-05-10 17:01:01,938 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0_sp1.1 from training. Duration: 20.72725 +2023-05-10 17:01:03,535 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0_sp0.9 from training. Duration: 26.32775 +2023-05-10 17:01:05,702 WARNING [train.py:1182] (1/2) Exclude cut with ID 3867-173237-0077-144769-0 from training. Duration: 20.025 +2023-05-10 17:01:05,960 WARNING [train.py:1182] (1/2) Exclude cut with ID 6709-74022-0004-86860-0_sp1.1 from training. Duration: 0.9409375 +2023-05-10 17:01:05,969 WARNING [train.py:1182] (1/2) Exclude cut with ID 4757-1811-0023-62229-0_sp0.9 from training. Duration: 21.37775 +2023-05-10 17:01:06,974 WARNING [train.py:1182] (1/2) Exclude cut with ID 1250-135782-0004-25974-0_sp0.9 from training. Duration: 21.17225 +2023-05-10 17:01:06,984 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0_sp0.9 from training. Duration: 27.511125 +2023-05-10 17:01:08,572 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0 from training. Duration: 22.8 +2023-05-10 17:01:08,796 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0 from training. Duration: 22.585 +2023-05-10 17:01:10,896 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0001-146967-0_sp0.9 from training. Duration: 22.0166875 +2023-05-10 17:01:11,717 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0_sp1.1 from training. Duration: 24.395375 +2023-05-10 17:01:12,057 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0_sp0.9 from training. Duration: 27.47775 +2023-05-10 17:01:12,282 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0_sp0.9 from training. Duration: 24.8833125 +2023-05-10 17:01:12,438 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0 from training. Duration: 23.39 +2023-05-10 17:01:12,783 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0_sp0.9 from training. Duration: 28.72225 +2023-05-10 17:01:13,262 WARNING [train.py:1182] (1/2) Exclude cut with ID 585-294811-0110-133686-0_sp0.9 from training. Duration: 20.8944375 +2023-05-10 17:01:13,972 WARNING [train.py:1182] (1/2) Exclude cut with ID 5796-66357-0007-116447-0_sp0.9 from training. Duration: 23.8444375 +2023-05-10 17:01:15,198 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0 from training. Duration: 25.85 +2023-05-10 17:01:15,207 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0023-13010-0 from training. Duration: 21.39 +2023-05-10 17:01:15,733 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0 from training. Duration: 27.92 +2023-05-10 17:01:17,767 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0039-130165-0_sp0.9 from training. Duration: 20.661125 +2023-05-10 17:01:19,007 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0043-15874-0_sp0.9 from training. Duration: 20.07225 +2023-05-10 17:01:19,379 WARNING [train.py:1182] (1/2) Exclude cut with ID 1085-156170-0017-128270-0 from training. Duration: 21.01 +2023-05-10 17:01:23,461 WARNING [train.py:1182] (1/2) Exclude cut with ID 2195-150901-0045-59933-0 from training. Duration: 20.65 +2023-05-10 17:01:23,812 WARNING [train.py:1182] (1/2) Exclude cut with ID 5796-66357-0007-116447-0 from training. Duration: 21.46 +2023-05-10 17:01:26,080 WARNING [train.py:1182] (1/2) Exclude cut with ID 3557-8342-0013-54691-0 from training. Duration: 0.92 +2023-05-10 17:01:26,411 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0023-13010-0_sp0.9 from training. Duration: 23.7666875 +2023-05-10 17:01:28,213 WARNING [train.py:1182] (1/2) Exclude cut with ID 8544-281189-0060-101339-0_sp0.9 from training. Duration: 20.861125 +2023-05-10 17:01:28,711 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-65654-0031-41259-0_sp0.9 from training. Duration: 22.711125 +2023-05-10 17:01:31,445 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0_sp1.1 from training. Duration: 22.986375 +2023-05-10 17:01:32,199 WARNING [train.py:1182] (1/2) Exclude cut with ID 8040-260924-0003-80960-0_sp0.9 from training. Duration: 22.07225 +2023-05-10 17:01:32,432 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0045-26330-0_sp0.9 from training. Duration: 20.3055625 +2023-05-10 17:01:32,554 WARNING [train.py:1182] (1/2) Exclude cut with ID 6356-271890-0060-94317-0_sp0.9 from training. Duration: 20.72225 +2023-05-10 17:01:33,451 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0_sp1.1 from training. Duration: 22.4818125 +2023-05-10 17:01:34,814 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0_sp0.9 from training. Duration: 25.0944375 +2023-05-10 17:01:35,611 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0047-104881-0 from training. Duration: 21.515 +2023-05-10 17:01:35,919 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0_sp0.9 from training. Duration: 27.02225 +2023-05-10 17:01:36,143 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0010-62480-0_sp0.9 from training. Duration: 22.22225 +2023-05-10 17:01:36,494 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0085-44554-0_sp0.9 from training. Duration: 20.85 +2023-05-10 17:01:38,140 WARNING [train.py:1182] (1/2) Exclude cut with ID 4295-39940-0007-92567-0 from training. Duration: 21.54 +2023-05-10 17:01:38,383 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0_sp1.1 from training. Duration: 20.5318125 +2023-05-10 17:01:38,871 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0012-134311-0_sp0.9 from training. Duration: 21.9333125 +2023-05-10 17:01:41,952 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0025-130151-0_sp0.9 from training. Duration: 21.7944375 +2023-05-10 17:01:42,563 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0002-12989-0_sp0.9 from training. Duration: 22.4666875 +2023-05-10 17:01:42,937 WARNING [train.py:1182] (1/2) Exclude cut with ID 6121-9014-0076-24124-0 from training. Duration: 21.635 +2023-05-10 17:01:43,239 WARNING [train.py:1182] (1/2) Exclude cut with ID 6121-9014-0076-24124-0_sp0.9 from training. Duration: 24.038875 +2023-05-10 17:01:45,319 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0_sp1.1 from training. Duration: 21.786375 +2023-05-10 17:01:45,888 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0002-12989-0 from training. Duration: 20.22 +2023-05-10 17:01:52,681 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0 from training. Duration: 25.285 +2023-05-10 17:01:56,439 WARNING [train.py:1182] (1/2) Exclude cut with ID 811-130148-0001-63453-0_sp0.9 from training. Duration: 20.861125 +2023-05-10 17:01:57,526 WARNING [train.py:1182] (1/2) Exclude cut with ID 6010-56788-0055-90261-0 from training. Duration: 20.88 +2023-05-10 17:01:59,132 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0045-15876-0_sp0.9 from training. Duration: 23.4166875 +2023-05-10 17:02:03,498 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0006-134305-0 from training. Duration: 21.24 +2023-05-10 17:02:03,516 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0047-104881-0_sp0.9 from training. Duration: 23.9055625 +2023-05-10 17:02:05,304 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0_sp0.9 from training. Duration: 25.988875 +2023-05-10 17:02:06,308 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0001-134300-0_sp0.9 from training. Duration: 20.67225 +2023-05-10 17:02:08,663 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0038-41224-0 from training. Duration: 20.34 +2023-05-10 17:02:12,912 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0_sp0.9 from training. Duration: 25.061125 +2023-05-10 17:02:13,474 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0 from training. Duration: 0.83 +2023-05-10 17:02:14,716 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0 from training. Duration: 24.73 +2023-05-10 17:02:15,302 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0 from training. Duration: 23.965 +2023-05-10 17:02:15,721 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0030-146996-0_sp0.9 from training. Duration: 22.088875 +2023-05-10 17:02:16,521 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0006-134305-0_sp0.9 from training. Duration: 23.6 +2023-05-10 17:02:22,082 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0 from training. Duration: 23.795 +2023-05-10 17:02:22,902 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0_sp1.1 from training. Duration: 21.5409375 +2023-05-10 17:02:23,036 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0_sp0.9 from training. Duration: 24.97775 +2023-05-10 17:02:24,204 WARNING [train.py:1182] (1/2) Exclude cut with ID 1085-156170-0017-128270-0_sp0.9 from training. Duration: 23.3444375 +2023-05-10 17:02:25,623 WARNING [train.py:1182] (1/2) Exclude cut with ID 6010-56788-0055-90261-0_sp0.9 from training. Duration: 23.2 +2023-05-10 17:02:25,886 WARNING [train.py:1182] (1/2) Exclude cut with ID 5653-46179-0060-117930-0_sp0.9 from training. Duration: 21.17225 +2023-05-10 17:02:26,840 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0_sp0.9 from training. Duration: 24.6555625 +2023-05-10 17:02:30,093 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-65654-0031-41259-0 from training. Duration: 20.44 +2023-05-10 17:02:30,822 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0018-132285-0_sp0.9 from training. Duration: 23.45 +2023-05-10 17:02:32,278 WARNING [train.py:1182] (1/2) Exclude cut with ID 6945-60535-0076-12784-0_sp0.9 from training. Duration: 20.52225 +2023-05-10 17:02:32,640 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0 from training. Duration: 22.19 +2023-05-10 17:02:33,138 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0_sp1.1 from training. Duration: 25.3818125 +2023-05-10 17:02:33,961 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0_sp0.9 from training. Duration: 28.0944375 +2023-05-10 17:02:34,259 WARNING [train.py:1182] (1/2) Exclude cut with ID 2195-150901-0045-59933-0_sp0.9 from training. Duration: 22.9444375 +2023-05-10 17:02:34,668 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0_sp1.1 from training. Duration: 21.6318125 +2023-05-10 17:02:36,045 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0 from training. Duration: 23.695 +2023-05-10 17:02:37,339 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0 from training. Duration: 23.955 +2023-05-10 17:02:39,056 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0_sp0.9 from training. Duration: 26.438875 +2023-05-10 17:02:41,232 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0021-26306-0_sp0.9 from training. Duration: 21.2444375 +2023-05-10 17:02:41,279 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0_sp0.9 from training. Duration: 31.02225 +2023-05-10 17:02:42,299 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0 from training. Duration: 22.395 +2023-05-10 17:02:43,086 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0045-15876-0 from training. Duration: 21.075 +2023-05-10 17:02:43,330 WARNING [train.py:1182] (1/2) Exclude cut with ID 6482-98857-0025-147532-0_sp0.9 from training. Duration: 20.0055625 +2023-05-10 17:02:43,348 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0037-132304-0_sp0.9 from training. Duration: 22.05 +2023-05-10 17:02:43,360 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0 from training. Duration: 26.8349375 +2023-05-10 17:02:43,508 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0_sp1.1 from training. Duration: 22.1090625 +2023-05-10 17:02:43,835 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0_sp0.9 from training. Duration: 26.6166875 +2023-05-10 17:02:44,688 WARNING [train.py:1182] (1/2) Exclude cut with ID 2046-178027-0000-53705-0_sp0.9 from training. Duration: 20.3055625 +2023-05-10 17:02:45,793 WARNING [train.py:1182] (1/2) Exclude cut with ID 7205-50138-0008-5373-0_sp0.9 from training. Duration: 20.7 +2023-05-10 17:02:48,782 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0 from training. Duration: 22.48 +2023-05-10 17:02:49,648 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0_sp0.9 from training. Duration: 29.816625 +2023-05-10 17:02:50,632 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0_sp1.1 from training. Duration: 22.7590625 +2023-05-10 17:02:50,931 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0 from training. Duration: 22.555 +2023-05-10 17:02:51,977 WARNING [train.py:1182] (1/2) Exclude cut with ID 1250-135782-0005-25975-0_sp0.9 from training. Duration: 21.688875 +2023-05-10 17:02:54,323 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0038-41224-0_sp0.9 from training. Duration: 22.6 +2023-05-10 17:02:56,187 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0 from training. Duration: 24.32 +2023-05-10 17:02:59,048 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-276745-0093-13116-0_sp0.9 from training. Duration: 21.061125 +2023-05-10 17:03:00,403 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0024-15855-0_sp0.9 from training. Duration: 20.32225 +2023-05-10 17:03:01,105 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0_sp1.1 from training. Duration: 0.7545625 +2023-05-10 17:03:01,927 WARNING [train.py:1182] (1/2) Exclude cut with ID 4295-39940-0007-92567-0_sp0.9 from training. Duration: 23.9333125 +2023-05-10 17:03:03,612 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0_sp1.1 from training. Duration: 20.17275 +2023-05-10 17:03:03,899 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0_sp1.1 from training. Duration: 20.436375 +2023-05-10 17:03:08,635 WARNING [train.py:1182] (1/2) Exclude cut with ID 4234-40345-0022-142709-0_sp0.9 from training. Duration: 23.1055625 +2023-05-10 17:03:08,732 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0_sp1.1 from training. Duration: 23.5 +2023-05-10 17:03:09,284 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0_sp0.9 from training. Duration: 26.62775 +2023-05-10 17:03:09,888 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0018-132285-0 from training. Duration: 21.105 +2023-05-10 17:03:10,127 WARNING [train.py:1182] (1/2) Exclude cut with ID 4511-76322-0006-80011-0_sp0.9 from training. Duration: 24.411125 +2023-05-10 17:03:11,497 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0_sp1.1 from training. Duration: 21.263625 +2023-05-10 17:03:13,758 WARNING [train.py:1182] (1/2) Exclude cut with ID 4234-40345-0022-142709-0 from training. Duration: 20.795 +2023-05-10 17:03:14,253 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0 from training. Duration: 24.76 +2023-05-10 17:03:14,270 WARNING [train.py:1182] (1/2) Exclude cut with ID 3867-173237-0077-144769-0_sp0.9 from training. Duration: 22.25 +2023-05-10 17:03:15,535 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0_sp1.1 from training. Duration: 20.5045625 +2023-05-10 17:03:25,184 INFO [train.py:1357] (1/2) Maximum memory allocated so far is 17020MB +2023-05-10 17:03:28,163 INFO [train.py:1357] (1/2) Maximum memory allocated so far is 17962MB +2023-05-10 17:03:31,362 INFO [train.py:1357] (1/2) Maximum memory allocated so far is 17962MB +2023-05-10 17:03:34,598 INFO [train.py:1357] (1/2) Maximum memory allocated so far is 17962MB +2023-05-10 17:03:37,296 INFO [scaling.py:969] (1/2) Whitening: name=None, num_groups=1, num_channels=256, metric=8.43 vs. limit=7.5 +2023-05-10 17:03:37,760 INFO [train.py:1357] (1/2) Maximum memory allocated so far is 17962MB +2023-05-10 17:03:40,852 INFO [train.py:1357] (1/2) Maximum memory allocated so far is 17962MB +2023-05-10 17:03:40,878 INFO [train.py:1238] (1/2) Loading grad scaler state dict +2023-05-10 17:03:55,718 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp1.1 from training. Duration: 22.2954375 +2023-05-10 17:04:01,305 INFO [train.py:1021] (1/2) Epoch 32, batch 0, loss[loss=0.167, simple_loss=0.2643, pruned_loss=0.03489, over 37065.00 frames. ], tot_loss[loss=0.167, simple_loss=0.2643, pruned_loss=0.03489, over 37065.00 frames. ], batch size: 103, lr: 3.45e-03, grad_scale: 32.0 +2023-05-10 17:04:01,305 INFO [train.py:1048] (1/2) Computing validation loss +2023-05-10 17:04:12,425 INFO [train.py:1057] (1/2) Epoch 32, validation: loss=0.1529, simple_loss=0.2541, pruned_loss=0.02587, over 944034.00 frames. +2023-05-10 17:04:12,426 INFO [train.py:1058] (1/2) Maximum memory allocated so far is 17962MB +2023-05-10 17:04:34,124 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer1.prob, batch_count=563640.0, ans=0.125 +2023-05-10 17:04:34,197 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff3_skip_rate, batch_count=563640.0, ans=0.0 +2023-05-10 17:04:35,603 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass_mid.scale_min, batch_count=563640.0, ans=0.2 +2023-05-10 17:04:37,172 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=563640.0, ans=0.125 +2023-05-10 17:05:00,771 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer1.prob, batch_count=563740.0, ans=0.125 +2023-05-10 17:05:03,822 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=563740.0, ans=0.125 +2023-05-10 17:05:06,567 WARNING [train.py:1182] (1/2) Exclude cut with ID 298-126791-0067-24026-0_sp0.9 from training. Duration: 21.438875 +2023-05-10 17:05:08,989 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=11.27 vs. limit=15.0 +2023-05-10 17:05:12,700 WARNING [train.py:1182] (1/2) Exclude cut with ID 5652-39938-0025-23684-0_sp0.9 from training. Duration: 22.2055625 +2023-05-10 17:05:29,308 INFO [train.py:1021] (1/2) Epoch 32, batch 50, loss[loss=0.1841, simple_loss=0.2765, pruned_loss=0.04586, over 35881.00 frames. ], tot_loss[loss=0.1682, simple_loss=0.2614, pruned_loss=0.03751, over 1629616.15 frames. ], batch size: 133, lr: 3.45e-03, grad_scale: 32.0 +2023-05-10 17:05:39,711 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.skip_rate, batch_count=563840.0, ans=0.09899494936611666 +2023-05-10 17:05:53,974 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.483e+02 3.142e+02 3.708e+02 4.211e+02 6.063e+02, threshold=7.417e+02, percent-clipped=0.0 +2023-05-10 17:06:09,954 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=563940.0, ans=0.1 +2023-05-10 17:06:30,304 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=4.20 vs. limit=10.0 +2023-05-10 17:06:48,912 INFO [train.py:1021] (1/2) Epoch 32, batch 100, loss[loss=0.165, simple_loss=0.2613, pruned_loss=0.03436, over 36908.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.257, pruned_loss=0.03578, over 2893394.69 frames. ], batch size: 105, lr: 3.45e-03, grad_scale: 32.0 +2023-05-10 17:07:18,062 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.attention_skip_rate, batch_count=564190.0, ans=0.0 +2023-05-10 17:07:49,822 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=256, metric=8.83 vs. limit=22.5 +2023-05-10 17:08:05,325 INFO [train.py:1021] (1/2) Epoch 32, batch 150, loss[loss=0.1627, simple_loss=0.2612, pruned_loss=0.0321, over 36931.00 frames. ], tot_loss[loss=0.1635, simple_loss=0.2555, pruned_loss=0.03574, over 3856214.11 frames. ], batch size: 108, lr: 3.45e-03, grad_scale: 32.0 +2023-05-10 17:08:05,762 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.const_attention_rate, batch_count=564340.0, ans=0.025 +2023-05-10 17:08:08,801 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=564340.0, ans=0.125 +2023-05-10 17:08:17,703 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass_mid.scale_min, batch_count=564340.0, ans=0.2 +2023-05-10 17:08:27,924 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.497e+02 2.908e+02 3.365e+02 3.948e+02 5.910e+02, threshold=6.730e+02, percent-clipped=0.0 +2023-05-10 17:08:28,398 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.2734, 3.7081, 3.5058, 3.7123, 3.1823, 2.8928, 3.3089, 2.8586], + device='cuda:1') +2023-05-10 17:08:29,689 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0 from training. Duration: 24.525 +2023-05-10 17:08:47,981 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=564440.0, ans=0.1 +2023-05-10 17:09:07,273 WARNING [train.py:1182] (1/2) Exclude cut with ID 3699-47246-0007-3408-0_sp0.9 from training. Duration: 20.26675 +2023-05-10 17:09:08,961 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.const_attention_rate, batch_count=564540.0, ans=0.025 +2023-05-10 17:09:22,286 INFO [train.py:1021] (1/2) Epoch 32, batch 200, loss[loss=0.1649, simple_loss=0.2572, pruned_loss=0.03626, over 37167.00 frames. ], tot_loss[loss=0.1631, simple_loss=0.2554, pruned_loss=0.03536, over 4607264.89 frames. ], batch size: 102, lr: 3.45e-03, grad_scale: 32.0 +2023-05-10 17:09:22,418 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp0.9 from training. Duration: 27.25 +2023-05-10 17:09:27,798 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.2204, 4.5768, 3.4289, 3.4532], device='cuda:1') +2023-05-10 17:10:25,023 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=564790.0, ans=0.1 +2023-05-10 17:10:40,566 INFO [train.py:1021] (1/2) Epoch 32, batch 250, loss[loss=0.178, simple_loss=0.2708, pruned_loss=0.0426, over 36763.00 frames. ], tot_loss[loss=0.1621, simple_loss=0.2543, pruned_loss=0.03492, over 5196387.92 frames. ], batch size: 122, lr: 3.45e-03, grad_scale: 32.0 +2023-05-10 17:10:45,190 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64292-0017-15984-0 from training. Duration: 21.68 +2023-05-10 17:10:55,992 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0007-59342-0 from training. Duration: 21.6300625 +2023-05-10 17:10:57,657 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff3_skip_rate, batch_count=564890.0, ans=0.0 +2023-05-10 17:11:03,383 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.318e+02 2.945e+02 3.312e+02 3.958e+02 5.852e+02, threshold=6.624e+02, percent-clipped=0.0 +2023-05-10 17:11:20,950 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0007-59342-0_sp0.9 from training. Duration: 24.033375 +2023-05-10 17:11:23,141 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=3.05 vs. limit=15.0 +2023-05-10 17:11:32,441 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.const_attention_rate, batch_count=564990.0, ans=0.025 +2023-05-10 17:11:33,987 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=564990.0, ans=0.1 +2023-05-10 17:11:49,249 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.0.self_attn_weights, attn_weights_entropy = tensor([6.3572, 5.6537, 5.5269, 6.1024], device='cuda:1') +2023-05-10 17:11:58,237 INFO [train.py:1021] (1/2) Epoch 32, batch 300, loss[loss=0.1419, simple_loss=0.2266, pruned_loss=0.0286, over 37088.00 frames. ], tot_loss[loss=0.1609, simple_loss=0.253, pruned_loss=0.03438, over 5658710.03 frames. ], batch size: 88, lr: 3.45e-03, grad_scale: 32.0 +2023-05-10 17:12:07,432 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=565090.0, ans=0.125 +2023-05-10 17:12:13,067 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=565140.0, ans=0.125 +2023-05-10 17:12:13,187 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.4998, 3.3559, 3.1593, 4.0381, 2.7212, 3.4988, 4.0692, 3.5583], + device='cuda:1') +2023-05-10 17:12:16,939 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer1.prob, batch_count=565140.0, ans=0.125 +2023-05-10 17:12:24,200 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0 from training. Duration: 22.905 +2023-05-10 17:12:25,623 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0_sp1.1 from training. Duration: 23.4318125 +2023-05-10 17:12:35,661 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.skip_rate, batch_count=565190.0, ans=0.09899494936611666 +2023-05-10 17:12:43,104 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.5972, 3.3707, 3.2071, 4.0480, 2.0421, 3.5253, 4.0572, 3.5349], + device='cuda:1') +2023-05-10 17:13:14,671 INFO [train.py:1021] (1/2) Epoch 32, batch 350, loss[loss=0.1608, simple_loss=0.2525, pruned_loss=0.03454, over 37108.00 frames. ], tot_loss[loss=0.1626, simple_loss=0.2552, pruned_loss=0.03503, over 5995835.44 frames. ], batch size: 98, lr: 3.45e-03, grad_scale: 32.0 +2023-05-10 17:13:33,608 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass_mid.scale_min, batch_count=565390.0, ans=0.2 +2023-05-10 17:13:37,558 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.509e+02 2.940e+02 3.395e+02 4.026e+02 5.839e+02, threshold=6.790e+02, percent-clipped=0.0 +2023-05-10 17:13:38,018 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=565390.0, ans=0.0 +2023-05-10 17:13:50,649 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=9.42 vs. limit=15.0 +2023-05-10 17:13:56,067 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=565440.0, ans=0.125 +2023-05-10 17:14:01,856 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.nonlin_attention.balancer.min_positive, batch_count=565490.0, ans=0.05 +2023-05-10 17:14:20,955 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([3.0533, 4.1086, 4.6051, 4.8566], device='cuda:1') +2023-05-10 17:14:31,050 INFO [train.py:1021] (1/2) Epoch 32, batch 400, loss[loss=0.1713, simple_loss=0.271, pruned_loss=0.03578, over 36856.00 frames. ], tot_loss[loss=0.1628, simple_loss=0.2554, pruned_loss=0.03511, over 6275887.28 frames. ], batch size: 111, lr: 3.45e-03, grad_scale: 32.0 +2023-05-10 17:14:32,548 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0_sp1.1 from training. Duration: 20.82275 +2023-05-10 17:14:34,063 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0_sp0.9 from training. Duration: 25.45 +2023-05-10 17:15:05,783 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=13.89 vs. limit=22.5 +2023-05-10 17:15:19,687 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=565740.0, ans=0.125 +2023-05-10 17:15:33,261 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=565790.0, ans=0.1 +2023-05-10 17:15:37,608 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0 from training. Duration: 25.775 +2023-05-10 17:15:48,117 INFO [train.py:1021] (1/2) Epoch 32, batch 450, loss[loss=0.161, simple_loss=0.2558, pruned_loss=0.03311, over 36908.00 frames. ], tot_loss[loss=0.1635, simple_loss=0.2561, pruned_loss=0.03543, over 6499867.91 frames. ], batch size: 100, lr: 3.45e-03, grad_scale: 32.0 +2023-05-10 17:15:57,242 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0071-62375-0_sp0.9 from training. Duration: 22.25 +2023-05-10 17:16:10,835 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.412e+02 2.980e+02 3.396e+02 4.577e+02 8.079e+02, threshold=6.792e+02, percent-clipped=2.0 +2023-05-10 17:16:27,977 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0 from training. Duration: 26.205 +2023-05-10 17:16:44,837 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0_sp0.9 from training. Duration: 30.1555625 +2023-05-10 17:16:50,802 WARNING [train.py:1182] (1/2) Exclude cut with ID 1265-135635-0050-6781-0_sp0.9 from training. Duration: 21.8333125 +2023-05-10 17:17:00,368 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0_sp1.1 from training. Duration: 20.6545625 +2023-05-10 17:17:04,814 INFO [train.py:1021] (1/2) Epoch 32, batch 500, loss[loss=0.2009, simple_loss=0.2828, pruned_loss=0.0595, over 24649.00 frames. ], tot_loss[loss=0.1641, simple_loss=0.2568, pruned_loss=0.03567, over 6650187.16 frames. ], batch size: 233, lr: 3.44e-03, grad_scale: 32.0 +2023-05-10 17:17:13,356 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer_ff2.min_abs, batch_count=566090.0, ans=0.1 +2023-05-10 17:17:17,800 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.balancer.max_positive, batch_count=566090.0, ans=0.95 +2023-05-10 17:17:22,223 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=566140.0, ans=0.0 +2023-05-10 17:17:30,024 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 17:17:43,525 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward3.out_whiten.whitening_limit, batch_count=566190.0, ans=15.0 +2023-05-10 17:17:44,550 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0045-39920-0_sp0.9 from training. Duration: 20.52225 +2023-05-10 17:17:57,317 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff2_skip_rate, batch_count=566240.0, ans=0.0 +2023-05-10 17:18:06,691 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0_sp0.9 from training. Duration: 29.1166875 +2023-05-10 17:18:10,559 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=4.19 vs. limit=10.0 +2023-05-10 17:18:21,897 INFO [train.py:1021] (1/2) Epoch 32, batch 550, loss[loss=0.2038, simple_loss=0.2892, pruned_loss=0.05925, over 25202.00 frames. ], tot_loss[loss=0.1649, simple_loss=0.2577, pruned_loss=0.03605, over 6784343.86 frames. ], batch size: 233, lr: 3.44e-03, grad_scale: 32.0 +2023-05-10 17:18:45,371 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.543e+02 3.234e+02 3.665e+02 4.549e+02 6.546e+02, threshold=7.330e+02, percent-clipped=0.0 +2023-05-10 17:18:47,419 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.scale_min, batch_count=566390.0, ans=0.2 +2023-05-10 17:18:50,187 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.balancer2.prob, batch_count=566390.0, ans=0.125 +2023-05-10 17:19:03,388 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.conv_module1.whiten, num_groups=1, num_channels=192, metric=10.85 vs. limit=15.0 +2023-05-10 17:19:06,992 WARNING [train.py:1182] (1/2) Exclude cut with ID 543-133211-0007-59831-0_sp0.9 from training. Duration: 21.388875 +2023-05-10 17:19:19,257 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward2.hidden_balancer.prob, batch_count=566490.0, ans=0.125 +2023-05-10 17:19:39,115 INFO [train.py:1021] (1/2) Epoch 32, batch 600, loss[loss=0.1618, simple_loss=0.2556, pruned_loss=0.03395, over 37147.00 frames. ], tot_loss[loss=0.1651, simple_loss=0.2578, pruned_loss=0.03622, over 6871275.05 frames. ], batch size: 98, lr: 3.44e-03, grad_scale: 32.0 +2023-05-10 17:19:43,663 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0 from training. Duration: 22.72 +2023-05-10 17:19:45,129 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0031-94921-0_sp0.9 from training. Duration: 22.7444375 +2023-05-10 17:19:53,419 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer2.prob, batch_count=566640.0, ans=0.125 +2023-05-10 17:20:28,159 WARNING [train.py:1182] (1/2) Exclude cut with ID 4133-6541-0027-40495-0_sp1.1 from training. Duration: 0.9681875 +2023-05-10 17:20:31,695 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62851-0022-91297-0_sp0.9 from training. Duration: 22.3166875 +2023-05-10 17:20:37,673 WARNING [train.py:1182] (1/2) Exclude cut with ID 543-133212-0015-59917-0_sp0.9 from training. Duration: 21.8166875 +2023-05-10 17:20:42,939 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.ff3_skip_rate, batch_count=566790.0, ans=0.0 +2023-05-10 17:20:56,029 INFO [train.py:1021] (1/2) Epoch 32, batch 650, loss[loss=0.146, simple_loss=0.2337, pruned_loss=0.02915, over 35876.00 frames. ], tot_loss[loss=0.1653, simple_loss=0.2578, pruned_loss=0.03634, over 6916908.96 frames. ], batch size: 79, lr: 3.44e-03, grad_scale: 32.0 +2023-05-10 17:20:57,803 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff3_skip_rate, batch_count=566840.0, ans=0.0 +2023-05-10 17:21:18,463 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.466e+02 2.963e+02 3.375e+02 3.962e+02 6.245e+02, threshold=6.751e+02, percent-clipped=0.0 +2023-05-10 17:21:18,826 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.attention_skip_rate, batch_count=566890.0, ans=0.0 +2023-05-10 17:21:30,128 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3712, 4.1532, 3.8450, 4.1410, 3.4801, 3.1650, 3.6109, 3.1512], + device='cuda:1') +2023-05-10 17:21:35,384 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer2.prob, batch_count=566940.0, ans=0.125 +2023-05-10 17:21:45,987 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=566990.0, ans=0.1 +2023-05-10 17:21:59,429 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=567040.0, ans=0.1 +2023-05-10 17:22:02,585 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.const_attention_rate, batch_count=567040.0, ans=0.025 +2023-05-10 17:22:12,921 INFO [train.py:1021] (1/2) Epoch 32, batch 700, loss[loss=0.1562, simple_loss=0.2428, pruned_loss=0.03486, over 37080.00 frames. ], tot_loss[loss=0.1659, simple_loss=0.2587, pruned_loss=0.03659, over 6960034.37 frames. ], batch size: 94, lr: 3.44e-03, grad_scale: 32.0 +2023-05-10 17:22:22,609 WARNING [train.py:1182] (1/2) Exclude cut with ID 4957-30119-0041-23990-0_sp0.9 from training. Duration: 20.22775 +2023-05-10 17:22:29,880 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=4.30 vs. limit=15.0 +2023-05-10 17:22:50,712 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=567190.0, ans=0.0 +2023-05-10 17:22:55,264 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([4.9426, 4.2355, 2.9849, 2.8880], device='cuda:1') +2023-05-10 17:23:02,755 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=567240.0, ans=0.1 +2023-05-10 17:23:04,393 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_skip_rate, batch_count=567240.0, ans=0.0 +2023-05-10 17:23:10,160 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0_sp1.1 from training. Duration: 24.67275 +2023-05-10 17:23:30,842 INFO [train.py:1021] (1/2) Epoch 32, batch 750, loss[loss=0.1721, simple_loss=0.2678, pruned_loss=0.03823, over 37080.00 frames. ], tot_loss[loss=0.1662, simple_loss=0.2591, pruned_loss=0.03668, over 7005841.04 frames. ], batch size: 103, lr: 3.44e-03, grad_scale: 32.0 +2023-05-10 17:23:38,344 WARNING [train.py:1182] (1/2) Exclude cut with ID 3082-165428-0081-50734-0_sp0.9 from training. Duration: 21.8055625 +2023-05-10 17:23:47,683 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([1.9811, 4.1087, 3.8022, 4.1259, 3.4726, 3.2083, 3.6123, 3.0985], + device='cuda:1') +2023-05-10 17:23:53,189 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.571e+02 3.168e+02 3.531e+02 4.150e+02 6.097e+02, threshold=7.062e+02, percent-clipped=0.0 +2023-05-10 17:23:53,655 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer2.prob, batch_count=567390.0, ans=0.125 +2023-05-10 17:24:01,128 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=567440.0, ans=0.0 +2023-05-10 17:24:10,890 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=567440.0, ans=0.125 +2023-05-10 17:24:16,670 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0054-76830-0_sp0.9 from training. Duration: 22.6666875 +2023-05-10 17:24:25,087 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass_mid.scale_min, batch_count=567490.0, ans=0.2 +2023-05-10 17:24:28,037 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.attention_skip_rate, batch_count=567490.0, ans=0.0 +2023-05-10 17:24:47,729 INFO [train.py:1021] (1/2) Epoch 32, batch 800, loss[loss=0.1676, simple_loss=0.262, pruned_loss=0.03664, over 37142.00 frames. ], tot_loss[loss=0.1667, simple_loss=0.2594, pruned_loss=0.03694, over 7026554.14 frames. ], batch size: 98, lr: 3.44e-03, grad_scale: 32.0 +2023-05-10 17:25:17,266 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer1.prob, batch_count=567690.0, ans=0.125 +2023-05-10 17:25:22,855 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=9.13 vs. limit=22.5 +2023-05-10 17:25:26,843 WARNING [train.py:1182] (1/2) Exclude cut with ID 2411-132532-0017-82279-0_sp1.1 from training. Duration: 0.9681875 +2023-05-10 17:25:28,910 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=2.69 vs. limit=12.0 +2023-05-10 17:25:33,007 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer2.prob, batch_count=567740.0, ans=0.125 +2023-05-10 17:25:52,730 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0 from training. Duration: 22.485 +2023-05-10 17:26:04,568 INFO [train.py:1021] (1/2) Epoch 32, batch 850, loss[loss=0.176, simple_loss=0.2724, pruned_loss=0.03977, over 32127.00 frames. ], tot_loss[loss=0.1669, simple_loss=0.26, pruned_loss=0.03691, over 7042881.64 frames. ], batch size: 170, lr: 3.44e-03, grad_scale: 32.0 +2023-05-10 17:26:27,556 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.535e+02 3.001e+02 3.367e+02 4.253e+02 6.227e+02, threshold=6.734e+02, percent-clipped=0.0 +2023-05-10 17:26:33,586 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0_sp1.1 from training. Duration: 23.82275 +2023-05-10 17:26:43,046 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=567940.0, ans=0.125 +2023-05-10 17:26:49,311 WARNING [train.py:1182] (1/2) Exclude cut with ID 4860-13185-0032-76709-0 from training. Duration: 20.77 +2023-05-10 17:26:57,264 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64292-0017-15984-0_sp0.9 from training. Duration: 24.088875 +2023-05-10 17:27:15,120 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.4540, 3.4616, 2.3674, 2.5270], device='cuda:1') +2023-05-10 17:27:21,990 INFO [train.py:1021] (1/2) Epoch 32, batch 900, loss[loss=0.1894, simple_loss=0.272, pruned_loss=0.05344, over 24349.00 frames. ], tot_loss[loss=0.1663, simple_loss=0.2594, pruned_loss=0.03664, over 7056759.40 frames. ], batch size: 234, lr: 3.44e-03, grad_scale: 32.0 +2023-05-10 17:27:25,458 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.prob, batch_count=568090.0, ans=0.125 +2023-05-10 17:27:28,072 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0_sp1.1 from training. Duration: 20.4409375 +2023-05-10 17:28:18,097 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.prob, batch_count=568240.0, ans=0.125 +2023-05-10 17:28:39,597 INFO [train.py:1021] (1/2) Epoch 32, batch 950, loss[loss=0.1834, simple_loss=0.2784, pruned_loss=0.04419, over 37083.00 frames. ], tot_loss[loss=0.166, simple_loss=0.2592, pruned_loss=0.03641, over 7099752.18 frames. ], batch size: 110, lr: 3.44e-03, grad_scale: 32.0 +2023-05-10 17:28:43,131 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([4.4030, 3.7564, 4.0061, 3.8090], device='cuda:1') +2023-05-10 17:28:50,448 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0062-62366-0_sp0.9 from training. Duration: 22.511125 +2023-05-10 17:28:50,485 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0031-39906-0 from training. Duration: 20.675 +2023-05-10 17:28:50,719 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=568340.0, ans=0.0 +2023-05-10 17:28:59,900 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=568390.0, ans=0.1 +2023-05-10 17:29:03,984 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.545e+02 3.242e+02 3.648e+02 4.303e+02 7.127e+02, threshold=7.297e+02, percent-clipped=1.0 +2023-05-10 17:29:23,282 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=568440.0, ans=0.0 +2023-05-10 17:29:42,051 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.9977, 5.1999, 5.2548, 5.8314], device='cuda:1') +2023-05-10 17:29:51,069 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.1.self_attn_weights, attn_weights_entropy = tensor([4.9642, 4.3997, 4.4483, 4.8720], device='cuda:1') +2023-05-10 17:29:53,903 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder_embed.convnext.out_balancer.prob, batch_count=568540.0, ans=0.125 +2023-05-10 17:29:56,824 INFO [train.py:1021] (1/2) Epoch 32, batch 1000, loss[loss=0.1461, simple_loss=0.2288, pruned_loss=0.03174, over 35800.00 frames. ], tot_loss[loss=0.1657, simple_loss=0.2588, pruned_loss=0.0363, over 7117103.09 frames. ], batch size: 79, lr: 3.44e-03, grad_scale: 16.0 +2023-05-10 17:30:00,245 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=568590.0, ans=0.1 +2023-05-10 17:30:19,437 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=6.67 vs. limit=15.0 +2023-05-10 17:30:33,091 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass_mid.scale_min, batch_count=568690.0, ans=0.2 +2023-05-10 17:30:35,812 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0_sp0.9 from training. Duration: 24.9833125 +2023-05-10 17:30:37,681 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.7042, 3.5697, 3.9500, 3.6103], device='cuda:1') +2023-05-10 17:30:39,647 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.conv_module2.whiten, num_groups=1, num_channels=192, metric=5.02 vs. limit=15.0 +2023-05-10 17:31:07,931 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0 from training. Duration: 27.14 +2023-05-10 17:31:13,844 INFO [train.py:1021] (1/2) Epoch 32, batch 1050, loss[loss=0.1631, simple_loss=0.2566, pruned_loss=0.03475, over 37020.00 frames. ], tot_loss[loss=0.1654, simple_loss=0.2584, pruned_loss=0.03617, over 7165341.29 frames. ], batch size: 99, lr: 3.44e-03, grad_scale: 16.0 +2023-05-10 17:31:25,095 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0 from training. Duration: 22.44 +2023-05-10 17:31:35,885 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=568890.0, ans=0.0 +2023-05-10 17:31:38,603 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.450e+02 3.122e+02 3.761e+02 4.408e+02 8.134e+02, threshold=7.523e+02, percent-clipped=1.0 +2023-05-10 17:31:39,139 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.0727, 4.4124, 3.3851, 3.3849], device='cuda:1') +2023-05-10 17:32:22,532 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=569040.0, ans=0.125 +2023-05-10 17:32:26,962 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.attention_skip_rate, batch_count=569040.0, ans=0.0 +2023-05-10 17:32:31,204 INFO [train.py:1021] (1/2) Epoch 32, batch 1100, loss[loss=0.1633, simple_loss=0.2519, pruned_loss=0.03736, over 36854.00 frames. ], tot_loss[loss=0.1651, simple_loss=0.2582, pruned_loss=0.036, over 7203053.57 frames. ], batch size: 96, lr: 3.44e-03, grad_scale: 16.0 +2023-05-10 17:32:39,387 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.whiten.whitening_limit, batch_count=569090.0, ans=15.0 +2023-05-10 17:32:45,261 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0060-62364-0_sp0.9 from training. Duration: 21.361125 +2023-05-10 17:32:51,458 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.6517, 3.8933, 4.2872, 3.9214], device='cuda:1') +2023-05-10 17:32:52,630 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0_sp1.1 from training. Duration: 27.0318125 +2023-05-10 17:32:52,856 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.const_attention_rate, batch_count=569140.0, ans=0.025 +2023-05-10 17:33:01,601 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0_sp0.9 from training. Duration: 28.638875 +2023-05-10 17:33:01,872 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer2.prob, batch_count=569190.0, ans=0.125 +2023-05-10 17:33:20,027 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0054-76830-0 from training. Duration: 20.4 +2023-05-10 17:33:27,778 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer2.prob, batch_count=569240.0, ans=0.125 +2023-05-10 17:33:37,267 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=4.46 vs. limit=15.0 +2023-05-10 17:33:47,839 INFO [train.py:1021] (1/2) Epoch 32, batch 1150, loss[loss=0.1581, simple_loss=0.2507, pruned_loss=0.03274, over 37132.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.2571, pruned_loss=0.03574, over 7206734.56 frames. ], batch size: 98, lr: 3.43e-03, grad_scale: 16.0 +2023-05-10 17:33:54,235 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3929, 4.0050, 2.2909, 2.4935], device='cuda:1') +2023-05-10 17:33:55,363 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0071-62375-0 from training. Duration: 20.025 +2023-05-10 17:33:55,387 WARNING [train.py:1182] (1/2) Exclude cut with ID 2364-131735-0112-64612-0_sp0.9 from training. Duration: 20.488875 +2023-05-10 17:33:55,717 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.2871, 4.0722, 3.7152, 4.0689, 3.3279, 3.1003, 3.5354, 3.0189], + device='cuda:1') +2023-05-10 17:34:02,767 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0 from training. Duration: 29.735 +2023-05-10 17:34:12,455 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.465e+02 2.987e+02 3.683e+02 4.355e+02 7.253e+02, threshold=7.367e+02, percent-clipped=0.0 +2023-05-10 17:34:18,120 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=18.25 vs. limit=22.5 +2023-05-10 17:34:46,892 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=569490.0, ans=0.1 +2023-05-10 17:34:48,409 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer2.prob, batch_count=569540.0, ans=0.125 +2023-05-10 17:35:05,182 INFO [train.py:1021] (1/2) Epoch 32, batch 1200, loss[loss=0.1804, simple_loss=0.2773, pruned_loss=0.04173, over 35774.00 frames. ], tot_loss[loss=0.1645, simple_loss=0.2574, pruned_loss=0.03579, over 7213113.95 frames. ], batch size: 133, lr: 3.43e-03, grad_scale: 32.0 +2023-05-10 17:35:11,735 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=569590.0, ans=0.1 +2023-05-10 17:35:22,267 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.const_attention_rate, batch_count=569640.0, ans=0.025 +2023-05-10 17:35:26,726 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer2.min_abs, batch_count=569640.0, ans=0.5 +2023-05-10 17:35:26,901 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.4783, 3.7469, 4.2076, 3.8406], device='cuda:1') +2023-05-10 17:35:31,041 WARNING [train.py:1182] (1/2) Exclude cut with ID 7276-92427-0014-12983-0_sp0.9 from training. Duration: 21.3055625 +2023-05-10 17:35:32,443 WARNING [train.py:1182] (1/2) Exclude cut with ID 1025-75365-0008-79168-0_sp0.9 from training. Duration: 22.0666875 +2023-05-10 17:35:50,365 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=3.64 vs. limit=12.0 +2023-05-10 17:35:51,470 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.balancer.max_positive, batch_count=569740.0, ans=0.95 +2023-05-10 17:35:57,275 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer2.prob, batch_count=569740.0, ans=0.125 +2023-05-10 17:36:13,355 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.skip_rate, batch_count=569790.0, ans=0.04949747468305833 +2023-05-10 17:36:16,312 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.prob, batch_count=569790.0, ans=0.125 +2023-05-10 17:36:19,463 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([4.8844, 4.1385, 4.3881, 4.4215], device='cuda:1') +2023-05-10 17:36:22,121 INFO [train.py:1021] (1/2) Epoch 32, batch 1250, loss[loss=0.1509, simple_loss=0.2371, pruned_loss=0.03232, over 35374.00 frames. ], tot_loss[loss=0.1649, simple_loss=0.2578, pruned_loss=0.03596, over 7222819.05 frames. ], batch size: 78, lr: 3.43e-03, grad_scale: 32.0 +2023-05-10 17:36:35,034 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.6820, 4.9922, 5.1523, 4.8181], device='cuda:1') +2023-05-10 17:36:46,561 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.503e+02 3.149e+02 3.582e+02 4.452e+02 7.782e+02, threshold=7.165e+02, percent-clipped=2.0 +2023-05-10 17:36:51,553 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.4222, 3.9686, 3.6666, 3.9675, 3.2973, 2.9804, 3.4465, 2.9141], + device='cuda:1') +2023-05-10 17:37:27,257 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0062-62366-0 from training. Duration: 20.26 +2023-05-10 17:37:39,428 INFO [train.py:1021] (1/2) Epoch 32, batch 1300, loss[loss=0.1641, simple_loss=0.2606, pruned_loss=0.03381, over 37094.00 frames. ], tot_loss[loss=0.1646, simple_loss=0.2577, pruned_loss=0.03578, over 7232558.46 frames. ], batch size: 110, lr: 3.43e-03, grad_scale: 16.0 +2023-05-10 17:37:42,487 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0030-9324-0_sp0.9 from training. Duration: 21.3444375 +2023-05-10 17:37:43,299 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=5.91 vs. limit=15.0 +2023-05-10 17:37:44,201 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.const_attention_rate, batch_count=570090.0, ans=0.025 +2023-05-10 17:37:44,324 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass_mid.scale_min, batch_count=570090.0, ans=0.2 +2023-05-10 17:37:55,557 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=570140.0, ans=0.1 +2023-05-10 17:38:18,274 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=2.99 vs. limit=12.0 +2023-05-10 17:38:36,990 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=8.06 vs. limit=22.5 +2023-05-10 17:38:44,503 WARNING [train.py:1182] (1/2) Exclude cut with ID 497-129325-0061-62254-0_sp1.1 from training. Duration: 0.97725 +2023-05-10 17:38:56,717 INFO [train.py:1021] (1/2) Epoch 32, batch 1350, loss[loss=0.1682, simple_loss=0.2651, pruned_loss=0.03571, over 37117.00 frames. ], tot_loss[loss=0.1648, simple_loss=0.258, pruned_loss=0.03581, over 7250207.61 frames. ], batch size: 103, lr: 3.43e-03, grad_scale: 16.0 +2023-05-10 17:39:04,493 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.min_positive, batch_count=570340.0, ans=0.025 +2023-05-10 17:39:22,594 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.495e+02 2.978e+02 3.574e+02 4.367e+02 7.804e+02, threshold=7.147e+02, percent-clipped=2.0 +2023-05-10 17:39:28,877 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0031-39906-0_sp0.9 from training. Duration: 22.97225 +2023-05-10 17:39:29,165 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.1.self_attn_weights, attn_weights_entropy = tensor([6.1564, 5.2920, 5.4592, 6.0213], device='cuda:1') +2023-05-10 17:39:32,264 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff2_skip_rate, batch_count=570440.0, ans=0.0 +2023-05-10 17:39:33,806 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=570440.0, ans=0.1 +2023-05-10 17:39:40,539 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.scale_min, batch_count=570440.0, ans=0.2 +2023-05-10 17:39:50,994 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 17:39:53,999 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=570490.0, ans=0.125 +2023-05-10 17:39:57,787 INFO [scaling.py:969] (1/2) Whitening: name=encoder_embed.convnext.out_whiten, num_groups=1, num_channels=128, metric=4.57 vs. limit=5.0 +2023-05-10 17:39:59,829 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0047-39922-0_sp0.9 from training. Duration: 21.97775 +2023-05-10 17:40:11,727 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=6.41 vs. limit=15.0 +2023-05-10 17:40:13,948 INFO [train.py:1021] (1/2) Epoch 32, batch 1400, loss[loss=0.171, simple_loss=0.2686, pruned_loss=0.03666, over 37132.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.2574, pruned_loss=0.0356, over 7257998.57 frames. ], batch size: 107, lr: 3.43e-03, grad_scale: 16.0 +2023-05-10 17:40:15,433 WARNING [train.py:1182] (1/2) Exclude cut with ID 1112-1043-0006-89194-0_sp0.9 from training. Duration: 21.8333125 +2023-05-10 17:40:15,724 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=570590.0, ans=0.1 +2023-05-10 17:40:25,869 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0031-94921-0 from training. Duration: 20.47 +2023-05-10 17:40:46,207 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=570690.0, ans=0.1 +2023-05-10 17:41:30,894 INFO [train.py:1021] (1/2) Epoch 32, batch 1450, loss[loss=0.172, simple_loss=0.2696, pruned_loss=0.03721, over 37086.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.2575, pruned_loss=0.03555, over 7233900.47 frames. ], batch size: 110, lr: 3.43e-03, grad_scale: 16.0 +2023-05-10 17:41:34,006 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0037-39912-0_sp0.9 from training. Duration: 20.67225 +2023-05-10 17:41:41,839 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 17:41:53,594 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0_sp0.9 from training. Duration: 25.2444375 +2023-05-10 17:41:53,861 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.1.self_attn_weights, attn_weights_entropy = tensor([6.3379, 5.5317, 5.6387, 6.2335], device='cuda:1') +2023-05-10 17:41:54,305 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=256, metric=8.79 vs. limit=22.5 +2023-05-10 17:41:56,629 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.567e+02 3.177e+02 3.993e+02 5.055e+02 7.885e+02, threshold=7.986e+02, percent-clipped=2.0 +2023-05-10 17:42:14,291 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward3.hidden_balancer.prob, batch_count=570940.0, ans=0.125 +2023-05-10 17:42:18,471 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0021-76797-0_sp0.9 from training. Duration: 21.1445 +2023-05-10 17:42:34,692 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer1.prob, batch_count=571040.0, ans=0.125 +2023-05-10 17:42:47,841 INFO [train.py:1021] (1/2) Epoch 32, batch 1500, loss[loss=0.147, simple_loss=0.2317, pruned_loss=0.0311, over 36865.00 frames. ], tot_loss[loss=0.1642, simple_loss=0.2573, pruned_loss=0.0355, over 7217732.04 frames. ], batch size: 84, lr: 3.43e-03, grad_scale: 16.0 +2023-05-10 17:43:14,725 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=8.53 vs. limit=15.0 +2023-05-10 17:43:20,584 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 17:43:39,254 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0_sp0.9 from training. Duration: 33.038875 +2023-05-10 17:43:48,447 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=571290.0, ans=0.0 +2023-05-10 17:43:59,443 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.attention_skip_rate, batch_count=571290.0, ans=0.0 +2023-05-10 17:43:59,446 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=571290.0, ans=0.125 +2023-05-10 17:43:59,467 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.out_combiner.scale_min, batch_count=571290.0, ans=0.2 +2023-05-10 17:44:05,064 INFO [train.py:1021] (1/2) Epoch 32, batch 1550, loss[loss=0.1814, simple_loss=0.2723, pruned_loss=0.04529, over 36762.00 frames. ], tot_loss[loss=0.1641, simple_loss=0.2572, pruned_loss=0.0355, over 7215267.62 frames. ], batch size: 122, lr: 3.43e-03, grad_scale: 16.0 +2023-05-10 17:44:19,311 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64291-0000-16059-0_sp0.9 from training. Duration: 20.0944375 +2023-05-10 17:44:30,979 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.531e+02 2.940e+02 3.301e+02 3.891e+02 5.385e+02, threshold=6.602e+02, percent-clipped=0.0 +2023-05-10 17:44:35,700 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0_sp1.1 from training. Duration: 20.4 +2023-05-10 17:44:43,284 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62851-0022-91297-0 from training. Duration: 20.085 +2023-05-10 17:44:54,511 WARNING [train.py:1182] (1/2) Exclude cut with ID 4860-13185-0032-76709-0_sp0.9 from training. Duration: 23.07775 +2023-05-10 17:45:22,065 INFO [train.py:1021] (1/2) Epoch 32, batch 1600, loss[loss=0.1571, simple_loss=0.2574, pruned_loss=0.02833, over 37005.00 frames. ], tot_loss[loss=0.1644, simple_loss=0.2574, pruned_loss=0.03568, over 7215813.04 frames. ], batch size: 104, lr: 3.43e-03, grad_scale: 32.0 +2023-05-10 17:45:34,469 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=571590.0, ans=0.0 +2023-05-10 17:45:40,550 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.8460, 3.5927, 3.3551, 4.2839, 2.6348, 3.6419, 4.3256, 3.7241], + device='cuda:1') +2023-05-10 17:45:43,256 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0_sp0.9 from training. Duration: 24.9333125 +2023-05-10 17:46:26,727 WARNING [train.py:1182] (1/2) Exclude cut with ID 5118-111612-0016-124680-0_sp0.9 from training. Duration: 20.388875 +2023-05-10 17:46:32,731 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0_sp1.1 from training. Duration: 20.3590625 +2023-05-10 17:46:39,357 INFO [train.py:1021] (1/2) Epoch 32, batch 1650, loss[loss=0.1841, simple_loss=0.2779, pruned_loss=0.04508, over 36732.00 frames. ], tot_loss[loss=0.1641, simple_loss=0.2569, pruned_loss=0.03563, over 7215521.45 frames. ], batch size: 118, lr: 3.43e-03, grad_scale: 32.0 +2023-05-10 17:46:48,857 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer2.min_abs, batch_count=571840.0, ans=0.5 +2023-05-10 17:47:05,613 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.640e+02 3.061e+02 3.373e+02 3.775e+02 6.783e+02, threshold=6.745e+02, percent-clipped=1.0 +2023-05-10 17:47:07,495 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer2.prob, batch_count=571890.0, ans=0.125 +2023-05-10 17:47:25,871 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer2.prob, batch_count=571990.0, ans=0.125 +2023-05-10 17:47:31,983 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff2_skip_rate, batch_count=571990.0, ans=0.0 +2023-05-10 17:47:48,839 WARNING [train.py:1182] (1/2) Exclude cut with ID 3557-8342-0013-54691-0_sp1.1 from training. Duration: 0.836375 +2023-05-10 17:47:51,990 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer2.prob, batch_count=572040.0, ans=0.125 +2023-05-10 17:47:56,718 INFO [train.py:1021] (1/2) Epoch 32, batch 1700, loss[loss=0.1604, simple_loss=0.2493, pruned_loss=0.03577, over 37123.00 frames. ], tot_loss[loss=0.1646, simple_loss=0.2573, pruned_loss=0.03594, over 7219994.41 frames. ], batch size: 98, lr: 3.43e-03, grad_scale: 32.0 +2023-05-10 17:48:18,083 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.prob, batch_count=572140.0, ans=0.125 +2023-05-10 17:48:33,375 WARNING [train.py:1182] (1/2) Exclude cut with ID 8565-290391-0049-67394-0_sp0.9 from training. Duration: 21.3166875 +2023-05-10 17:48:52,118 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=572240.0, ans=0.0 +2023-05-10 17:48:56,823 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.2293, 3.9691, 3.6941, 3.9896, 3.3228, 3.0757, 3.4787, 2.9872], + device='cuda:1') +2023-05-10 17:49:02,665 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_skip_rate, batch_count=572290.0, ans=0.0 +2023-05-10 17:49:05,392 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0029-104863-0_sp0.9 from training. Duration: 22.1055625 +2023-05-10 17:49:12,864 INFO [train.py:1021] (1/2) Epoch 32, batch 1750, loss[loss=0.1874, simple_loss=0.2806, pruned_loss=0.04707, over 36777.00 frames. ], tot_loss[loss=0.1662, simple_loss=0.2583, pruned_loss=0.03701, over 7174939.12 frames. ], batch size: 118, lr: 3.43e-03, grad_scale: 32.0 +2023-05-10 17:49:14,798 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=572340.0, ans=0.0 +2023-05-10 17:49:16,036 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0_sp1.1 from training. Duration: 21.77725 +2023-05-10 17:49:31,893 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer2.prob, batch_count=572390.0, ans=0.125 +2023-05-10 17:49:37,721 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0_sp0.9 from training. Duration: 27.8166875 +2023-05-10 17:49:39,082 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.795e+02 3.387e+02 3.734e+02 4.608e+02 7.393e+02, threshold=7.469e+02, percent-clipped=2.0 +2023-05-10 17:49:49,131 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.skip_rate, batch_count=572440.0, ans=0.07 +2023-05-10 17:49:52,106 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.prob, batch_count=572440.0, ans=0.125 +2023-05-10 17:50:03,776 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0_sp1.1 from training. Duration: 22.5090625 +2023-05-10 17:50:09,875 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0 from training. Duration: 25.035 +2023-05-10 17:50:19,665 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder_embed.conv.5.prob, batch_count=572540.0, ans=0.125 +2023-05-10 17:50:21,816 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=7.35 vs. limit=22.5 +2023-05-10 17:50:22,973 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([4.9843, 4.2512, 4.5128, 4.5264], device='cuda:1') +2023-05-10 17:50:24,529 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer1.prob, batch_count=572540.0, ans=0.125 +2023-05-10 17:50:30,099 INFO [train.py:1021] (1/2) Epoch 32, batch 1800, loss[loss=0.1603, simple_loss=0.2444, pruned_loss=0.03807, over 37055.00 frames. ], tot_loss[loss=0.1677, simple_loss=0.2592, pruned_loss=0.03814, over 7152730.99 frames. ], batch size: 94, lr: 3.43e-03, grad_scale: 32.0 +2023-05-10 17:50:30,190 WARNING [train.py:1182] (1/2) Exclude cut with ID 774-127930-0014-10412-0_sp1.1 from training. Duration: 0.95 +2023-05-10 17:50:35,670 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer2.prob, batch_count=572590.0, ans=0.125 +2023-05-10 17:50:46,339 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=572640.0, ans=0.125 +2023-05-10 17:50:50,433 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0_sp0.9 from training. Duration: 0.92225 +2023-05-10 17:50:58,347 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=572640.0, ans=0.1 +2023-05-10 17:51:08,871 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.prob, batch_count=572690.0, ans=0.125 +2023-05-10 17:51:15,034 WARNING [train.py:1182] (1/2) Exclude cut with ID 4511-76322-0006-80011-0 from training. Duration: 21.97 +2023-05-10 17:51:18,180 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.scale_min, batch_count=572740.0, ans=0.2 +2023-05-10 17:51:21,106 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.1.self_attn_weights, attn_weights_entropy = tensor([6.0612, 5.2415, 5.3406, 5.9226], device='cuda:1') +2023-05-10 17:51:36,667 WARNING [train.py:1182] (1/2) Exclude cut with ID 7492-105653-0055-62765-0_sp0.9 from training. Duration: 21.97225 +2023-05-10 17:51:38,158 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0_sp0.9 from training. Duration: 25.3333125 +2023-05-10 17:51:41,444 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff2_skip_rate, batch_count=572790.0, ans=0.0 +2023-05-10 17:51:47,102 INFO [train.py:1021] (1/2) Epoch 32, batch 1850, loss[loss=0.1804, simple_loss=0.2707, pruned_loss=0.04511, over 37031.00 frames. ], tot_loss[loss=0.1692, simple_loss=0.26, pruned_loss=0.03921, over 7162745.66 frames. ], batch size: 116, lr: 3.42e-03, grad_scale: 32.0 +2023-05-10 17:51:47,290 WARNING [train.py:1182] (1/2) Exclude cut with ID 5172-29468-0015-19128-0_sp0.9 from training. Duration: 21.5055625 +2023-05-10 17:51:57,824 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0_sp1.1 from training. Duration: 20.72725 +2023-05-10 17:52:13,254 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.984e+02 3.552e+02 4.027e+02 4.852e+02 6.431e+02, threshold=8.054e+02, percent-clipped=0.0 +2023-05-10 17:52:15,071 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.skip_rate, batch_count=572890.0, ans=0.07 +2023-05-10 17:52:31,381 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.whiten, num_groups=1, num_channels=192, metric=3.75 vs. limit=12.0 +2023-05-10 17:52:33,596 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0_sp0.9 from training. Duration: 26.32775 +2023-05-10 17:52:38,877 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=3.12 vs. limit=15.0 +2023-05-10 17:52:53,617 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=573040.0, ans=0.125 +2023-05-10 17:52:58,230 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.2434, 4.1674, 3.8666, 4.2077, 3.4359, 3.2535, 3.6136, 3.0729], + device='cuda:1') +2023-05-10 17:53:04,512 INFO [train.py:1021] (1/2) Epoch 32, batch 1900, loss[loss=0.1427, simple_loss=0.2254, pruned_loss=0.02997, over 35814.00 frames. ], tot_loss[loss=0.1698, simple_loss=0.2596, pruned_loss=0.03999, over 7162686.77 frames. ], batch size: 79, lr: 3.42e-03, grad_scale: 32.0 +2023-05-10 17:53:09,211 WARNING [train.py:1182] (1/2) Exclude cut with ID 3867-173237-0077-144769-0 from training. Duration: 20.025 +2023-05-10 17:53:13,608 WARNING [train.py:1182] (1/2) Exclude cut with ID 6709-74022-0004-86860-0_sp1.1 from training. Duration: 0.9409375 +2023-05-10 17:53:13,619 WARNING [train.py:1182] (1/2) Exclude cut with ID 4757-1811-0023-62229-0_sp0.9 from training. Duration: 21.37775 +2023-05-10 17:53:20,511 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=573140.0, ans=0.125 +2023-05-10 17:53:28,110 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=573140.0, ans=0.1 +2023-05-10 17:53:34,320 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=573190.0, ans=0.1 +2023-05-10 17:53:37,116 WARNING [train.py:1182] (1/2) Exclude cut with ID 1250-135782-0004-25974-0_sp0.9 from training. Duration: 21.17225 +2023-05-10 17:53:37,128 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0_sp0.9 from training. Duration: 27.511125 +2023-05-10 17:54:09,672 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0 from training. Duration: 22.8 +2023-05-10 17:54:14,150 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0 from training. Duration: 22.585 +2023-05-10 17:54:21,608 INFO [train.py:1021] (1/2) Epoch 32, batch 1950, loss[loss=0.1823, simple_loss=0.2697, pruned_loss=0.04745, over 36813.00 frames. ], tot_loss[loss=0.1707, simple_loss=0.2598, pruned_loss=0.04077, over 7147169.45 frames. ], batch size: 113, lr: 3.42e-03, grad_scale: 32.0 +2023-05-10 17:54:41,715 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=573390.0, ans=0.125 +2023-05-10 17:54:44,659 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([3.1321, 4.1696, 4.7449, 4.9094], device='cuda:1') +2023-05-10 17:54:47,139 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.993e+02 3.610e+02 3.995e+02 5.004e+02 6.503e+02, threshold=7.991e+02, percent-clipped=0.0 +2023-05-10 17:54:47,269 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0001-146967-0_sp0.9 from training. Duration: 22.0166875 +2023-05-10 17:55:04,970 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0_sp1.1 from training. Duration: 24.395375 +2023-05-10 17:55:10,150 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=7.79 vs. limit=22.5 +2023-05-10 17:55:12,496 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0_sp0.9 from training. Duration: 27.47775 +2023-05-10 17:55:14,227 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.prob, batch_count=573490.0, ans=0.125 +2023-05-10 17:55:16,971 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0_sp0.9 from training. Duration: 24.8833125 +2023-05-10 17:55:18,812 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer2.prob, batch_count=573490.0, ans=0.125 +2023-05-10 17:55:19,916 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0 from training. Duration: 23.39 +2023-05-10 17:55:24,641 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0_sp0.9 from training. Duration: 28.72225 +2023-05-10 17:55:33,622 WARNING [train.py:1182] (1/2) Exclude cut with ID 585-294811-0110-133686-0_sp0.9 from training. Duration: 20.8944375 +2023-05-10 17:55:35,299 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer2.prob, batch_count=573540.0, ans=0.125 +2023-05-10 17:55:38,096 INFO [train.py:1021] (1/2) Epoch 32, batch 2000, loss[loss=0.1708, simple_loss=0.2658, pruned_loss=0.03793, over 37103.00 frames. ], tot_loss[loss=0.1712, simple_loss=0.2598, pruned_loss=0.04124, over 7141699.57 frames. ], batch size: 107, lr: 3.42e-03, grad_scale: 32.0 +2023-05-10 17:55:41,452 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.skip_rate, batch_count=573590.0, ans=0.09899494936611666 +2023-05-10 17:55:49,425 WARNING [train.py:1182] (1/2) Exclude cut with ID 5796-66357-0007-116447-0_sp0.9 from training. Duration: 23.8444375 +2023-05-10 17:56:14,700 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=256, metric=19.22 vs. limit=22.5 +2023-05-10 17:56:16,855 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0 from training. Duration: 25.85 +2023-05-10 17:56:16,865 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0023-13010-0 from training. Duration: 21.39 +2023-05-10 17:56:21,900 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.balancer.max_positive, batch_count=573690.0, ans=0.95 +2023-05-10 17:56:25,818 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0 from training. Duration: 27.92 +2023-05-10 17:56:31,933 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([4.3482, 3.7386, 2.6825, 2.5196], device='cuda:1') +2023-05-10 17:56:33,362 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=573740.0, ans=0.125 +2023-05-10 17:56:53,552 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0039-130165-0_sp0.9 from training. Duration: 20.661125 +2023-05-10 17:56:55,009 INFO [train.py:1021] (1/2) Epoch 32, batch 2050, loss[loss=0.1561, simple_loss=0.2408, pruned_loss=0.03567, over 37067.00 frames. ], tot_loss[loss=0.1719, simple_loss=0.2602, pruned_loss=0.04177, over 7129873.69 frames. ], batch size: 94, lr: 3.42e-03, grad_scale: 32.0 +2023-05-10 17:56:58,339 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.5672, 5.4061, 4.8052, 5.2142], device='cuda:1') +2023-05-10 17:57:19,203 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0043-15874-0_sp0.9 from training. Duration: 20.07225 +2023-05-10 17:57:20,569 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.899e+02 3.582e+02 3.968e+02 4.690e+02 7.278e+02, threshold=7.937e+02, percent-clipped=0.0 +2023-05-10 17:57:25,043 WARNING [train.py:1182] (1/2) Exclude cut with ID 1085-156170-0017-128270-0 from training. Duration: 21.01 +2023-05-10 17:58:02,916 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.const_attention_rate, batch_count=574040.0, ans=0.025 +2023-05-10 17:58:08,955 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.balancer.min_positive, batch_count=574040.0, ans=0.05 +2023-05-10 17:58:11,560 INFO [train.py:1021] (1/2) Epoch 32, batch 2100, loss[loss=0.1643, simple_loss=0.2422, pruned_loss=0.04315, over 36842.00 frames. ], tot_loss[loss=0.1724, simple_loss=0.2605, pruned_loss=0.04212, over 7149856.97 frames. ], batch size: 89, lr: 3.42e-03, grad_scale: 32.0 +2023-05-10 17:58:15,048 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer1.min_positive, batch_count=574090.0, ans=0.025 +2023-05-10 17:58:16,684 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=574090.0, ans=0.0 +2023-05-10 17:58:38,369 WARNING [train.py:1182] (1/2) Exclude cut with ID 2195-150901-0045-59933-0 from training. Duration: 20.65 +2023-05-10 17:58:46,006 WARNING [train.py:1182] (1/2) Exclude cut with ID 5796-66357-0007-116447-0 from training. Duration: 21.46 +2023-05-10 17:58:56,819 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 17:58:58,328 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass_mid.scale_min, batch_count=574240.0, ans=0.2 +2023-05-10 17:59:28,924 INFO [train.py:1021] (1/2) Epoch 32, batch 2150, loss[loss=0.1824, simple_loss=0.2719, pruned_loss=0.04642, over 36805.00 frames. ], tot_loss[loss=0.1727, simple_loss=0.2605, pruned_loss=0.04248, over 7123218.57 frames. ], batch size: 122, lr: 3.42e-03, grad_scale: 16.0 +2023-05-10 17:59:32,177 WARNING [train.py:1182] (1/2) Exclude cut with ID 3557-8342-0013-54691-0 from training. Duration: 0.92 +2023-05-10 17:59:41,115 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0023-13010-0_sp0.9 from training. Duration: 23.7666875 +2023-05-10 17:59:49,037 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.skip_rate, batch_count=574390.0, ans=0.09899494936611666 +2023-05-10 17:59:55,979 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.897e+02 3.687e+02 4.211e+02 5.213e+02 8.983e+02, threshold=8.422e+02, percent-clipped=1.0 +2023-05-10 18:00:18,541 WARNING [train.py:1182] (1/2) Exclude cut with ID 8544-281189-0060-101339-0_sp0.9 from training. Duration: 20.861125 +2023-05-10 18:00:29,270 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-65654-0031-41259-0_sp0.9 from training. Duration: 22.711125 +2023-05-10 18:00:45,602 INFO [train.py:1021] (1/2) Epoch 32, batch 2200, loss[loss=0.1462, simple_loss=0.228, pruned_loss=0.03219, over 36789.00 frames. ], tot_loss[loss=0.1729, simple_loss=0.2606, pruned_loss=0.04264, over 7113624.24 frames. ], batch size: 89, lr: 3.42e-03, grad_scale: 16.0 +2023-05-10 18:01:09,412 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer2.prob, batch_count=574640.0, ans=0.125 +2023-05-10 18:01:15,083 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0_sp1.1 from training. Duration: 22.986375 +2023-05-10 18:01:16,822 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=574690.0, ans=0.125 +2023-05-10 18:01:31,465 WARNING [train.py:1182] (1/2) Exclude cut with ID 8040-260924-0003-80960-0_sp0.9 from training. Duration: 22.07225 +2023-05-10 18:01:34,583 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0045-26330-0_sp0.9 from training. Duration: 20.3055625 +2023-05-10 18:01:37,412 WARNING [train.py:1182] (1/2) Exclude cut with ID 6356-271890-0060-94317-0_sp0.9 from training. Duration: 20.72225 +2023-05-10 18:01:56,825 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0_sp1.1 from training. Duration: 22.4818125 +2023-05-10 18:01:57,131 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 18:02:01,820 INFO [train.py:1021] (1/2) Epoch 32, batch 2250, loss[loss=0.1734, simple_loss=0.264, pruned_loss=0.04146, over 32289.00 frames. ], tot_loss[loss=0.1729, simple_loss=0.2605, pruned_loss=0.04266, over 7114057.42 frames. ], batch size: 170, lr: 3.42e-03, grad_scale: 16.0 +2023-05-10 18:02:14,585 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.0570, 4.7095, 4.5012, 5.0360], device='cuda:1') +2023-05-10 18:02:23,444 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0_sp0.9 from training. Duration: 25.0944375 +2023-05-10 18:02:23,770 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.1635, 3.9581, 3.6837, 3.9774, 3.3418, 2.9811, 3.4620, 3.0106], + device='cuda:1') +2023-05-10 18:02:27,924 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0047-104881-0 from training. Duration: 21.515 +2023-05-10 18:02:29,307 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.614e+02 3.468e+02 3.904e+02 4.427e+02 6.935e+02, threshold=7.807e+02, percent-clipped=0.0 +2023-05-10 18:02:35,497 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0_sp0.9 from training. Duration: 27.02225 +2023-05-10 18:02:38,890 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=574940.0, ans=0.125 +2023-05-10 18:02:40,091 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0010-62480-0_sp0.9 from training. Duration: 22.22225 +2023-05-10 18:02:46,730 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=8.47 vs. limit=22.5 +2023-05-10 18:02:47,635 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0085-44554-0_sp0.9 from training. Duration: 20.85 +2023-05-10 18:02:54,029 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=4.34 vs. limit=15.0 +2023-05-10 18:02:58,334 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.1233, 4.4391, 4.6361, 4.7057], device='cuda:1') +2023-05-10 18:03:14,637 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer2.prob, batch_count=575040.0, ans=0.125 +2023-05-10 18:03:18,679 INFO [train.py:1021] (1/2) Epoch 32, batch 2300, loss[loss=0.1742, simple_loss=0.2682, pruned_loss=0.04008, over 32427.00 frames. ], tot_loss[loss=0.1727, simple_loss=0.2601, pruned_loss=0.04271, over 7113681.82 frames. ], batch size: 170, lr: 3.42e-03, grad_scale: 16.0 +2023-05-10 18:03:20,328 WARNING [train.py:1182] (1/2) Exclude cut with ID 4295-39940-0007-92567-0 from training. Duration: 21.54 +2023-05-10 18:03:23,415 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0_sp1.1 from training. Duration: 20.5318125 +2023-05-10 18:03:33,997 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0012-134311-0_sp0.9 from training. Duration: 21.9333125 +2023-05-10 18:04:25,061 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0025-130151-0_sp0.9 from training. Duration: 21.7944375 +2023-05-10 18:04:25,435 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=575290.0, ans=0.0 +2023-05-10 18:04:35,626 INFO [train.py:1021] (1/2) Epoch 32, batch 2350, loss[loss=0.185, simple_loss=0.2738, pruned_loss=0.04813, over 36688.00 frames. ], tot_loss[loss=0.1724, simple_loss=0.2598, pruned_loss=0.04246, over 7129239.46 frames. ], batch size: 122, lr: 3.42e-03, grad_scale: 16.0 +2023-05-10 18:04:36,046 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.out_combiner.scale_min, batch_count=575340.0, ans=0.2 +2023-05-10 18:04:37,292 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0002-12989-0_sp0.9 from training. Duration: 22.4666875 +2023-05-10 18:04:46,145 WARNING [train.py:1182] (1/2) Exclude cut with ID 6121-9014-0076-24124-0 from training. Duration: 21.635 +2023-05-10 18:04:52,091 WARNING [train.py:1182] (1/2) Exclude cut with ID 6121-9014-0076-24124-0_sp0.9 from training. Duration: 24.038875 +2023-05-10 18:05:04,103 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.769e+02 3.443e+02 3.959e+02 5.041e+02 9.434e+02, threshold=7.919e+02, percent-clipped=5.0 +2023-05-10 18:05:41,159 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0_sp1.1 from training. Duration: 21.786375 +2023-05-10 18:05:53,131 INFO [train.py:1021] (1/2) Epoch 32, batch 2400, loss[loss=0.1563, simple_loss=0.2412, pruned_loss=0.03572, over 37186.00 frames. ], tot_loss[loss=0.1716, simple_loss=0.2591, pruned_loss=0.0421, over 7161032.15 frames. ], batch size: 93, lr: 3.42e-03, grad_scale: 32.0 +2023-05-10 18:05:53,215 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0002-12989-0 from training. Duration: 20.22 +2023-05-10 18:05:54,966 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward3.hidden_balancer.prob, batch_count=575590.0, ans=0.125 +2023-05-10 18:06:21,993 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass.skip_rate, batch_count=575690.0, ans=0.04949747468305833 +2023-05-10 18:06:39,610 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.nonlin_attention.balancer.prob, batch_count=575740.0, ans=0.125 +2023-05-10 18:07:09,736 INFO [train.py:1021] (1/2) Epoch 32, batch 2450, loss[loss=0.1562, simple_loss=0.2452, pruned_loss=0.03358, over 37067.00 frames. ], tot_loss[loss=0.1714, simple_loss=0.2588, pruned_loss=0.04207, over 7165176.19 frames. ], batch size: 94, lr: 3.42e-03, grad_scale: 32.0 +2023-05-10 18:07:14,765 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=575840.0, ans=0.0 +2023-05-10 18:07:37,351 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.782e+02 3.492e+02 3.883e+02 4.522e+02 7.316e+02, threshold=7.765e+02, percent-clipped=0.0 +2023-05-10 18:07:37,769 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=575890.0, ans=0.125 +2023-05-10 18:07:46,745 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=575940.0, ans=0.125 +2023-05-10 18:07:58,754 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0 from training. Duration: 25.285 +2023-05-10 18:07:59,069 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([1.9295, 3.7074, 3.4461, 3.7079, 3.1468, 2.9233, 3.3172, 2.8384], + device='cuda:1') +2023-05-10 18:08:10,924 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff3_skip_rate, batch_count=576040.0, ans=0.0 +2023-05-10 18:08:14,633 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.prob, batch_count=576040.0, ans=0.125 +2023-05-10 18:08:16,186 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 18:08:16,215 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=576040.0, ans=0.125 +2023-05-10 18:08:26,039 INFO [train.py:1021] (1/2) Epoch 32, batch 2500, loss[loss=0.164, simple_loss=0.2553, pruned_loss=0.03628, over 37095.00 frames. ], tot_loss[loss=0.1713, simple_loss=0.2585, pruned_loss=0.04202, over 7145425.73 frames. ], batch size: 110, lr: 3.41e-03, grad_scale: 32.0 +2023-05-10 18:08:26,517 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 18:08:37,522 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=3.79 vs. limit=12.0 +2023-05-10 18:08:47,094 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=576140.0, ans=0.0 +2023-05-10 18:08:57,688 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.skip_rate, batch_count=576190.0, ans=0.09899494936611666 +2023-05-10 18:09:07,211 WARNING [train.py:1182] (1/2) Exclude cut with ID 811-130148-0001-63453-0_sp0.9 from training. Duration: 20.861125 +2023-05-10 18:09:14,032 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer2.prob, batch_count=576240.0, ans=0.125 +2023-05-10 18:09:15,529 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward3.hidden_balancer.prob, batch_count=576240.0, ans=0.125 +2023-05-10 18:09:24,879 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=3.17 vs. limit=15.0 +2023-05-10 18:09:31,858 WARNING [train.py:1182] (1/2) Exclude cut with ID 6010-56788-0055-90261-0 from training. Duration: 20.88 +2023-05-10 18:09:39,959 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=5.50 vs. limit=15.0 +2023-05-10 18:09:42,220 INFO [train.py:1021] (1/2) Epoch 32, batch 2550, loss[loss=0.2142, simple_loss=0.2959, pruned_loss=0.06622, over 24661.00 frames. ], tot_loss[loss=0.1709, simple_loss=0.2581, pruned_loss=0.04182, over 7159005.68 frames. ], batch size: 233, lr: 3.41e-03, grad_scale: 32.0 +2023-05-10 18:09:58,098 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=4.46 vs. limit=15.0 +2023-05-10 18:09:59,132 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=576390.0, ans=0.1 +2023-05-10 18:10:09,009 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0045-15876-0_sp0.9 from training. Duration: 23.4166875 +2023-05-10 18:10:10,395 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.845e+02 3.441e+02 3.881e+02 4.444e+02 7.007e+02, threshold=7.761e+02, percent-clipped=0.0 +2023-05-10 18:10:14,006 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer2.prob, batch_count=576440.0, ans=0.125 +2023-05-10 18:10:31,214 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn1.whiten, num_groups=1, num_channels=192, metric=10.81 vs. limit=22.5 +2023-05-10 18:10:33,645 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff3_skip_rate, batch_count=576490.0, ans=0.0 +2023-05-10 18:10:45,426 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.prob, batch_count=576540.0, ans=0.125 +2023-05-10 18:10:48,333 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass_mid.scale_min, batch_count=576540.0, ans=0.2 +2023-05-10 18:10:53,522 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=5.14 vs. limit=15.0 +2023-05-10 18:10:55,296 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=576540.0, ans=0.0 +2023-05-10 18:10:59,863 INFO [train.py:1021] (1/2) Epoch 32, batch 2600, loss[loss=0.2103, simple_loss=0.288, pruned_loss=0.0663, over 24339.00 frames. ], tot_loss[loss=0.1712, simple_loss=0.2585, pruned_loss=0.04195, over 7157675.56 frames. ], batch size: 233, lr: 3.41e-03, grad_scale: 32.0 +2023-05-10 18:11:27,033 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0006-134305-0 from training. Duration: 21.24 +2023-05-10 18:11:27,047 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0047-104881-0_sp0.9 from training. Duration: 23.9055625 +2023-05-10 18:11:56,046 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=5.03 vs. limit=15.0 +2023-05-10 18:12:01,235 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0_sp0.9 from training. Duration: 25.988875 +2023-05-10 18:12:10,348 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0001-134300-0_sp0.9 from training. Duration: 20.67225 +2023-05-10 18:12:16,155 INFO [train.py:1021] (1/2) Epoch 32, batch 2650, loss[loss=0.1813, simple_loss=0.2684, pruned_loss=0.0471, over 34261.00 frames. ], tot_loss[loss=0.1717, simple_loss=0.2589, pruned_loss=0.04223, over 7094145.07 frames. ], batch size: 144, lr: 3.41e-03, grad_scale: 32.0 +2023-05-10 18:12:18,348 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=1.74 vs. limit=6.0 +2023-05-10 18:12:23,037 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=5.39 vs. limit=12.0 +2023-05-10 18:12:42,558 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer2.prob, batch_count=576890.0, ans=0.125 +2023-05-10 18:12:43,580 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.820e+02 3.378e+02 3.809e+02 4.419e+02 6.771e+02, threshold=7.618e+02, percent-clipped=0.0 +2023-05-10 18:12:53,612 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.5075, 4.0484, 3.7679, 4.0493, 3.3532, 3.1296, 3.5413, 2.9740], + device='cuda:1') +2023-05-10 18:13:02,198 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0038-41224-0 from training. Duration: 20.34 +2023-05-10 18:13:27,005 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=577040.0, ans=0.1 +2023-05-10 18:13:32,080 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass_mid.scale_min, batch_count=577090.0, ans=0.2 +2023-05-10 18:13:33,148 INFO [train.py:1021] (1/2) Epoch 32, batch 2700, loss[loss=0.1882, simple_loss=0.2748, pruned_loss=0.05085, over 36773.00 frames. ], tot_loss[loss=0.1712, simple_loss=0.2584, pruned_loss=0.042, over 7093537.47 frames. ], batch size: 122, lr: 3.41e-03, grad_scale: 32.0 +2023-05-10 18:13:54,771 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.9568, 3.4610, 3.2010, 4.0417, 2.0045, 3.4703, 4.0814, 3.5663], + device='cuda:1') +2023-05-10 18:14:17,124 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0_sp0.9 from training. Duration: 25.061125 +2023-05-10 18:14:23,845 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([4.7606, 4.0874, 2.5851, 2.8307], device='cuda:1') +2023-05-10 18:14:30,902 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0 from training. Duration: 0.83 +2023-05-10 18:14:33,313 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=9.48 vs. limit=15.0 +2023-05-10 18:14:39,550 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.nonlin_attention.balancer.prob, batch_count=577290.0, ans=0.125 +2023-05-10 18:14:49,497 INFO [train.py:1021] (1/2) Epoch 32, batch 2750, loss[loss=0.1435, simple_loss=0.2245, pruned_loss=0.03127, over 36825.00 frames. ], tot_loss[loss=0.1708, simple_loss=0.2577, pruned_loss=0.04193, over 7088845.41 frames. ], batch size: 89, lr: 3.41e-03, grad_scale: 16.0 +2023-05-10 18:14:54,244 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0 from training. Duration: 24.73 +2023-05-10 18:15:07,751 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0 from training. Duration: 23.965 +2023-05-10 18:15:08,044 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 18:15:18,796 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.938e+02 3.337e+02 3.728e+02 4.297e+02 6.489e+02, threshold=7.456e+02, percent-clipped=0.0 +2023-05-10 18:15:18,955 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0030-146996-0_sp0.9 from training. Duration: 22.088875 +2023-05-10 18:15:34,710 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0006-134305-0_sp0.9 from training. Duration: 23.6 +2023-05-10 18:16:02,254 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=6.44 vs. limit=15.0 +2023-05-10 18:16:06,098 INFO [train.py:1021] (1/2) Epoch 32, batch 2800, loss[loss=0.1663, simple_loss=0.2521, pruned_loss=0.0403, over 37024.00 frames. ], tot_loss[loss=0.1708, simple_loss=0.2577, pruned_loss=0.04196, over 7098787.80 frames. ], batch size: 99, lr: 3.41e-03, grad_scale: 32.0 +2023-05-10 18:16:36,446 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.const_attention_rate, batch_count=577690.0, ans=0.025 +2023-05-10 18:17:21,428 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0 from training. Duration: 23.795 +2023-05-10 18:17:24,382 INFO [train.py:1021] (1/2) Epoch 32, batch 2850, loss[loss=0.204, simple_loss=0.2817, pruned_loss=0.06313, over 24806.00 frames. ], tot_loss[loss=0.1717, simple_loss=0.2589, pruned_loss=0.04224, over 7099363.36 frames. ], batch size: 233, lr: 3.41e-03, grad_scale: 32.0 +2023-05-10 18:17:39,455 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0_sp1.1 from training. Duration: 21.5409375 +2023-05-10 18:17:42,503 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0_sp0.9 from training. Duration: 24.97775 +2023-05-10 18:17:48,925 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=577890.0, ans=0.125 +2023-05-10 18:17:48,975 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.scale_min, batch_count=577890.0, ans=0.2 +2023-05-10 18:17:53,037 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.770e+02 3.481e+02 3.849e+02 4.405e+02 5.844e+02, threshold=7.699e+02, percent-clipped=0.0 +2023-05-10 18:17:53,887 WARNING [train.py:1182] (1/2) Exclude cut with ID 1085-156170-0017-128270-0_sp0.9 from training. Duration: 23.3444375 +2023-05-10 18:17:58,584 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.skip_rate, batch_count=577940.0, ans=0.07 +2023-05-10 18:18:23,077 WARNING [train.py:1182] (1/2) Exclude cut with ID 6010-56788-0055-90261-0_sp0.9 from training. Duration: 23.2 +2023-05-10 18:18:29,254 WARNING [train.py:1182] (1/2) Exclude cut with ID 5653-46179-0060-117930-0_sp0.9 from training. Duration: 21.17225 +2023-05-10 18:18:30,999 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.min_positive, batch_count=578040.0, ans=0.025 +2023-05-10 18:18:41,175 INFO [train.py:1021] (1/2) Epoch 32, batch 2900, loss[loss=0.1732, simple_loss=0.2656, pruned_loss=0.04038, over 36834.00 frames. ], tot_loss[loss=0.1714, simple_loss=0.2584, pruned_loss=0.04216, over 7082663.64 frames. ], batch size: 113, lr: 3.41e-03, grad_scale: 32.0 +2023-05-10 18:18:49,624 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=578090.0, ans=0.0 +2023-05-10 18:18:52,358 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0_sp0.9 from training. Duration: 24.6555625 +2023-05-10 18:18:54,172 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=578090.0, ans=0.125 +2023-05-10 18:19:06,325 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=578140.0, ans=0.1 +2023-05-10 18:19:06,368 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.skip_rate, batch_count=578140.0, ans=0.04949747468305833 +2023-05-10 18:19:07,791 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.skip_rate, batch_count=578140.0, ans=0.09899494936611666 +2023-05-10 18:19:11,553 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=578190.0, ans=0.125 +2023-05-10 18:19:50,383 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-65654-0031-41259-0 from training. Duration: 20.44 +2023-05-10 18:19:57,667 INFO [train.py:1021] (1/2) Epoch 32, batch 2950, loss[loss=0.1701, simple_loss=0.26, pruned_loss=0.04009, over 36885.00 frames. ], tot_loss[loss=0.1713, simple_loss=0.2583, pruned_loss=0.04217, over 7073354.70 frames. ], batch size: 105, lr: 3.41e-03, grad_scale: 32.0 +2023-05-10 18:20:03,534 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass.scale_min, batch_count=578340.0, ans=0.2 +2023-05-10 18:20:06,268 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0018-132285-0_sp0.9 from training. Duration: 23.45 +2023-05-10 18:20:27,104 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.763e+02 3.644e+02 4.066e+02 5.505e+02 8.791e+02, threshold=8.132e+02, percent-clipped=6.0 +2023-05-10 18:20:30,538 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.prob, batch_count=578440.0, ans=0.125 +2023-05-10 18:20:37,250 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=14.84 vs. limit=22.5 +2023-05-10 18:20:37,951 WARNING [train.py:1182] (1/2) Exclude cut with ID 6945-60535-0076-12784-0_sp0.9 from training. Duration: 20.52225 +2023-05-10 18:20:45,536 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0 from training. Duration: 22.19 +2023-05-10 18:20:45,863 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.2598, 4.1586, 3.8477, 4.1493, 3.3842, 3.3190, 3.6255, 3.1202], + device='cuda:1') +2023-05-10 18:20:47,388 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([1.9543, 3.9381, 3.6556, 3.9064, 3.2479, 3.0381, 3.4322, 2.8965], + device='cuda:1') +2023-05-10 18:20:53,400 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=578490.0, ans=0.125 +2023-05-10 18:20:56,026 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0_sp1.1 from training. Duration: 25.3818125 +2023-05-10 18:21:04,508 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.skip_rate, batch_count=578540.0, ans=0.04949747468305833 +2023-05-10 18:21:14,504 INFO [train.py:1021] (1/2) Epoch 32, batch 3000, loss[loss=0.1835, simple_loss=0.2747, pruned_loss=0.04612, over 35809.00 frames. ], tot_loss[loss=0.1711, simple_loss=0.2583, pruned_loss=0.04197, over 7095328.75 frames. ], batch size: 133, lr: 3.41e-03, grad_scale: 32.0 +2023-05-10 18:21:14,505 INFO [train.py:1048] (1/2) Computing validation loss +2023-05-10 18:21:25,542 INFO [train.py:1057] (1/2) Epoch 32, validation: loss=0.1516, simple_loss=0.2526, pruned_loss=0.02526, over 944034.00 frames. +2023-05-10 18:21:25,542 INFO [train.py:1058] (1/2) Maximum memory allocated so far is 18671MB +2023-05-10 18:21:25,641 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0_sp0.9 from training. Duration: 28.0944375 +2023-05-10 18:21:31,414 WARNING [train.py:1182] (1/2) Exclude cut with ID 2195-150901-0045-59933-0_sp0.9 from training. Duration: 22.9444375 +2023-05-10 18:21:31,759 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.min_positive, batch_count=578590.0, ans=0.025 +2023-05-10 18:21:35,008 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=3.06 vs. limit=15.0 +2023-05-10 18:21:38,930 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0_sp1.1 from training. Duration: 21.6318125 +2023-05-10 18:21:42,938 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=578640.0, ans=0.0 +2023-05-10 18:21:56,065 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0 from training. Duration: 23.695 +2023-05-10 18:22:22,307 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0 from training. Duration: 23.955 +2023-05-10 18:22:35,207 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.const_attention_rate, batch_count=578790.0, ans=0.025 +2023-05-10 18:22:42,218 INFO [train.py:1021] (1/2) Epoch 32, batch 3050, loss[loss=0.1779, simple_loss=0.2695, pruned_loss=0.04321, over 37057.00 frames. ], tot_loss[loss=0.1711, simple_loss=0.2582, pruned_loss=0.04205, over 7084252.94 frames. ], batch size: 110, lr: 3.41e-03, grad_scale: 32.0 +2023-05-10 18:22:42,587 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=578840.0, ans=0.0 +2023-05-10 18:22:51,717 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.scale_min, batch_count=578840.0, ans=0.2 +2023-05-10 18:22:57,526 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0_sp0.9 from training. Duration: 26.438875 +2023-05-10 18:23:11,146 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.767e+02 3.371e+02 3.716e+02 4.307e+02 6.059e+02, threshold=7.431e+02, percent-clipped=0.0 +2023-05-10 18:23:30,259 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=578990.0, ans=0.0 +2023-05-10 18:23:36,799 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=9.12 vs. limit=15.0 +2023-05-10 18:23:43,495 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0021-26306-0_sp0.9 from training. Duration: 21.2444375 +2023-05-10 18:23:44,892 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0_sp0.9 from training. Duration: 31.02225 +2023-05-10 18:23:56,062 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0 from training. Duration: 22.395 +2023-05-10 18:23:59,105 INFO [train.py:1021] (1/2) Epoch 32, batch 3100, loss[loss=0.1751, simple_loss=0.2688, pruned_loss=0.04065, over 37031.00 frames. ], tot_loss[loss=0.171, simple_loss=0.2581, pruned_loss=0.04196, over 7101379.41 frames. ], batch size: 116, lr: 3.41e-03, grad_scale: 32.0 +2023-05-10 18:24:15,670 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0045-15876-0 from training. Duration: 21.075 +2023-05-10 18:24:20,404 WARNING [train.py:1182] (1/2) Exclude cut with ID 6482-98857-0025-147532-0_sp0.9 from training. Duration: 20.0055625 +2023-05-10 18:24:20,415 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0037-132304-0_sp0.9 from training. Duration: 22.05 +2023-05-10 18:24:22,419 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0 from training. Duration: 26.8349375 +2023-05-10 18:24:22,690 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.scale_min, batch_count=579140.0, ans=0.2 +2023-05-10 18:24:25,418 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0_sp1.1 from training. Duration: 22.1090625 +2023-05-10 18:24:29,024 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=4.87 vs. limit=15.0 +2023-05-10 18:24:32,827 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0_sp0.9 from training. Duration: 26.6166875 +2023-05-10 18:24:46,533 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff3_skip_rate, batch_count=579240.0, ans=0.0 +2023-05-10 18:24:48,048 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.0889, 4.4639, 3.3821, 3.0621], device='cuda:1') +2023-05-10 18:24:52,813 WARNING [train.py:1182] (1/2) Exclude cut with ID 2046-178027-0000-53705-0_sp0.9 from training. Duration: 20.3055625 +2023-05-10 18:25:02,238 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=579290.0, ans=0.125 +2023-05-10 18:25:13,914 WARNING [train.py:1182] (1/2) Exclude cut with ID 7205-50138-0008-5373-0_sp0.9 from training. Duration: 20.7 +2023-05-10 18:25:15,931 INFO [train.py:1021] (1/2) Epoch 32, batch 3150, loss[loss=0.1525, simple_loss=0.2325, pruned_loss=0.03621, over 37101.00 frames. ], tot_loss[loss=0.171, simple_loss=0.2583, pruned_loss=0.04191, over 7112328.89 frames. ], batch size: 88, lr: 3.41e-03, grad_scale: 32.0 +2023-05-10 18:25:30,049 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.attention_skip_rate, batch_count=579390.0, ans=0.0 +2023-05-10 18:25:35,276 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=5.44 vs. limit=12.0 +2023-05-10 18:25:44,130 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.0.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 18:25:45,297 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.772e+02 3.405e+02 3.688e+02 4.268e+02 5.994e+02, threshold=7.376e+02, percent-clipped=0.0 +2023-05-10 18:26:00,613 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0 from training. Duration: 22.48 +2023-05-10 18:26:16,369 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0_sp0.9 from training. Duration: 29.816625 +2023-05-10 18:26:16,618 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=579540.0, ans=0.1 +2023-05-10 18:26:29,974 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=579540.0, ans=0.125 +2023-05-10 18:26:32,664 INFO [train.py:1021] (1/2) Epoch 32, batch 3200, loss[loss=0.17, simple_loss=0.2585, pruned_loss=0.04079, over 36849.00 frames. ], tot_loss[loss=0.1711, simple_loss=0.2582, pruned_loss=0.04199, over 7104084.27 frames. ], batch size: 96, lr: 3.40e-03, grad_scale: 32.0 +2023-05-10 18:26:36,745 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=579590.0, ans=0.1 +2023-05-10 18:26:37,851 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0_sp1.1 from training. Duration: 22.7590625 +2023-05-10 18:26:43,693 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0 from training. Duration: 22.555 +2023-05-10 18:27:05,624 WARNING [train.py:1182] (1/2) Exclude cut with ID 1250-135782-0005-25975-0_sp0.9 from training. Duration: 21.688875 +2023-05-10 18:27:24,158 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=9.34 vs. limit=15.0 +2023-05-10 18:27:35,595 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=4.98 vs. limit=15.0 +2023-05-10 18:27:39,429 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.scale_min, batch_count=579790.0, ans=0.2 +2023-05-10 18:27:40,689 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0038-41224-0_sp0.9 from training. Duration: 22.6 +2023-05-10 18:27:41,511 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=3.13 vs. limit=15.0 +2023-05-10 18:27:49,845 INFO [train.py:1021] (1/2) Epoch 32, batch 3250, loss[loss=0.1743, simple_loss=0.2656, pruned_loss=0.04144, over 37169.00 frames. ], tot_loss[loss=0.1712, simple_loss=0.2583, pruned_loss=0.04205, over 7083629.00 frames. ], batch size: 102, lr: 3.40e-03, grad_scale: 32.0 +2023-05-10 18:28:19,576 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.914e+02 3.371e+02 3.798e+02 4.615e+02 8.150e+02, threshold=7.595e+02, percent-clipped=1.0 +2023-05-10 18:28:22,676 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0 from training. Duration: 24.32 +2023-05-10 18:28:24,630 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer1.min_positive, batch_count=579940.0, ans=0.025 +2023-05-10 18:28:27,522 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass_mid.scale_min, batch_count=579940.0, ans=0.2 +2023-05-10 18:28:35,061 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=579990.0, ans=0.1 +2023-05-10 18:28:42,672 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=579990.0, ans=0.125 +2023-05-10 18:29:08,389 INFO [train.py:1021] (1/2) Epoch 32, batch 3300, loss[loss=0.1632, simple_loss=0.2458, pruned_loss=0.0403, over 37067.00 frames. ], tot_loss[loss=0.1708, simple_loss=0.2578, pruned_loss=0.04187, over 7110639.32 frames. ], batch size: 94, lr: 3.40e-03, grad_scale: 32.0 +2023-05-10 18:29:10,581 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=3.71 vs. limit=10.0 +2023-05-10 18:29:14,797 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff3_skip_rate, batch_count=580090.0, ans=0.0 +2023-05-10 18:29:24,208 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-276745-0093-13116-0_sp0.9 from training. Duration: 21.061125 +2023-05-10 18:29:36,180 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0024-15855-0_sp0.9 from training. Duration: 20.32225 +2023-05-10 18:29:45,042 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=7.37 vs. limit=15.0 +2023-05-10 18:29:50,546 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0_sp1.1 from training. Duration: 0.7545625 +2023-05-10 18:29:56,999 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 18:30:02,907 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=580240.0, ans=0.1 +2023-05-10 18:30:07,780 WARNING [train.py:1182] (1/2) Exclude cut with ID 4295-39940-0007-92567-0_sp0.9 from training. Duration: 23.9333125 +2023-05-10 18:30:25,089 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.5972, 5.4011, 4.8170, 5.2267], device='cuda:1') +2023-05-10 18:30:26,225 INFO [train.py:1021] (1/2) Epoch 32, batch 3350, loss[loss=0.1494, simple_loss=0.2363, pruned_loss=0.03126, over 37172.00 frames. ], tot_loss[loss=0.1707, simple_loss=0.2576, pruned_loss=0.04187, over 7101296.16 frames. ], batch size: 93, lr: 3.40e-03, grad_scale: 16.0 +2023-05-10 18:30:43,294 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0_sp1.1 from training. Duration: 20.17275 +2023-05-10 18:30:43,656 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff3_skip_rate, batch_count=580390.0, ans=0.0 +2023-05-10 18:30:49,203 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0_sp1.1 from training. Duration: 20.436375 +2023-05-10 18:30:56,696 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.884e+02 3.446e+02 3.805e+02 4.083e+02 7.157e+02, threshold=7.609e+02, percent-clipped=0.0 +2023-05-10 18:31:00,051 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer2.prob, batch_count=580440.0, ans=0.125 +2023-05-10 18:31:20,656 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=3.34 vs. limit=15.0 +2023-05-10 18:31:42,952 INFO [train.py:1021] (1/2) Epoch 32, batch 3400, loss[loss=0.171, simple_loss=0.2663, pruned_loss=0.03791, over 36870.00 frames. ], tot_loss[loss=0.1707, simple_loss=0.2578, pruned_loss=0.0418, over 7102774.70 frames. ], batch size: 111, lr: 3.40e-03, grad_scale: 16.0 +2023-05-10 18:32:07,509 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.conv_module1.whiten, num_groups=1, num_channels=192, metric=8.51 vs. limit=15.0 +2023-05-10 18:32:08,402 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer2.prob, batch_count=580640.0, ans=0.125 +2023-05-10 18:32:12,970 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward2.hidden_balancer.prob, batch_count=580690.0, ans=0.125 +2023-05-10 18:32:14,044 WARNING [train.py:1182] (1/2) Exclude cut with ID 4234-40345-0022-142709-0_sp0.9 from training. Duration: 23.1055625 +2023-05-10 18:32:15,569 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0_sp1.1 from training. Duration: 23.5 +2023-05-10 18:32:15,880 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.out_combiner.scale_min, batch_count=580690.0, ans=0.2 +2023-05-10 18:32:26,269 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0_sp0.9 from training. Duration: 26.62775 +2023-05-10 18:32:41,823 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0018-132285-0 from training. Duration: 21.105 +2023-05-10 18:32:47,051 WARNING [train.py:1182] (1/2) Exclude cut with ID 4511-76322-0006-80011-0_sp0.9 from training. Duration: 24.411125 +2023-05-10 18:32:50,550 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=6.59 vs. limit=15.0 +2023-05-10 18:33:00,450 INFO [train.py:1021] (1/2) Epoch 32, batch 3450, loss[loss=0.1643, simple_loss=0.2547, pruned_loss=0.03696, over 36935.00 frames. ], tot_loss[loss=0.1709, simple_loss=0.2579, pruned_loss=0.04197, over 7066131.15 frames. ], batch size: 108, lr: 3.40e-03, grad_scale: 16.0 +2023-05-10 18:33:11,244 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.8722, 5.0787, 5.1762, 5.7167], device='cuda:1') +2023-05-10 18:33:11,558 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=3.32 vs. limit=12.0 +2023-05-10 18:33:14,066 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0_sp1.1 from training. Duration: 21.263625 +2023-05-10 18:33:18,896 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=580890.0, ans=0.125 +2023-05-10 18:33:30,896 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.845e+02 3.593e+02 4.140e+02 4.922e+02 7.337e+02, threshold=8.279e+02, percent-clipped=0.0 +2023-05-10 18:33:32,818 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.prob, batch_count=580940.0, ans=0.125 +2023-05-10 18:33:48,295 WARNING [train.py:1182] (1/2) Exclude cut with ID 4234-40345-0022-142709-0 from training. Duration: 20.795 +2023-05-10 18:33:51,625 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff3_skip_rate, batch_count=580990.0, ans=0.0 +2023-05-10 18:34:00,285 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0 from training. Duration: 24.76 +2023-05-10 18:34:00,311 WARNING [train.py:1182] (1/2) Exclude cut with ID 3867-173237-0077-144769-0_sp0.9 from training. Duration: 22.25 +2023-05-10 18:34:14,248 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.balancer.min_positive, batch_count=581040.0, ans=0.05 +2023-05-10 18:34:14,291 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=581040.0, ans=0.1 +2023-05-10 18:34:16,896 INFO [train.py:1021] (1/2) Epoch 32, batch 3500, loss[loss=0.1694, simple_loss=0.2498, pruned_loss=0.04455, over 37171.00 frames. ], tot_loss[loss=0.1706, simple_loss=0.2575, pruned_loss=0.04189, over 7064778.31 frames. ], batch size: 93, lr: 3.40e-03, grad_scale: 16.0 +2023-05-10 18:34:29,179 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.out_combiner.scale_min, batch_count=581090.0, ans=0.2 +2023-05-10 18:34:30,314 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0_sp1.1 from training. Duration: 20.5045625 +2023-05-10 18:34:31,266 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_whiten.whitening_limit, batch_count=581090.0, ans=15.0 +2023-05-10 18:35:27,520 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_skip_rate, batch_count=581290.0, ans=0.0 +2023-05-10 18:35:32,776 INFO [train.py:1021] (1/2) Epoch 32, batch 3550, loss[loss=0.1721, simple_loss=0.2656, pruned_loss=0.03933, over 36945.00 frames. ], tot_loss[loss=0.1709, simple_loss=0.2581, pruned_loss=0.04183, over 7077923.51 frames. ], batch size: 108, lr: 3.40e-03, grad_scale: 16.0 +2023-05-10 18:35:50,763 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.const_attention_rate, batch_count=581390.0, ans=0.025 +2023-05-10 18:36:02,136 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.868e+02 3.362e+02 3.647e+02 3.967e+02 5.121e+02, threshold=7.293e+02, percent-clipped=0.0 +2023-05-10 18:36:13,319 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=192, metric=6.25 vs. limit=15.0 +2023-05-10 18:36:44,902 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.min_positive, batch_count=581590.0, ans=0.025 +2023-05-10 18:36:45,979 INFO [train.py:1021] (1/2) Epoch 32, batch 3600, loss[loss=0.1573, simple_loss=0.2327, pruned_loss=0.04102, over 35768.00 frames. ], tot_loss[loss=0.1704, simple_loss=0.2575, pruned_loss=0.0416, over 7111792.82 frames. ], batch size: 79, lr: 3.40e-03, grad_scale: 32.0 +2023-05-10 18:37:09,077 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.const_attention_rate, batch_count=581640.0, ans=0.025 +2023-05-10 18:37:10,521 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=581640.0, ans=0.1 +2023-05-10 18:37:14,842 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=581690.0, ans=0.1 +2023-05-10 18:37:27,309 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=581690.0, ans=0.0 +2023-05-10 18:37:28,981 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=9.37 vs. limit=15.0 +2023-05-10 18:37:29,059 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=8.11 vs. limit=15.0 +2023-05-10 18:37:53,546 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp1.1 from training. Duration: 22.2954375 +2023-05-10 18:38:00,564 INFO [train.py:1021] (1/2) Epoch 33, batch 0, loss[loss=0.1494, simple_loss=0.2367, pruned_loss=0.03101, over 35483.00 frames. ], tot_loss[loss=0.1494, simple_loss=0.2367, pruned_loss=0.03101, over 35483.00 frames. ], batch size: 78, lr: 3.35e-03, grad_scale: 32.0 +2023-05-10 18:38:00,564 INFO [train.py:1048] (1/2) Computing validation loss +2023-05-10 18:38:11,256 INFO [train.py:1057] (1/2) Epoch 33, validation: loss=0.1528, simple_loss=0.2536, pruned_loss=0.02601, over 944034.00 frames. +2023-05-10 18:38:11,257 INFO [train.py:1058] (1/2) Maximum memory allocated so far is 18671MB +2023-05-10 18:38:15,394 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=5.52 vs. limit=10.0 +2023-05-10 18:38:19,206 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer1.prob, batch_count=581770.0, ans=0.125 +2023-05-10 18:38:23,703 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=581770.0, ans=0.125 +2023-05-10 18:39:03,135 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=4.48 vs. limit=15.0 +2023-05-10 18:39:03,755 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.383e+02 3.408e+02 3.904e+02 4.709e+02 7.307e+02, threshold=7.808e+02, percent-clipped=1.0 +2023-05-10 18:39:03,943 WARNING [train.py:1182] (1/2) Exclude cut with ID 298-126791-0067-24026-0_sp0.9 from training. Duration: 21.438875 +2023-05-10 18:39:05,684 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=581920.0, ans=0.125 +2023-05-10 18:39:09,739 WARNING [train.py:1182] (1/2) Exclude cut with ID 5652-39938-0025-23684-0_sp0.9 from training. Duration: 22.2055625 +2023-05-10 18:39:26,981 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.5.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 18:39:27,978 INFO [train.py:1021] (1/2) Epoch 33, batch 50, loss[loss=0.1718, simple_loss=0.2729, pruned_loss=0.03532, over 36843.00 frames. ], tot_loss[loss=0.165, simple_loss=0.2581, pruned_loss=0.03598, over 1606284.45 frames. ], batch size: 111, lr: 3.34e-03, grad_scale: 32.0 +2023-05-10 18:39:28,404 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer1.prob, batch_count=582020.0, ans=0.125 +2023-05-10 18:39:46,362 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.out_combiner.scale_min, batch_count=582070.0, ans=0.2 +2023-05-10 18:39:49,436 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer2.prob, batch_count=582070.0, ans=0.125 +2023-05-10 18:40:06,231 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3197, 4.5675, 2.1594, 2.5476], device='cuda:1') +2023-05-10 18:40:30,361 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.skip_rate, batch_count=582220.0, ans=0.04949747468305833 +2023-05-10 18:40:38,307 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.9514, 5.1861, 5.2640, 5.8213], device='cuda:1') +2023-05-10 18:40:45,126 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=582270.0, ans=0.0 +2023-05-10 18:40:46,132 INFO [train.py:1021] (1/2) Epoch 33, batch 100, loss[loss=0.1912, simple_loss=0.276, pruned_loss=0.05317, over 23305.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.2566, pruned_loss=0.036, over 2842758.69 frames. ], batch size: 233, lr: 3.34e-03, grad_scale: 32.0 +2023-05-10 18:41:38,525 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.556e+02 3.167e+02 4.003e+02 5.667e+02 8.688e+02, threshold=8.006e+02, percent-clipped=3.0 +2023-05-10 18:41:46,642 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=582470.0, ans=0.0 +2023-05-10 18:41:52,595 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.const_attention_rate, batch_count=582470.0, ans=0.025 +2023-05-10 18:41:58,619 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.0.self_attn_weights, attn_weights_entropy = tensor([6.3271, 5.6570, 5.5151, 6.0758], device='cuda:1') +2023-05-10 18:42:02,893 INFO [train.py:1021] (1/2) Epoch 33, batch 150, loss[loss=0.1718, simple_loss=0.2711, pruned_loss=0.03622, over 37132.00 frames. ], tot_loss[loss=0.1635, simple_loss=0.2559, pruned_loss=0.03554, over 3815809.66 frames. ], batch size: 107, lr: 3.34e-03, grad_scale: 32.0 +2023-05-10 18:42:18,548 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0 from training. Duration: 24.525 +2023-05-10 18:42:41,131 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_skip_rate, batch_count=582620.0, ans=0.0 +2023-05-10 18:42:52,014 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.attention_skip_rate, batch_count=582670.0, ans=0.0 +2023-05-10 18:42:56,165 WARNING [train.py:1182] (1/2) Exclude cut with ID 3699-47246-0007-3408-0_sp0.9 from training. Duration: 20.26675 +2023-05-10 18:43:11,244 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp0.9 from training. Duration: 27.25 +2023-05-10 18:43:20,907 INFO [train.py:1021] (1/2) Epoch 33, batch 200, loss[loss=0.1511, simple_loss=0.2347, pruned_loss=0.03373, over 36819.00 frames. ], tot_loss[loss=0.1618, simple_loss=0.2538, pruned_loss=0.03493, over 4565476.07 frames. ], batch size: 84, lr: 3.34e-03, grad_scale: 32.0 +2023-05-10 18:43:21,278 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.prob, batch_count=582770.0, ans=0.125 +2023-05-10 18:43:44,619 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=582820.0, ans=0.0 +2023-05-10 18:43:47,447 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff3_skip_rate, batch_count=582820.0, ans=0.0 +2023-05-10 18:43:58,039 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer1.prob, batch_count=582870.0, ans=0.125 +2023-05-10 18:43:59,679 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer1.prob, batch_count=582870.0, ans=0.125 +2023-05-10 18:44:01,138 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.attention_skip_rate, batch_count=582870.0, ans=0.0 +2023-05-10 18:44:13,361 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.396e+02 2.912e+02 3.295e+02 3.978e+02 5.997e+02, threshold=6.589e+02, percent-clipped=0.0 +2023-05-10 18:44:30,932 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64292-0017-15984-0 from training. Duration: 21.68 +2023-05-10 18:44:38,567 INFO [train.py:1021] (1/2) Epoch 33, batch 250, loss[loss=0.1651, simple_loss=0.2602, pruned_loss=0.035, over 36940.00 frames. ], tot_loss[loss=0.1622, simple_loss=0.2543, pruned_loss=0.03506, over 5148554.92 frames. ], batch size: 108, lr: 3.34e-03, grad_scale: 32.0 +2023-05-10 18:44:44,758 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0007-59342-0 from training. Duration: 21.6300625 +2023-05-10 18:44:57,884 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=2.18 vs. limit=6.0 +2023-05-10 18:45:07,957 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0007-59342-0_sp0.9 from training. Duration: 24.033375 +2023-05-10 18:45:29,953 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=583170.0, ans=0.1 +2023-05-10 18:45:41,365 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=4.08 vs. limit=10.0 +2023-05-10 18:45:45,431 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.scale_min, batch_count=583220.0, ans=0.2 +2023-05-10 18:45:53,134 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=583220.0, ans=0.1 +2023-05-10 18:45:55,631 INFO [train.py:1021] (1/2) Epoch 33, batch 300, loss[loss=0.1729, simple_loss=0.2703, pruned_loss=0.03772, over 36329.00 frames. ], tot_loss[loss=0.1621, simple_loss=0.2544, pruned_loss=0.03489, over 5586556.50 frames. ], batch size: 126, lr: 3.34e-03, grad_scale: 32.0 +2023-05-10 18:46:11,669 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0 from training. Duration: 22.905 +2023-05-10 18:46:13,158 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0_sp1.1 from training. Duration: 23.4318125 +2023-05-10 18:46:16,637 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=583320.0, ans=0.0 +2023-05-10 18:46:18,146 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.const_attention_rate, batch_count=583320.0, ans=0.025 +2023-05-10 18:46:48,675 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.385e+02 2.986e+02 3.374e+02 4.018e+02 6.989e+02, threshold=6.748e+02, percent-clipped=3.0 +2023-05-10 18:47:07,772 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=583470.0, ans=0.0 +2023-05-10 18:47:07,806 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.prob, batch_count=583470.0, ans=0.125 +2023-05-10 18:47:11,471 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=6.19 vs. limit=15.0 +2023-05-10 18:47:13,559 INFO [train.py:1021] (1/2) Epoch 33, batch 350, loss[loss=0.1495, simple_loss=0.2356, pruned_loss=0.03167, over 37110.00 frames. ], tot_loss[loss=0.1622, simple_loss=0.2543, pruned_loss=0.03499, over 5954979.67 frames. ], batch size: 88, lr: 3.34e-03, grad_scale: 32.0 +2023-05-10 18:48:17,462 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer1.prob, batch_count=583720.0, ans=0.125 +2023-05-10 18:48:18,967 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer1.prob, batch_count=583720.0, ans=0.125 +2023-05-10 18:48:20,160 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0_sp1.1 from training. Duration: 20.82275 +2023-05-10 18:48:21,719 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0_sp0.9 from training. Duration: 25.45 +2023-05-10 18:48:30,754 INFO [train.py:1021] (1/2) Epoch 33, batch 400, loss[loss=0.1537, simple_loss=0.245, pruned_loss=0.03119, over 37165.00 frames. ], tot_loss[loss=0.1621, simple_loss=0.2546, pruned_loss=0.03484, over 6252858.41 frames. ], batch size: 93, lr: 3.34e-03, grad_scale: 32.0 +2023-05-10 18:48:43,254 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=583770.0, ans=0.1 +2023-05-10 18:49:07,788 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([4.4044, 3.7592, 3.9689, 3.8100], device='cuda:1') +2023-05-10 18:49:21,502 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0 from training. Duration: 25.775 +2023-05-10 18:49:25,282 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=5.35 vs. limit=15.0 +2023-05-10 18:49:25,901 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.504e+02 3.010e+02 3.399e+02 3.982e+02 7.209e+02, threshold=6.797e+02, percent-clipped=3.0 +2023-05-10 18:49:31,383 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn1.whiten, num_groups=1, num_channels=192, metric=11.28 vs. limit=22.5 +2023-05-10 18:49:46,346 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0071-62375-0_sp0.9 from training. Duration: 22.25 +2023-05-10 18:49:47,825 INFO [train.py:1021] (1/2) Epoch 33, batch 450, loss[loss=0.1573, simple_loss=0.2448, pruned_loss=0.03494, over 37174.00 frames. ], tot_loss[loss=0.1628, simple_loss=0.2553, pruned_loss=0.03516, over 6462757.78 frames. ], batch size: 93, lr: 3.34e-03, grad_scale: 16.0 +2023-05-10 18:49:57,155 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=584020.0, ans=0.1 +2023-05-10 18:50:00,134 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer2.prob, batch_count=584020.0, ans=0.125 +2023-05-10 18:50:14,178 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=584070.0, ans=0.125 +2023-05-10 18:50:15,385 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0 from training. Duration: 26.205 +2023-05-10 18:50:33,460 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0_sp0.9 from training. Duration: 30.1555625 +2023-05-10 18:50:33,806 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.0448, 4.4279, 3.1312, 3.2002], device='cuda:1') +2023-05-10 18:50:37,895 WARNING [train.py:1182] (1/2) Exclude cut with ID 1265-135635-0050-6781-0_sp0.9 from training. Duration: 21.8333125 +2023-05-10 18:50:46,015 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=584170.0, ans=0.1 +2023-05-10 18:50:48,759 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0_sp1.1 from training. Duration: 20.6545625 +2023-05-10 18:51:04,429 INFO [train.py:1021] (1/2) Epoch 33, batch 500, loss[loss=0.1798, simple_loss=0.2761, pruned_loss=0.0417, over 36277.00 frames. ], tot_loss[loss=0.1638, simple_loss=0.2566, pruned_loss=0.03556, over 6641365.61 frames. ], batch size: 126, lr: 3.34e-03, grad_scale: 16.0 +2023-05-10 18:51:11,397 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.44 vs. limit=6.0 +2023-05-10 18:51:34,687 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0045-39920-0_sp0.9 from training. Duration: 20.52225 +2023-05-10 18:51:41,815 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff3_skip_rate, batch_count=584370.0, ans=0.0 +2023-05-10 18:51:56,864 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0_sp0.9 from training. Duration: 29.1166875 +2023-05-10 18:51:58,730 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=584420.0, ans=0.125 +2023-05-10 18:51:58,778 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass_mid.scale_min, batch_count=584420.0, ans=0.2 +2023-05-10 18:52:00,296 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.480e+02 2.859e+02 3.198e+02 3.606e+02 6.398e+02, threshold=6.396e+02, percent-clipped=0.0 +2023-05-10 18:52:08,208 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer2.prob, batch_count=584470.0, ans=0.125 +2023-05-10 18:52:21,409 INFO [train.py:1021] (1/2) Epoch 33, batch 550, loss[loss=0.1623, simple_loss=0.2604, pruned_loss=0.03215, over 36907.00 frames. ], tot_loss[loss=0.1644, simple_loss=0.2574, pruned_loss=0.0357, over 6780745.57 frames. ], batch size: 105, lr: 3.34e-03, grad_scale: 16.0 +2023-05-10 18:52:29,231 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.5776, 4.9056, 5.0549, 4.7194], device='cuda:1') +2023-05-10 18:52:29,352 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=584520.0, ans=0.125 +2023-05-10 18:52:32,646 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([1.9543, 2.9677, 4.4257, 2.8912], device='cuda:1') +2023-05-10 18:52:34,092 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.prob, batch_count=584520.0, ans=0.125 +2023-05-10 18:52:37,084 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.skip_rate, batch_count=584570.0, ans=0.09899494936611666 +2023-05-10 18:52:52,175 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=584620.0, ans=0.0 +2023-05-10 18:52:58,394 WARNING [train.py:1182] (1/2) Exclude cut with ID 543-133211-0007-59831-0_sp0.9 from training. Duration: 21.388875 +2023-05-10 18:52:58,628 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.const_attention_rate, batch_count=584620.0, ans=0.025 +2023-05-10 18:53:00,225 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer2.prob, batch_count=584620.0, ans=0.125 +2023-05-10 18:53:19,593 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.1033, 4.0089, 3.7364, 4.0313, 3.4100, 3.0580, 3.4687, 3.0283], + device='cuda:1') +2023-05-10 18:53:37,844 INFO [train.py:1021] (1/2) Epoch 33, batch 600, loss[loss=0.1473, simple_loss=0.2351, pruned_loss=0.02975, over 36811.00 frames. ], tot_loss[loss=0.1642, simple_loss=0.2572, pruned_loss=0.03565, over 6891597.58 frames. ], batch size: 89, lr: 3.34e-03, grad_scale: 16.0 +2023-05-10 18:53:37,951 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0 from training. Duration: 22.72 +2023-05-10 18:53:38,017 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0031-94921-0_sp0.9 from training. Duration: 22.7444375 +2023-05-10 18:54:08,761 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 18:54:10,610 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=1.86 vs. limit=6.0 +2023-05-10 18:54:14,704 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.skip_rate, batch_count=584870.0, ans=0.09899494936611666 +2023-05-10 18:54:16,156 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.prob, batch_count=584870.0, ans=0.125 +2023-05-10 18:54:21,045 WARNING [train.py:1182] (1/2) Exclude cut with ID 4133-6541-0027-40495-0_sp1.1 from training. Duration: 0.9681875 +2023-05-10 18:54:24,004 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62851-0022-91297-0_sp0.9 from training. Duration: 22.3166875 +2023-05-10 18:54:31,287 WARNING [train.py:1182] (1/2) Exclude cut with ID 543-133212-0015-59917-0_sp0.9 from training. Duration: 21.8166875 +2023-05-10 18:54:32,668 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.596e+02 3.201e+02 3.589e+02 4.184e+02 7.009e+02, threshold=7.178e+02, percent-clipped=1.0 +2023-05-10 18:54:38,034 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=6.50 vs. limit=15.0 +2023-05-10 18:54:47,496 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=584970.0, ans=0.1 +2023-05-10 18:54:54,836 INFO [train.py:1021] (1/2) Epoch 33, batch 650, loss[loss=0.1766, simple_loss=0.2728, pruned_loss=0.04026, over 32364.00 frames. ], tot_loss[loss=0.1649, simple_loss=0.2576, pruned_loss=0.03606, over 6930973.78 frames. ], batch size: 170, lr: 3.34e-03, grad_scale: 16.0 +2023-05-10 18:55:34,714 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff3_skip_rate, batch_count=585120.0, ans=0.0 +2023-05-10 18:55:52,276 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.0178, 2.4449, 3.3885, 2.6200], device='cuda:1') +2023-05-10 18:56:12,004 INFO [train.py:1021] (1/2) Epoch 33, batch 700, loss[loss=0.1437, simple_loss=0.2294, pruned_loss=0.02902, over 36792.00 frames. ], tot_loss[loss=0.1652, simple_loss=0.258, pruned_loss=0.03614, over 6985604.98 frames. ], batch size: 89, lr: 3.34e-03, grad_scale: 16.0 +2023-05-10 18:56:18,187 WARNING [train.py:1182] (1/2) Exclude cut with ID 4957-30119-0041-23990-0_sp0.9 from training. Duration: 20.22775 +2023-05-10 18:56:42,137 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=8.21 vs. limit=15.0 +2023-05-10 18:56:44,589 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer_ff2.min_abs, batch_count=585370.0, ans=0.1 +2023-05-10 18:56:44,629 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.scale_min, batch_count=585370.0, ans=0.2 +2023-05-10 18:56:53,541 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer2.prob, batch_count=585370.0, ans=0.125 +2023-05-10 18:57:04,191 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0_sp1.1 from training. Duration: 24.67275 +2023-05-10 18:57:07,055 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.387e+02 3.090e+02 3.505e+02 4.266e+02 8.424e+02, threshold=7.010e+02, percent-clipped=2.0 +2023-05-10 18:57:08,886 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.ff2_skip_rate, batch_count=585420.0, ans=0.0 +2023-05-10 18:57:13,486 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff2_skip_rate, batch_count=585470.0, ans=0.0 +2023-05-10 18:57:16,391 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.const_attention_rate, batch_count=585470.0, ans=0.025 +2023-05-10 18:57:16,552 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=585470.0, ans=0.125 +2023-05-10 18:57:28,083 INFO [train.py:1021] (1/2) Epoch 33, batch 750, loss[loss=0.1514, simple_loss=0.2449, pruned_loss=0.02895, over 37030.00 frames. ], tot_loss[loss=0.1653, simple_loss=0.2585, pruned_loss=0.03608, over 7039190.89 frames. ], batch size: 99, lr: 3.33e-03, grad_scale: 16.0 +2023-05-10 18:57:34,902 WARNING [train.py:1182] (1/2) Exclude cut with ID 3082-165428-0081-50734-0_sp0.9 from training. Duration: 21.8055625 +2023-05-10 18:58:05,765 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer2.min_positive, batch_count=585620.0, ans=0.05 +2023-05-10 18:58:07,482 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass_mid.scale_min, batch_count=585620.0, ans=0.2 +2023-05-10 18:58:09,637 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.whiten, num_groups=1, num_channels=192, metric=4.48 vs. limit=12.0 +2023-05-10 18:58:14,518 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0054-76830-0_sp0.9 from training. Duration: 22.6666875 +2023-05-10 18:58:36,393 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.const_attention_rate, batch_count=585720.0, ans=0.025 +2023-05-10 18:58:39,396 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer2.prob, batch_count=585720.0, ans=0.125 +2023-05-10 18:58:45,105 INFO [train.py:1021] (1/2) Epoch 33, batch 800, loss[loss=0.1638, simple_loss=0.2604, pruned_loss=0.03355, over 37006.00 frames. ], tot_loss[loss=0.1649, simple_loss=0.2581, pruned_loss=0.03583, over 7094502.79 frames. ], batch size: 104, lr: 3.33e-03, grad_scale: 32.0 +2023-05-10 18:58:46,967 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=585770.0, ans=0.125 +2023-05-10 18:58:51,966 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=585770.0, ans=0.125 +2023-05-10 18:58:52,049 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer2.prob, batch_count=585770.0, ans=0.125 +2023-05-10 18:59:19,117 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=585870.0, ans=0.125 +2023-05-10 18:59:21,891 WARNING [train.py:1182] (1/2) Exclude cut with ID 2411-132532-0017-82279-0_sp1.1 from training. Duration: 0.9681875 +2023-05-10 18:59:36,487 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.attention_skip_rate, batch_count=585920.0, ans=0.0 +2023-05-10 18:59:40,566 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.563e+02 2.993e+02 3.321e+02 3.995e+02 6.485e+02, threshold=6.641e+02, percent-clipped=0.0 +2023-05-10 18:59:47,429 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.const_attention_rate, batch_count=585970.0, ans=0.025 +2023-05-10 18:59:50,212 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0 from training. Duration: 22.485 +2023-05-10 19:00:02,366 INFO [train.py:1021] (1/2) Epoch 33, batch 850, loss[loss=0.1689, simple_loss=0.266, pruned_loss=0.03593, over 37096.00 frames. ], tot_loss[loss=0.165, simple_loss=0.2583, pruned_loss=0.03583, over 7127097.99 frames. ], batch size: 103, lr: 3.33e-03, grad_scale: 32.0 +2023-05-10 19:00:02,846 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer2.prob, batch_count=586020.0, ans=0.125 +2023-05-10 19:00:04,350 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([1.9327, 3.3021, 4.5450, 3.0388], device='cuda:1') +2023-05-10 19:00:24,393 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=4.25 vs. limit=15.0 +2023-05-10 19:00:29,902 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0_sp1.1 from training. Duration: 23.82275 +2023-05-10 19:00:34,728 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.5.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 19:00:42,295 WARNING [train.py:1182] (1/2) Exclude cut with ID 4860-13185-0032-76709-0 from training. Duration: 20.77 +2023-05-10 19:00:49,773 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64292-0017-15984-0_sp0.9 from training. Duration: 24.088875 +2023-05-10 19:00:51,721 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=586170.0, ans=0.125 +2023-05-10 19:00:51,735 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=586170.0, ans=0.1 +2023-05-10 19:00:53,236 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.const_attention_rate, batch_count=586170.0, ans=0.025 +2023-05-10 19:01:19,111 INFO [train.py:1021] (1/2) Epoch 33, batch 900, loss[loss=0.1467, simple_loss=0.2323, pruned_loss=0.03052, over 36933.00 frames. ], tot_loss[loss=0.1656, simple_loss=0.259, pruned_loss=0.0361, over 7133804.12 frames. ], batch size: 86, lr: 3.33e-03, grad_scale: 32.0 +2023-05-10 19:01:20,760 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0_sp1.1 from training. Duration: 20.4409375 +2023-05-10 19:01:24,009 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer1.prob, batch_count=586270.0, ans=0.125 +2023-05-10 19:01:44,935 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn1.whiten, num_groups=1, num_channels=192, metric=11.89 vs. limit=22.5 +2023-05-10 19:01:45,620 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=586320.0, ans=0.125 +2023-05-10 19:02:03,199 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=192, metric=5.41 vs. limit=15.0 +2023-05-10 19:02:14,743 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.553e+02 3.095e+02 3.529e+02 4.266e+02 6.399e+02, threshold=7.058e+02, percent-clipped=0.0 +2023-05-10 19:02:15,253 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.prob, batch_count=586420.0, ans=0.125 +2023-05-10 19:02:36,566 INFO [train.py:1021] (1/2) Epoch 33, batch 950, loss[loss=0.1774, simple_loss=0.2698, pruned_loss=0.04247, over 36828.00 frames. ], tot_loss[loss=0.1647, simple_loss=0.2578, pruned_loss=0.03578, over 7142328.08 frames. ], batch size: 113, lr: 3.33e-03, grad_scale: 32.0 +2023-05-10 19:02:44,369 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0062-62366-0_sp0.9 from training. Duration: 22.511125 +2023-05-10 19:02:44,695 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.8083, 3.5262, 3.3179, 4.1997, 2.7102, 3.6064, 4.2456, 3.6877], + device='cuda:1') +2023-05-10 19:02:45,838 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0031-39906-0 from training. Duration: 20.675 +2023-05-10 19:02:49,016 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=586520.0, ans=0.0 +2023-05-10 19:02:55,666 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=5.16 vs. limit=15.0 +2023-05-10 19:03:08,678 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=2.75 vs. limit=6.0 +2023-05-10 19:03:15,574 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass_mid.scale_min, batch_count=586620.0, ans=0.2 +2023-05-10 19:03:21,544 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.out_combiner.scale_min, batch_count=586670.0, ans=0.2 +2023-05-10 19:03:34,652 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=5.19 vs. limit=15.0 +2023-05-10 19:03:38,790 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=586720.0, ans=0.125 +2023-05-10 19:03:53,449 INFO [train.py:1021] (1/2) Epoch 33, batch 1000, loss[loss=0.1682, simple_loss=0.2678, pruned_loss=0.03428, over 37098.00 frames. ], tot_loss[loss=0.1645, simple_loss=0.2575, pruned_loss=0.03578, over 7144241.02 frames. ], batch size: 107, lr: 3.33e-03, grad_scale: 32.0 +2023-05-10 19:03:59,759 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.6181, 5.4416, 4.7437, 5.2129], device='cuda:1') +2023-05-10 19:04:07,948 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=586820.0, ans=0.125 +2023-05-10 19:04:16,187 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer2.prob, batch_count=586820.0, ans=0.125 +2023-05-10 19:04:29,402 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0_sp0.9 from training. Duration: 24.9833125 +2023-05-10 19:04:50,437 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.390e+02 3.033e+02 3.593e+02 4.684e+02 8.652e+02, threshold=7.186e+02, percent-clipped=4.0 +2023-05-10 19:05:03,436 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0 from training. Duration: 27.14 +2023-05-10 19:05:10,344 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff2_skip_rate, batch_count=587020.0, ans=0.0 +2023-05-10 19:05:11,414 INFO [train.py:1021] (1/2) Epoch 33, batch 1050, loss[loss=0.1622, simple_loss=0.2556, pruned_loss=0.03441, over 37110.00 frames. ], tot_loss[loss=0.1647, simple_loss=0.2578, pruned_loss=0.03576, over 7158668.67 frames. ], batch size: 98, lr: 3.33e-03, grad_scale: 16.0 +2023-05-10 19:05:20,598 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0 from training. Duration: 22.44 +2023-05-10 19:05:26,845 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer1.prob, batch_count=587070.0, ans=0.125 +2023-05-10 19:05:29,903 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff2_skip_rate, batch_count=587070.0, ans=0.0 +2023-05-10 19:05:45,179 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.prob, batch_count=587120.0, ans=0.125 +2023-05-10 19:05:48,060 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward3.hidden_balancer.prob, batch_count=587120.0, ans=0.125 +2023-05-10 19:06:27,994 INFO [train.py:1021] (1/2) Epoch 33, batch 1100, loss[loss=0.1745, simple_loss=0.2752, pruned_loss=0.03685, over 34815.00 frames. ], tot_loss[loss=0.1649, simple_loss=0.258, pruned_loss=0.03586, over 7152868.32 frames. ], batch size: 145, lr: 3.33e-03, grad_scale: 16.0 +2023-05-10 19:06:43,296 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0060-62364-0_sp0.9 from training. Duration: 21.361125 +2023-05-10 19:06:49,299 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0_sp1.1 from training. Duration: 27.0318125 +2023-05-10 19:06:53,022 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=587320.0, ans=0.1 +2023-05-10 19:07:00,781 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0_sp0.9 from training. Duration: 28.638875 +2023-05-10 19:07:14,591 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=587420.0, ans=0.0 +2023-05-10 19:07:15,813 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0054-76830-0 from training. Duration: 20.4 +2023-05-10 19:07:15,981 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.hidden_balancer.prob, batch_count=587420.0, ans=0.125 +2023-05-10 19:07:24,597 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.558e+02 3.284e+02 4.013e+02 4.961e+02 8.405e+02, threshold=8.026e+02, percent-clipped=8.0 +2023-05-10 19:07:25,473 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=256, metric=16.04 vs. limit=22.5 +2023-05-10 19:07:44,197 INFO [train.py:1021] (1/2) Epoch 33, batch 1150, loss[loss=0.1829, simple_loss=0.2763, pruned_loss=0.04474, over 36798.00 frames. ], tot_loss[loss=0.1645, simple_loss=0.2573, pruned_loss=0.03585, over 7160009.96 frames. ], batch size: 122, lr: 3.33e-03, grad_scale: 16.0 +2023-05-10 19:07:50,797 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0071-62375-0 from training. Duration: 20.025 +2023-05-10 19:07:50,811 WARNING [train.py:1182] (1/2) Exclude cut with ID 2364-131735-0112-64612-0_sp0.9 from training. Duration: 20.488875 +2023-05-10 19:07:58,777 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0 from training. Duration: 29.735 +2023-05-10 19:08:26,169 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff2_skip_rate, batch_count=587620.0, ans=0.0 +2023-05-10 19:08:29,052 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=587670.0, ans=0.0 +2023-05-10 19:08:32,096 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward3.hidden_balancer.prob, batch_count=587670.0, ans=0.125 +2023-05-10 19:08:32,220 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=587670.0, ans=0.125 +2023-05-10 19:08:46,636 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.3705, 5.1749, 4.5231, 4.8864], device='cuda:1') +2023-05-10 19:08:47,277 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.90 vs. limit=6.0 +2023-05-10 19:08:48,162 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff2_skip_rate, batch_count=587720.0, ans=0.0 +2023-05-10 19:09:01,329 INFO [train.py:1021] (1/2) Epoch 33, batch 1200, loss[loss=0.1768, simple_loss=0.2762, pruned_loss=0.03867, over 36323.00 frames. ], tot_loss[loss=0.1646, simple_loss=0.2575, pruned_loss=0.03586, over 7160867.57 frames. ], batch size: 126, lr: 3.33e-03, grad_scale: 32.0 +2023-05-10 19:09:22,377 WARNING [train.py:1182] (1/2) Exclude cut with ID 7276-92427-0014-12983-0_sp0.9 from training. Duration: 21.3055625 +2023-05-10 19:09:22,450 WARNING [train.py:1182] (1/2) Exclude cut with ID 1025-75365-0008-79168-0_sp0.9 from training. Duration: 22.0666875 +2023-05-10 19:09:27,440 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=4.72 vs. limit=10.0 +2023-05-10 19:09:45,070 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=587870.0, ans=0.125 +2023-05-10 19:09:48,563 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=6.31 vs. limit=15.0 +2023-05-10 19:09:59,791 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.376e+02 3.026e+02 3.661e+02 4.353e+02 7.377e+02, threshold=7.322e+02, percent-clipped=0.0 +2023-05-10 19:10:02,444 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn2.whiten, num_groups=1, num_channels=192, metric=10.62 vs. limit=22.5 +2023-05-10 19:10:18,149 INFO [train.py:1021] (1/2) Epoch 33, batch 1250, loss[loss=0.1501, simple_loss=0.2331, pruned_loss=0.03348, over 36937.00 frames. ], tot_loss[loss=0.1655, simple_loss=0.2585, pruned_loss=0.0362, over 7140901.80 frames. ], batch size: 86, lr: 3.33e-03, grad_scale: 16.0 +2023-05-10 19:10:35,876 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=588070.0, ans=0.1 +2023-05-10 19:11:01,038 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass_mid.scale_min, batch_count=588120.0, ans=0.2 +2023-05-10 19:11:07,057 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=588170.0, ans=0.125 +2023-05-10 19:11:15,986 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.nonlin_attention.balancer.prob, batch_count=588170.0, ans=0.125 +2023-05-10 19:11:23,248 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0062-62366-0 from training. Duration: 20.26 +2023-05-10 19:11:36,014 INFO [train.py:1021] (1/2) Epoch 33, batch 1300, loss[loss=0.1835, simple_loss=0.2805, pruned_loss=0.04321, over 35968.00 frames. ], tot_loss[loss=0.1649, simple_loss=0.2579, pruned_loss=0.03601, over 7169308.50 frames. ], batch size: 133, lr: 3.33e-03, grad_scale: 16.0 +2023-05-10 19:11:38,188 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0030-9324-0_sp0.9 from training. Duration: 21.3444375 +2023-05-10 19:11:50,456 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.hidden_balancer.prob, batch_count=588320.0, ans=0.125 +2023-05-10 19:12:23,073 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.nonlin_attention.balancer.prob, batch_count=588420.0, ans=0.125 +2023-05-10 19:12:34,363 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.348e+02 2.991e+02 3.418e+02 4.069e+02 7.058e+02, threshold=6.835e+02, percent-clipped=0.0 +2023-05-10 19:12:37,496 WARNING [train.py:1182] (1/2) Exclude cut with ID 497-129325-0061-62254-0_sp1.1 from training. Duration: 0.97725 +2023-05-10 19:12:47,151 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=588470.0, ans=0.125 +2023-05-10 19:12:52,736 INFO [train.py:1021] (1/2) Epoch 33, batch 1350, loss[loss=0.1475, simple_loss=0.2297, pruned_loss=0.03261, over 34994.00 frames. ], tot_loss[loss=0.1649, simple_loss=0.258, pruned_loss=0.03592, over 7162102.53 frames. ], batch size: 77, lr: 3.33e-03, grad_scale: 16.0 +2023-05-10 19:13:05,129 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer1.max_abs, batch_count=588520.0, ans=10.0 +2023-05-10 19:13:20,891 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0031-39906-0_sp0.9 from training. Duration: 22.97225 +2023-05-10 19:13:50,245 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=588670.0, ans=0.1 +2023-05-10 19:13:54,550 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0047-39922-0_sp0.9 from training. Duration: 21.97775 +2023-05-10 19:14:02,235 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.balancer2.prob, batch_count=588720.0, ans=0.125 +2023-05-10 19:14:09,499 INFO [train.py:1021] (1/2) Epoch 33, batch 1400, loss[loss=0.1484, simple_loss=0.2429, pruned_loss=0.02698, over 36867.00 frames. ], tot_loss[loss=0.165, simple_loss=0.2583, pruned_loss=0.03589, over 7174920.81 frames. ], batch size: 96, lr: 3.33e-03, grad_scale: 16.0 +2023-05-10 19:14:09,599 WARNING [train.py:1182] (1/2) Exclude cut with ID 1112-1043-0006-89194-0_sp0.9 from training. Duration: 21.8333125 +2023-05-10 19:14:21,248 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=8.17 vs. limit=22.5 +2023-05-10 19:14:22,106 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0031-94921-0 from training. Duration: 20.47 +2023-05-10 19:14:29,006 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.0.self_attn_weights, attn_weights_entropy = tensor([6.0244, 5.3400, 5.1581, 5.8029], device='cuda:1') +2023-05-10 19:15:08,211 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.456e+02 3.148e+02 3.781e+02 4.359e+02 7.014e+02, threshold=7.562e+02, percent-clipped=1.0 +2023-05-10 19:15:13,031 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=588970.0, ans=0.1 +2023-05-10 19:15:13,568 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=4.47 vs. limit=15.0 +2023-05-10 19:15:16,230 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer1.min_positive, batch_count=588970.0, ans=0.025 +2023-05-10 19:15:27,023 INFO [train.py:1021] (1/2) Epoch 33, batch 1450, loss[loss=0.1488, simple_loss=0.2364, pruned_loss=0.03065, over 37177.00 frames. ], tot_loss[loss=0.1652, simple_loss=0.2583, pruned_loss=0.036, over 7198802.76 frames. ], batch size: 93, lr: 3.32e-03, grad_scale: 16.0 +2023-05-10 19:15:33,047 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0037-39912-0_sp0.9 from training. Duration: 20.67225 +2023-05-10 19:15:36,329 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=589020.0, ans=0.0 +2023-05-10 19:15:47,594 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.conv_module2.whiten, num_groups=1, num_channels=192, metric=5.14 vs. limit=15.0 +2023-05-10 19:15:52,484 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0_sp0.9 from training. Duration: 25.2444375 +2023-05-10 19:15:56,139 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=4.20 vs. limit=15.0 +2023-05-10 19:15:56,352 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=11.66 vs. limit=15.0 +2023-05-10 19:16:17,479 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0021-76797-0_sp0.9 from training. Duration: 21.1445 +2023-05-10 19:16:17,641 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward3.hidden_balancer.prob, batch_count=589170.0, ans=0.125 +2023-05-10 19:16:27,416 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 19:16:39,540 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer1.prob, batch_count=589220.0, ans=0.125 +2023-05-10 19:16:43,443 INFO [train.py:1021] (1/2) Epoch 33, batch 1500, loss[loss=0.1492, simple_loss=0.2364, pruned_loss=0.03103, over 36815.00 frames. ], tot_loss[loss=0.1639, simple_loss=0.2567, pruned_loss=0.03556, over 7192220.23 frames. ], batch size: 89, lr: 3.32e-03, grad_scale: 16.0 +2023-05-10 19:16:50,571 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=256, metric=11.17 vs. limit=22.5 +2023-05-10 19:17:20,079 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.0022, 4.0712, 3.7452, 4.0765, 3.3884, 3.0351, 3.4862, 2.9764], + device='cuda:1') +2023-05-10 19:17:36,557 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0_sp0.9 from training. Duration: 33.038875 +2023-05-10 19:17:42,303 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.345e+02 3.056e+02 3.378e+02 4.254e+02 7.108e+02, threshold=6.757e+02, percent-clipped=0.0 +2023-05-10 19:17:45,777 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.6204, 3.5049, 3.3496, 4.1431, 2.3228, 3.6060, 4.1977, 3.6711], + device='cuda:1') +2023-05-10 19:17:58,359 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass.skip_rate, batch_count=589470.0, ans=0.07 +2023-05-10 19:17:58,377 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=589470.0, ans=0.1 +2023-05-10 19:18:01,057 INFO [train.py:1021] (1/2) Epoch 33, batch 1550, loss[loss=0.1735, simple_loss=0.2735, pruned_loss=0.03672, over 36357.00 frames. ], tot_loss[loss=0.1638, simple_loss=0.2566, pruned_loss=0.03546, over 7207390.28 frames. ], batch size: 126, lr: 3.32e-03, grad_scale: 16.0 +2023-05-10 19:18:15,738 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=2.98 vs. limit=10.0 +2023-05-10 19:18:16,709 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64291-0000-16059-0_sp0.9 from training. Duration: 20.0944375 +2023-05-10 19:18:32,008 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0_sp1.1 from training. Duration: 20.4 +2023-05-10 19:18:33,720 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff2_skip_rate, batch_count=589620.0, ans=0.0 +2023-05-10 19:18:39,616 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62851-0022-91297-0 from training. Duration: 20.085 +2023-05-10 19:18:50,733 WARNING [train.py:1182] (1/2) Exclude cut with ID 4860-13185-0032-76709-0_sp0.9 from training. Duration: 23.07775 +2023-05-10 19:18:53,857 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer1.prob, batch_count=589670.0, ans=0.125 +2023-05-10 19:18:55,349 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=589670.0, ans=0.125 +2023-05-10 19:19:08,590 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.conv_module2.whiten, num_groups=1, num_channels=192, metric=6.77 vs. limit=15.0 +2023-05-10 19:19:18,211 INFO [train.py:1021] (1/2) Epoch 33, batch 1600, loss[loss=0.1536, simple_loss=0.2397, pruned_loss=0.03382, over 37039.00 frames. ], tot_loss[loss=0.1636, simple_loss=0.2565, pruned_loss=0.03535, over 7214680.10 frames. ], batch size: 94, lr: 3.32e-03, grad_scale: 32.0 +2023-05-10 19:19:38,941 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=2.00 vs. limit=6.0 +2023-05-10 19:19:39,442 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0_sp0.9 from training. Duration: 24.9333125 +2023-05-10 19:19:42,608 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.skip_rate, batch_count=589820.0, ans=0.04949747468305833 +2023-05-10 19:20:16,494 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.468e+02 2.997e+02 3.198e+02 3.738e+02 5.852e+02, threshold=6.396e+02, percent-clipped=0.0 +2023-05-10 19:20:25,667 WARNING [train.py:1182] (1/2) Exclude cut with ID 5118-111612-0016-124680-0_sp0.9 from training. Duration: 20.388875 +2023-05-10 19:20:25,884 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=589970.0, ans=0.1 +2023-05-10 19:20:33,179 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0_sp1.1 from training. Duration: 20.3590625 +2023-05-10 19:20:34,663 INFO [train.py:1021] (1/2) Epoch 33, batch 1650, loss[loss=0.1748, simple_loss=0.2688, pruned_loss=0.04038, over 36761.00 frames. ], tot_loss[loss=0.1641, simple_loss=0.2572, pruned_loss=0.03552, over 7210385.11 frames. ], batch size: 122, lr: 3.32e-03, grad_scale: 32.0 +2023-05-10 19:20:52,627 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=4.38 vs. limit=15.0 +2023-05-10 19:21:35,433 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.attention_skip_rate, batch_count=590220.0, ans=0.0 +2023-05-10 19:21:44,200 WARNING [train.py:1182] (1/2) Exclude cut with ID 3557-8342-0013-54691-0_sp1.1 from training. Duration: 0.836375 +2023-05-10 19:21:52,276 INFO [train.py:1021] (1/2) Epoch 33, batch 1700, loss[loss=0.175, simple_loss=0.2701, pruned_loss=0.03994, over 34678.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.2572, pruned_loss=0.03574, over 7201804.26 frames. ], batch size: 145, lr: 3.32e-03, grad_scale: 32.0 +2023-05-10 19:22:01,549 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.skip_rate, batch_count=590270.0, ans=0.07 +2023-05-10 19:22:09,105 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=590320.0, ans=0.125 +2023-05-10 19:22:10,573 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass_mid.scale_min, batch_count=590320.0, ans=0.2 +2023-05-10 19:22:29,118 WARNING [train.py:1182] (1/2) Exclude cut with ID 8565-290391-0049-67394-0_sp0.9 from training. Duration: 21.3166875 +2023-05-10 19:22:34,038 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer2.prob, batch_count=590370.0, ans=0.125 +2023-05-10 19:22:41,555 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer2.prob, batch_count=590420.0, ans=0.125 +2023-05-10 19:22:50,639 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.555e+02 3.131e+02 3.449e+02 3.957e+02 5.455e+02, threshold=6.898e+02, percent-clipped=0.0 +2023-05-10 19:22:50,982 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.0.self_attn_weights, attn_weights_entropy = tensor([6.3163, 5.5952, 5.4740, 6.0740], device='cuda:1') +2023-05-10 19:22:59,786 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0029-104863-0_sp0.9 from training. Duration: 22.1055625 +2023-05-10 19:23:08,714 INFO [train.py:1021] (1/2) Epoch 33, batch 1750, loss[loss=0.1672, simple_loss=0.2645, pruned_loss=0.03493, over 36940.00 frames. ], tot_loss[loss=0.166, simple_loss=0.2584, pruned_loss=0.03678, over 7192373.47 frames. ], batch size: 105, lr: 3.32e-03, grad_scale: 16.0 +2023-05-10 19:23:13,282 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0_sp1.1 from training. Duration: 21.77725 +2023-05-10 19:23:33,562 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0_sp0.9 from training. Duration: 27.8166875 +2023-05-10 19:23:56,767 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0_sp1.1 from training. Duration: 22.5090625 +2023-05-10 19:24:03,123 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=590670.0, ans=0.125 +2023-05-10 19:24:03,530 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.63 vs. limit=6.0 +2023-05-10 19:24:04,257 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0 from training. Duration: 25.035 +2023-05-10 19:24:20,119 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=590720.0, ans=0.125 +2023-05-10 19:24:21,666 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([4.9177, 4.2869, 3.2496, 2.9427], device='cuda:1') +2023-05-10 19:24:22,867 WARNING [train.py:1182] (1/2) Exclude cut with ID 774-127930-0014-10412-0_sp1.1 from training. Duration: 0.95 +2023-05-10 19:24:25,723 INFO [train.py:1021] (1/2) Epoch 33, batch 1800, loss[loss=0.1566, simple_loss=0.2484, pruned_loss=0.03244, over 37008.00 frames. ], tot_loss[loss=0.1673, simple_loss=0.259, pruned_loss=0.03779, over 7186257.53 frames. ], batch size: 99, lr: 3.32e-03, grad_scale: 16.0 +2023-05-10 19:24:30,015 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=8.94 vs. limit=12.0 +2023-05-10 19:24:39,791 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0_sp0.9 from training. Duration: 0.92225 +2023-05-10 19:24:44,647 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer1.max_abs, batch_count=590820.0, ans=10.0 +2023-05-10 19:24:53,749 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer2.prob, batch_count=590820.0, ans=0.125 +2023-05-10 19:25:00,312 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer1.prob, batch_count=590870.0, ans=0.125 +2023-05-10 19:25:05,991 WARNING [train.py:1182] (1/2) Exclude cut with ID 4511-76322-0006-80011-0 from training. Duration: 21.97 +2023-05-10 19:25:06,241 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=590870.0, ans=0.1 +2023-05-10 19:25:26,274 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.644e+02 3.384e+02 3.742e+02 4.374e+02 6.857e+02, threshold=7.484e+02, percent-clipped=0.0 +2023-05-10 19:25:27,904 WARNING [train.py:1182] (1/2) Exclude cut with ID 7492-105653-0055-62765-0_sp0.9 from training. Duration: 21.97225 +2023-05-10 19:25:29,424 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0_sp0.9 from training. Duration: 25.3333125 +2023-05-10 19:25:40,037 WARNING [train.py:1182] (1/2) Exclude cut with ID 5172-29468-0015-19128-0_sp0.9 from training. Duration: 21.5055625 +2023-05-10 19:25:42,995 INFO [train.py:1021] (1/2) Epoch 33, batch 1850, loss[loss=0.1981, simple_loss=0.2825, pruned_loss=0.05686, over 23953.00 frames. ], tot_loss[loss=0.1686, simple_loss=0.2595, pruned_loss=0.03882, over 7145036.47 frames. ], batch size: 234, lr: 3.32e-03, grad_scale: 16.0 +2023-05-10 19:25:49,213 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0_sp1.1 from training. Duration: 20.72725 +2023-05-10 19:25:49,515 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer1.prob, batch_count=591020.0, ans=0.125 +2023-05-10 19:26:15,619 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.balancer2.prob, batch_count=591120.0, ans=0.125 +2023-05-10 19:26:21,833 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0_sp0.9 from training. Duration: 26.32775 +2023-05-10 19:26:25,073 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=591120.0, ans=0.125 +2023-05-10 19:26:48,209 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff3_skip_rate, batch_count=591220.0, ans=0.0 +2023-05-10 19:26:57,153 WARNING [train.py:1182] (1/2) Exclude cut with ID 3867-173237-0077-144769-0 from training. Duration: 20.025 +2023-05-10 19:27:00,031 INFO [train.py:1021] (1/2) Epoch 33, batch 1900, loss[loss=0.1594, simple_loss=0.2405, pruned_loss=0.03911, over 36745.00 frames. ], tot_loss[loss=0.1696, simple_loss=0.2597, pruned_loss=0.0397, over 7153032.46 frames. ], batch size: 89, lr: 3.32e-03, grad_scale: 16.0 +2023-05-10 19:27:03,180 WARNING [train.py:1182] (1/2) Exclude cut with ID 6709-74022-0004-86860-0_sp1.1 from training. Duration: 0.9409375 +2023-05-10 19:27:03,202 WARNING [train.py:1182] (1/2) Exclude cut with ID 4757-1811-0023-62229-0_sp0.9 from training. Duration: 21.37775 +2023-05-10 19:27:18,550 INFO [scaling.py:969] (1/2) Whitening: name=encoder_embed.convnext.out_whiten, num_groups=1, num_channels=128, metric=4.86 vs. limit=5.0 +2023-05-10 19:27:25,091 WARNING [train.py:1182] (1/2) Exclude cut with ID 1250-135782-0004-25974-0_sp0.9 from training. Duration: 21.17225 +2023-05-10 19:27:25,101 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0_sp0.9 from training. Duration: 27.511125 +2023-05-10 19:27:48,737 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=591420.0, ans=0.0 +2023-05-10 19:27:59,229 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.const_attention_rate, batch_count=591420.0, ans=0.025 +2023-05-10 19:28:00,387 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.833e+02 3.487e+02 3.765e+02 4.497e+02 6.996e+02, threshold=7.530e+02, percent-clipped=0.0 +2023-05-10 19:28:02,077 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0 from training. Duration: 22.8 +2023-05-10 19:28:06,635 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0 from training. Duration: 22.585 +2023-05-10 19:28:17,656 INFO [train.py:1021] (1/2) Epoch 33, batch 1950, loss[loss=0.1987, simple_loss=0.2839, pruned_loss=0.05677, over 35900.00 frames. ], tot_loss[loss=0.1698, simple_loss=0.2591, pruned_loss=0.04022, over 7157539.86 frames. ], batch size: 133, lr: 3.32e-03, grad_scale: 16.0 +2023-05-10 19:28:18,436 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=10.63 vs. limit=15.0 +2023-05-10 19:28:25,702 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff2_skip_rate, batch_count=591520.0, ans=0.0 +2023-05-10 19:28:37,956 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0001-146967-0_sp0.9 from training. Duration: 22.0166875 +2023-05-10 19:28:43,175 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=5.79 vs. limit=15.0 +2023-05-10 19:28:55,930 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0_sp1.1 from training. Duration: 24.395375 +2023-05-10 19:28:57,684 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.prob, batch_count=591620.0, ans=0.125 +2023-05-10 19:29:05,280 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0_sp0.9 from training. Duration: 27.47775 +2023-05-10 19:29:07,283 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer2.prob, batch_count=591670.0, ans=0.125 +2023-05-10 19:29:09,961 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0_sp0.9 from training. Duration: 24.8833125 +2023-05-10 19:29:13,036 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0 from training. Duration: 23.39 +2023-05-10 19:29:18,983 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0_sp0.9 from training. Duration: 28.72225 +2023-05-10 19:29:30,037 WARNING [train.py:1182] (1/2) Exclude cut with ID 585-294811-0110-133686-0_sp0.9 from training. Duration: 20.8944375 +2023-05-10 19:29:34,461 INFO [train.py:1021] (1/2) Epoch 33, batch 2000, loss[loss=0.1557, simple_loss=0.2378, pruned_loss=0.03676, over 37074.00 frames. ], tot_loss[loss=0.1698, simple_loss=0.2586, pruned_loss=0.04047, over 7162731.70 frames. ], batch size: 88, lr: 3.32e-03, grad_scale: 32.0 +2023-05-10 19:29:44,991 WARNING [train.py:1182] (1/2) Exclude cut with ID 5796-66357-0007-116447-0_sp0.9 from training. Duration: 23.8444375 +2023-05-10 19:30:05,208 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.skip_rate, batch_count=591870.0, ans=0.035 +2023-05-10 19:30:07,960 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0 from training. Duration: 25.85 +2023-05-10 19:30:09,442 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0023-13010-0 from training. Duration: 21.39 +2023-05-10 19:30:21,991 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0 from training. Duration: 27.92 +2023-05-10 19:30:28,276 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.prob, batch_count=591920.0, ans=0.125 +2023-05-10 19:30:31,275 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 19:30:33,822 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.904e+02 3.459e+02 3.926e+02 4.339e+02 6.224e+02, threshold=7.851e+02, percent-clipped=0.0 +2023-05-10 19:30:47,652 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0039-130165-0_sp0.9 from training. Duration: 20.661125 +2023-05-10 19:30:51,029 INFO [train.py:1021] (1/2) Epoch 33, batch 2050, loss[loss=0.1739, simple_loss=0.266, pruned_loss=0.04092, over 34831.00 frames. ], tot_loss[loss=0.171, simple_loss=0.2598, pruned_loss=0.04114, over 7146025.40 frames. ], batch size: 145, lr: 3.32e-03, grad_scale: 32.0 +2023-05-10 19:31:15,862 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0043-15874-0_sp0.9 from training. Duration: 20.07225 +2023-05-10 19:31:23,238 WARNING [train.py:1182] (1/2) Exclude cut with ID 1085-156170-0017-128270-0 from training. Duration: 21.01 +2023-05-10 19:31:48,175 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.balancer.min_positive, batch_count=592170.0, ans=0.05 +2023-05-10 19:31:51,042 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward3.hidden_balancer.prob, batch_count=592220.0, ans=0.125 +2023-05-10 19:32:01,740 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.skip_rate, batch_count=592220.0, ans=0.035 +2023-05-10 19:32:07,722 INFO [train.py:1021] (1/2) Epoch 33, batch 2100, loss[loss=0.1554, simple_loss=0.2349, pruned_loss=0.03795, over 37075.00 frames. ], tot_loss[loss=0.1709, simple_loss=0.2592, pruned_loss=0.04134, over 7149827.25 frames. ], batch size: 88, lr: 3.32e-03, grad_scale: 32.0 +2023-05-10 19:32:14,601 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=7.88 vs. limit=15.0 +2023-05-10 19:32:20,032 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer_na.min_abs, batch_count=592270.0, ans=0.02 +2023-05-10 19:32:35,186 WARNING [train.py:1182] (1/2) Exclude cut with ID 2195-150901-0045-59933-0 from training. Duration: 20.65 +2023-05-10 19:32:35,472 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 19:32:40,030 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=592370.0, ans=0.1 +2023-05-10 19:32:44,747 WARNING [train.py:1182] (1/2) Exclude cut with ID 5796-66357-0007-116447-0 from training. Duration: 21.46 +2023-05-10 19:33:07,953 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.993e+02 3.623e+02 4.154e+02 4.965e+02 7.322e+02, threshold=8.307e+02, percent-clipped=0.0 +2023-05-10 19:33:24,768 INFO [train.py:1021] (1/2) Epoch 33, batch 2150, loss[loss=0.1759, simple_loss=0.2718, pruned_loss=0.03996, over 36905.00 frames. ], tot_loss[loss=0.1709, simple_loss=0.2588, pruned_loss=0.04153, over 7145779.59 frames. ], batch size: 105, lr: 3.31e-03, grad_scale: 32.0 +2023-05-10 19:33:32,265 WARNING [train.py:1182] (1/2) Exclude cut with ID 3557-8342-0013-54691-0 from training. Duration: 0.92 +2023-05-10 19:33:36,227 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.8066, 3.8700, 4.2577, 3.9253], device='cuda:1') +2023-05-10 19:33:38,797 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0023-13010-0_sp0.9 from training. Duration: 23.7666875 +2023-05-10 19:33:46,583 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.attention_skip_rate, batch_count=592570.0, ans=0.0 +2023-05-10 19:33:55,357 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=8.28 vs. limit=15.0 +2023-05-10 19:34:15,703 WARNING [train.py:1182] (1/2) Exclude cut with ID 8544-281189-0060-101339-0_sp0.9 from training. Duration: 20.861125 +2023-05-10 19:34:24,699 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-65654-0031-41259-0_sp0.9 from training. Duration: 22.711125 +2023-05-10 19:34:26,527 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=592720.0, ans=0.0 +2023-05-10 19:34:28,596 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([1.8193, 3.1212, 4.6020, 3.1146], device='cuda:1') +2023-05-10 19:34:34,489 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.out_combiner.scale_min, batch_count=592720.0, ans=0.2 +2023-05-10 19:34:36,046 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.balancer2.prob, batch_count=592720.0, ans=0.125 +2023-05-10 19:34:41,632 INFO [train.py:1021] (1/2) Epoch 33, batch 2200, loss[loss=0.1835, simple_loss=0.2716, pruned_loss=0.04767, over 36770.00 frames. ], tot_loss[loss=0.1713, simple_loss=0.2588, pruned_loss=0.04191, over 7120549.78 frames. ], batch size: 118, lr: 3.31e-03, grad_scale: 32.0 +2023-05-10 19:34:54,745 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.prob, batch_count=592770.0, ans=0.125 +2023-05-10 19:35:00,647 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer2.min_positive, batch_count=592820.0, ans=0.05 +2023-05-10 19:35:06,662 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer2.prob, batch_count=592820.0, ans=0.125 +2023-05-10 19:35:12,464 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0_sp1.1 from training. Duration: 22.986375 +2023-05-10 19:35:17,232 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.const_attention_rate, batch_count=592870.0, ans=0.025 +2023-05-10 19:35:29,600 WARNING [train.py:1182] (1/2) Exclude cut with ID 8040-260924-0003-80960-0_sp0.9 from training. Duration: 22.07225 +2023-05-10 19:35:35,705 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0045-26330-0_sp0.9 from training. Duration: 20.3055625 +2023-05-10 19:35:38,756 WARNING [train.py:1182] (1/2) Exclude cut with ID 6356-271890-0060-94317-0_sp0.9 from training. Duration: 20.72225 +2023-05-10 19:35:41,684 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.772e+02 3.502e+02 3.860e+02 4.353e+02 7.257e+02, threshold=7.720e+02, percent-clipped=0.0 +2023-05-10 19:35:46,161 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=3.22 vs. limit=12.0 +2023-05-10 19:35:48,768 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer2.prob, batch_count=592970.0, ans=0.125 +2023-05-10 19:35:53,459 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff2_skip_rate, batch_count=592970.0, ans=0.0 +2023-05-10 19:35:57,664 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0_sp1.1 from training. Duration: 22.4818125 +2023-05-10 19:35:59,081 INFO [train.py:1021] (1/2) Epoch 33, batch 2250, loss[loss=0.193, simple_loss=0.2831, pruned_loss=0.05142, over 35819.00 frames. ], tot_loss[loss=0.1707, simple_loss=0.2581, pruned_loss=0.04166, over 7158922.94 frames. ], batch size: 133, lr: 3.31e-03, grad_scale: 32.0 +2023-05-10 19:36:25,136 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0_sp0.9 from training. Duration: 25.0944375 +2023-05-10 19:36:28,290 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0047-104881-0 from training. Duration: 21.515 +2023-05-10 19:36:36,231 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0_sp0.9 from training. Duration: 27.02225 +2023-05-10 19:36:42,222 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0010-62480-0_sp0.9 from training. Duration: 22.22225 +2023-05-10 19:36:49,589 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0085-44554-0_sp0.9 from training. Duration: 20.85 +2023-05-10 19:36:55,925 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass_mid.scale_min, batch_count=593170.0, ans=0.2 +2023-05-10 19:36:57,528 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.prob, batch_count=593170.0, ans=0.125 +2023-05-10 19:37:09,645 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.balancer.max_positive, batch_count=593220.0, ans=0.95 +2023-05-10 19:37:15,842 INFO [train.py:1021] (1/2) Epoch 33, batch 2300, loss[loss=0.1769, simple_loss=0.2652, pruned_loss=0.04427, over 36347.00 frames. ], tot_loss[loss=0.1707, simple_loss=0.2581, pruned_loss=0.04171, over 7141369.50 frames. ], batch size: 126, lr: 3.31e-03, grad_scale: 32.0 +2023-05-10 19:37:22,077 WARNING [train.py:1182] (1/2) Exclude cut with ID 4295-39940-0007-92567-0 from training. Duration: 21.54 +2023-05-10 19:37:27,940 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0_sp1.1 from training. Duration: 20.5318125 +2023-05-10 19:37:37,646 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0012-134311-0_sp0.9 from training. Duration: 21.9333125 +2023-05-10 19:37:48,390 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.skip_rate, batch_count=593370.0, ans=0.04949747468305833 +2023-05-10 19:38:07,011 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer2.prob, batch_count=593420.0, ans=0.125 +2023-05-10 19:38:15,705 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.541e+02 3.425e+02 3.783e+02 4.284e+02 8.244e+02, threshold=7.566e+02, percent-clipped=1.0 +2023-05-10 19:38:22,727 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_skip_rate, batch_count=593470.0, ans=0.0 +2023-05-10 19:38:24,081 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.attention_skip_rate, batch_count=593470.0, ans=0.0 +2023-05-10 19:38:25,612 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer1.prob, batch_count=593470.0, ans=0.125 +2023-05-10 19:38:28,278 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0025-130151-0_sp0.9 from training. Duration: 21.7944375 +2023-05-10 19:38:32,614 INFO [train.py:1021] (1/2) Epoch 33, batch 2350, loss[loss=0.1874, simple_loss=0.2764, pruned_loss=0.04921, over 32114.00 frames. ], tot_loss[loss=0.1704, simple_loss=0.2576, pruned_loss=0.04166, over 7142998.17 frames. ], batch size: 170, lr: 3.31e-03, grad_scale: 32.0 +2023-05-10 19:38:40,192 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0002-12989-0_sp0.9 from training. Duration: 22.4666875 +2023-05-10 19:38:43,421 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=593520.0, ans=0.125 +2023-05-10 19:38:46,289 WARNING [train.py:1182] (1/2) Exclude cut with ID 6121-9014-0076-24124-0 from training. Duration: 21.635 +2023-05-10 19:38:52,250 WARNING [train.py:1182] (1/2) Exclude cut with ID 6121-9014-0076-24124-0_sp0.9 from training. Duration: 24.038875 +2023-05-10 19:38:57,726 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff3_skip_rate, batch_count=593570.0, ans=0.0 +2023-05-10 19:38:59,154 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.1.self_attn_weights, attn_weights_entropy = tensor([6.2081, 5.4164, 5.5334, 6.0880], device='cuda:1') +2023-05-10 19:39:39,282 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0_sp1.1 from training. Duration: 21.786375 +2023-05-10 19:39:50,038 INFO [train.py:1021] (1/2) Epoch 33, batch 2400, loss[loss=0.1809, simple_loss=0.2707, pruned_loss=0.04553, over 36908.00 frames. ], tot_loss[loss=0.1707, simple_loss=0.2579, pruned_loss=0.04174, over 7136085.26 frames. ], batch size: 105, lr: 3.31e-03, grad_scale: 32.0 +2023-05-10 19:39:50,155 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0002-12989-0 from training. Duration: 20.22 +2023-05-10 19:40:08,939 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=593820.0, ans=0.1 +2023-05-10 19:40:14,319 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=593820.0, ans=0.0 +2023-05-10 19:40:36,809 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=2.99 vs. limit=15.0 +2023-05-10 19:40:50,273 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.5.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 19:41:01,709 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=593920.0, ans=0.125 +2023-05-10 19:41:15,745 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.810e+02 3.492e+02 3.954e+02 4.428e+02 7.522e+02, threshold=7.907e+02, percent-clipped=0.0 +2023-05-10 19:41:36,632 INFO [train.py:1021] (1/2) Epoch 33, batch 2450, loss[loss=0.1832, simple_loss=0.2739, pruned_loss=0.04619, over 37038.00 frames. ], tot_loss[loss=0.1708, simple_loss=0.2579, pruned_loss=0.0419, over 7122329.74 frames. ], batch size: 116, lr: 3.31e-03, grad_scale: 32.0 +2023-05-10 19:41:41,692 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.max_abs, batch_count=594020.0, ans=10.0 +2023-05-10 19:41:44,680 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.const_attention_rate, batch_count=594020.0, ans=0.025 +2023-05-10 19:42:14,187 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer1.prob, batch_count=594120.0, ans=0.125 +2023-05-10 19:42:29,646 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0 from training. Duration: 25.285 +2023-05-10 19:42:53,303 INFO [train.py:1021] (1/2) Epoch 33, batch 2500, loss[loss=0.1631, simple_loss=0.2512, pruned_loss=0.03756, over 36846.00 frames. ], tot_loss[loss=0.1706, simple_loss=0.2576, pruned_loss=0.04178, over 7137184.77 frames. ], batch size: 96, lr: 3.31e-03, grad_scale: 32.0 +2023-05-10 19:42:55,174 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer1.prob, batch_count=594270.0, ans=0.125 +2023-05-10 19:43:09,615 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.out_combiner.scale_min, batch_count=594320.0, ans=0.2 +2023-05-10 19:43:14,146 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.5687, 3.6837, 4.0536, 3.6640], device='cuda:1') +2023-05-10 19:43:36,662 WARNING [train.py:1182] (1/2) Exclude cut with ID 811-130148-0001-63453-0_sp0.9 from training. Duration: 20.861125 +2023-05-10 19:43:36,941 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer1.prob, batch_count=594370.0, ans=0.125 +2023-05-10 19:43:41,548 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.7963, 5.0603, 5.3080, 5.0208], device='cuda:1') +2023-05-10 19:43:48,273 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=4.15 vs. limit=10.0 +2023-05-10 19:43:54,828 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.908e+02 3.624e+02 4.185e+02 5.401e+02 8.953e+02, threshold=8.370e+02, percent-clipped=1.0 +2023-05-10 19:43:59,932 WARNING [train.py:1182] (1/2) Exclude cut with ID 6010-56788-0055-90261-0 from training. Duration: 20.88 +2023-05-10 19:44:10,429 INFO [train.py:1021] (1/2) Epoch 33, batch 2550, loss[loss=0.1634, simple_loss=0.2585, pruned_loss=0.03417, over 37106.00 frames. ], tot_loss[loss=0.1704, simple_loss=0.2576, pruned_loss=0.04163, over 7148932.81 frames. ], batch size: 107, lr: 3.31e-03, grad_scale: 32.0 +2023-05-10 19:44:25,130 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.skip_rate, batch_count=594570.0, ans=0.09899494936611666 +2023-05-10 19:44:38,178 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0045-15876-0_sp0.9 from training. Duration: 23.4166875 +2023-05-10 19:44:44,766 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.1490, 4.2865, 4.6731, 4.7192], device='cuda:1') +2023-05-10 19:44:55,235 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=2.99 vs. limit=6.0 +2023-05-10 19:45:18,445 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([1.9126, 2.9454, 4.5889, 3.1887], device='cuda:1') +2023-05-10 19:46:03,220 INFO [train.py:1021] (1/2) Epoch 33, batch 2600, loss[loss=0.1529, simple_loss=0.237, pruned_loss=0.03441, over 36960.00 frames. ], tot_loss[loss=0.1703, simple_loss=0.2573, pruned_loss=0.04164, over 7131369.86 frames. ], batch size: 95, lr: 3.31e-03, grad_scale: 32.0 +2023-05-10 19:46:17,352 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=594820.0, ans=0.0 +2023-05-10 19:46:24,110 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff2_skip_rate, batch_count=594820.0, ans=0.0 +2023-05-10 19:46:29,890 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0006-134305-0 from training. Duration: 21.24 +2023-05-10 19:46:29,913 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0047-104881-0_sp0.9 from training. Duration: 23.9055625 +2023-05-10 19:47:17,013 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=594920.0, ans=0.1 +2023-05-10 19:47:23,767 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.749e+02 3.355e+02 3.741e+02 4.168e+02 5.655e+02, threshold=7.482e+02, percent-clipped=0.0 +2023-05-10 19:47:25,535 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0_sp0.9 from training. Duration: 25.988875 +2023-05-10 19:47:39,427 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0001-134300-0_sp0.9 from training. Duration: 20.67225 +2023-05-10 19:47:44,354 INFO [train.py:1021] (1/2) Epoch 33, batch 2650, loss[loss=0.1772, simple_loss=0.2686, pruned_loss=0.04288, over 37069.00 frames. ], tot_loss[loss=0.1702, simple_loss=0.2573, pruned_loss=0.04155, over 7143198.53 frames. ], batch size: 116, lr: 3.31e-03, grad_scale: 32.0 +2023-05-10 19:48:00,234 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.prob, batch_count=595020.0, ans=0.125 +2023-05-10 19:48:13,085 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=9.85 vs. limit=22.5 +2023-05-10 19:48:26,814 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=595120.0, ans=0.125 +2023-05-10 19:48:29,674 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=8.26 vs. limit=15.0 +2023-05-10 19:48:33,900 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0038-41224-0 from training. Duration: 20.34 +2023-05-10 19:48:35,180 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=3.10 vs. limit=15.0 +2023-05-10 19:48:51,204 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3536, 4.5839, 2.4220, 2.5708], device='cuda:1') +2023-05-10 19:49:06,728 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer2.prob, batch_count=595220.0, ans=0.125 +2023-05-10 19:49:07,259 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=8.92 vs. limit=15.0 +2023-05-10 19:49:14,021 INFO [train.py:1021] (1/2) Epoch 33, batch 2700, loss[loss=0.1506, simple_loss=0.2297, pruned_loss=0.03575, over 36922.00 frames. ], tot_loss[loss=0.1702, simple_loss=0.2574, pruned_loss=0.04151, over 7135530.99 frames. ], batch size: 86, lr: 3.31e-03, grad_scale: 32.0 +2023-05-10 19:49:23,088 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=595270.0, ans=0.125 +2023-05-10 19:49:23,098 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.scale_min, batch_count=595270.0, ans=0.2 +2023-05-10 19:49:39,527 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=595320.0, ans=0.0 +2023-05-10 19:49:41,154 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([4.9783, 4.3552, 3.0552, 2.8653], device='cuda:1') +2023-05-10 19:49:44,950 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff3_skip_rate, batch_count=595320.0, ans=0.0 +2023-05-10 19:50:11,294 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0_sp0.9 from training. Duration: 25.061125 +2023-05-10 19:50:15,455 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=3.82 vs. limit=10.0 +2023-05-10 19:50:17,022 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=4.32 vs. limit=12.0 +2023-05-10 19:50:21,249 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.ff2_skip_rate, batch_count=595420.0, ans=0.0 +2023-05-10 19:50:22,511 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0 from training. Duration: 0.83 +2023-05-10 19:50:27,281 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.skip_rate, batch_count=595470.0, ans=0.09899494936611666 +2023-05-10 19:50:28,360 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.812e+02 3.439e+02 3.778e+02 4.325e+02 6.478e+02, threshold=7.556e+02, percent-clipped=0.0 +2023-05-10 19:50:45,810 INFO [train.py:1021] (1/2) Epoch 33, batch 2750, loss[loss=0.1876, simple_loss=0.2761, pruned_loss=0.0495, over 36307.00 frames. ], tot_loss[loss=0.1699, simple_loss=0.257, pruned_loss=0.04136, over 7134910.49 frames. ], batch size: 126, lr: 3.31e-03, grad_scale: 32.0 +2023-05-10 19:50:46,458 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=595520.0, ans=0.125 +2023-05-10 19:50:53,740 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0 from training. Duration: 24.73 +2023-05-10 19:51:07,520 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.prob, batch_count=595570.0, ans=0.125 +2023-05-10 19:51:08,875 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0 from training. Duration: 23.965 +2023-05-10 19:51:16,728 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer2.min_positive, batch_count=595570.0, ans=0.05 +2023-05-10 19:51:17,990 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0030-146996-0_sp0.9 from training. Duration: 22.088875 +2023-05-10 19:51:22,025 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.whiten, num_groups=1, num_channels=192, metric=4.03 vs. limit=12.0 +2023-05-10 19:51:36,612 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0006-134305-0_sp0.9 from training. Duration: 23.6 +2023-05-10 19:51:40,413 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer2.prob, batch_count=595670.0, ans=0.125 +2023-05-10 19:51:55,434 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=14.09 vs. limit=15.0 +2023-05-10 19:52:10,678 INFO [train.py:1021] (1/2) Epoch 33, batch 2800, loss[loss=0.1577, simple_loss=0.2359, pruned_loss=0.03973, over 34560.00 frames. ], tot_loss[loss=0.1696, simple_loss=0.2565, pruned_loss=0.04129, over 7118290.94 frames. ], batch size: 76, lr: 3.31e-03, grad_scale: 32.0 +2023-05-10 19:52:11,128 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer1.prob, batch_count=595770.0, ans=0.125 +2023-05-10 19:52:17,040 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=595770.0, ans=0.125 +2023-05-10 19:52:52,043 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=595870.0, ans=0.0 +2023-05-10 19:53:12,285 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer1.prob, batch_count=595970.0, ans=0.125 +2023-05-10 19:53:13,297 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.945e+02 3.765e+02 4.557e+02 5.512e+02 9.587e+02, threshold=9.113e+02, percent-clipped=1.0 +2023-05-10 19:53:26,953 INFO [train.py:1021] (1/2) Epoch 33, batch 2850, loss[loss=0.1704, simple_loss=0.2629, pruned_loss=0.03899, over 37078.00 frames. ], tot_loss[loss=0.17, simple_loss=0.2568, pruned_loss=0.0416, over 7101859.10 frames. ], batch size: 103, lr: 3.31e-03, grad_scale: 32.0 +2023-05-10 19:53:27,046 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0 from training. Duration: 23.795 +2023-05-10 19:53:43,116 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=596070.0, ans=0.125 +2023-05-10 19:53:45,801 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0_sp1.1 from training. Duration: 21.5409375 +2023-05-10 19:53:47,400 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0_sp0.9 from training. Duration: 24.97775 +2023-05-10 19:53:57,310 WARNING [train.py:1182] (1/2) Exclude cut with ID 1085-156170-0017-128270-0_sp0.9 from training. Duration: 23.3444375 +2023-05-10 19:54:00,903 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=9.57 vs. limit=22.5 +2023-05-10 19:54:08,181 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=596120.0, ans=0.125 +2023-05-10 19:54:24,745 WARNING [train.py:1182] (1/2) Exclude cut with ID 6010-56788-0055-90261-0_sp0.9 from training. Duration: 23.2 +2023-05-10 19:54:31,408 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=596220.0, ans=0.125 +2023-05-10 19:54:32,827 WARNING [train.py:1182] (1/2) Exclude cut with ID 5653-46179-0060-117930-0_sp0.9 from training. Duration: 21.17225 +2023-05-10 19:54:36,081 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.scale_min, batch_count=596220.0, ans=0.2 +2023-05-10 19:54:38,567 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=1.79 vs. limit=6.0 +2023-05-10 19:54:45,432 INFO [train.py:1021] (1/2) Epoch 33, batch 2900, loss[loss=0.1566, simple_loss=0.2429, pruned_loss=0.03517, over 36869.00 frames. ], tot_loss[loss=0.1707, simple_loss=0.2578, pruned_loss=0.04185, over 7086903.15 frames. ], batch size: 96, lr: 3.30e-03, grad_scale: 32.0 +2023-05-10 19:54:54,686 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0_sp0.9 from training. Duration: 24.6555625 +2023-05-10 19:55:00,713 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward2.hidden_balancer.prob, batch_count=596320.0, ans=0.125 +2023-05-10 19:55:00,747 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff3_skip_rate, batch_count=596320.0, ans=0.0 +2023-05-10 19:55:05,220 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass.scale_min, batch_count=596320.0, ans=0.2 +2023-05-10 19:55:18,474 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_skip_rate, batch_count=596370.0, ans=0.0 +2023-05-10 19:55:38,943 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.const_attention_rate, batch_count=596370.0, ans=0.025 +2023-05-10 19:55:42,582 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff2_skip_rate, batch_count=596370.0, ans=0.0 +2023-05-10 19:56:04,175 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.755e+02 3.351e+02 3.898e+02 4.540e+02 7.255e+02, threshold=7.795e+02, percent-clipped=0.0 +2023-05-10 19:56:04,747 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=596470.0, ans=0.1 +2023-05-10 19:56:12,319 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-65654-0031-41259-0 from training. Duration: 20.44 +2023-05-10 19:56:15,722 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer2.prob, batch_count=596470.0, ans=0.125 +2023-05-10 19:56:22,109 INFO [train.py:1021] (1/2) Epoch 33, batch 2950, loss[loss=0.1678, simple_loss=0.2586, pruned_loss=0.03847, over 37088.00 frames. ], tot_loss[loss=0.1707, simple_loss=0.258, pruned_loss=0.04168, over 7108206.51 frames. ], batch size: 103, lr: 3.30e-03, grad_scale: 32.0 +2023-05-10 19:56:22,478 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=596520.0, ans=0.1 +2023-05-10 19:56:29,874 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0018-132285-0_sp0.9 from training. Duration: 23.45 +2023-05-10 19:56:43,497 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer2.prob, batch_count=596570.0, ans=0.125 +2023-05-10 19:56:55,514 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.skip_rate, batch_count=596570.0, ans=0.04949747468305833 +2023-05-10 19:57:10,991 WARNING [train.py:1182] (1/2) Exclude cut with ID 6945-60535-0076-12784-0_sp0.9 from training. Duration: 20.52225 +2023-05-10 19:57:15,068 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=192, metric=5.17 vs. limit=15.0 +2023-05-10 19:57:18,040 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0 from training. Duration: 22.19 +2023-05-10 19:57:31,756 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0_sp1.1 from training. Duration: 25.3818125 +2023-05-10 19:57:55,690 INFO [train.py:1021] (1/2) Epoch 33, batch 3000, loss[loss=0.1717, simple_loss=0.262, pruned_loss=0.04067, over 35928.00 frames. ], tot_loss[loss=0.1712, simple_loss=0.2585, pruned_loss=0.042, over 7077981.46 frames. ], batch size: 133, lr: 3.30e-03, grad_scale: 32.0 +2023-05-10 19:57:55,690 INFO [train.py:1048] (1/2) Computing validation loss +2023-05-10 19:58:08,080 INFO [train.py:1057] (1/2) Epoch 33, validation: loss=0.1518, simple_loss=0.2528, pruned_loss=0.02542, over 944034.00 frames. +2023-05-10 19:58:08,081 INFO [train.py:1058] (1/2) Maximum memory allocated so far is 18883MB +2023-05-10 19:58:08,226 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0_sp0.9 from training. Duration: 28.0944375 +2023-05-10 19:58:14,208 WARNING [train.py:1182] (1/2) Exclude cut with ID 2195-150901-0045-59933-0_sp0.9 from training. Duration: 22.9444375 +2023-05-10 19:58:21,724 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0_sp1.1 from training. Duration: 21.6318125 +2023-05-10 19:58:39,794 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.const_attention_rate, batch_count=596820.0, ans=0.025 +2023-05-10 19:58:49,128 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0 from training. Duration: 23.695 +2023-05-10 19:59:07,676 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=596920.0, ans=0.0 +2023-05-10 19:59:15,329 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0 from training. Duration: 23.955 +2023-05-10 19:59:19,832 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.740e+02 3.388e+02 3.736e+02 4.231e+02 6.020e+02, threshold=7.473e+02, percent-clipped=0.0 +2023-05-10 19:59:23,338 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=596970.0, ans=0.1 +2023-05-10 19:59:33,891 INFO [train.py:1021] (1/2) Epoch 33, batch 3050, loss[loss=0.1467, simple_loss=0.2279, pruned_loss=0.0327, over 36803.00 frames. ], tot_loss[loss=0.1707, simple_loss=0.2579, pruned_loss=0.04173, over 7079334.68 frames. ], batch size: 89, lr: 3.30e-03, grad_scale: 32.0 +2023-05-10 19:59:39,153 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 19:59:55,991 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0_sp0.9 from training. Duration: 26.438875 +2023-05-10 20:00:16,250 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer1.prob, batch_count=597120.0, ans=0.125 +2023-05-10 20:00:39,410 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.5811, 4.9243, 5.0614, 4.7444], device='cuda:1') +2023-05-10 20:00:44,177 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0021-26306-0_sp0.9 from training. Duration: 21.2444375 +2023-05-10 20:00:44,213 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0_sp0.9 from training. Duration: 31.02225 +2023-05-10 20:00:47,601 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer1.prob, batch_count=597220.0, ans=0.125 +2023-05-10 20:00:54,647 INFO [train.py:1021] (1/2) Epoch 33, batch 3100, loss[loss=0.1774, simple_loss=0.2673, pruned_loss=0.04377, over 32495.00 frames. ], tot_loss[loss=0.1703, simple_loss=0.2576, pruned_loss=0.04152, over 7095819.92 frames. ], batch size: 170, lr: 3.30e-03, grad_scale: 32.0 +2023-05-10 20:00:54,767 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0 from training. Duration: 22.395 +2023-05-10 20:01:11,466 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0045-15876-0 from training. Duration: 21.075 +2023-05-10 20:01:14,659 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.6123, 4.8906, 5.0977, 4.7819], device='cuda:1') +2023-05-10 20:01:17,477 WARNING [train.py:1182] (1/2) Exclude cut with ID 6482-98857-0025-147532-0_sp0.9 from training. Duration: 20.0055625 +2023-05-10 20:01:17,491 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0037-132304-0_sp0.9 from training. Duration: 22.05 +2023-05-10 20:01:17,515 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0 from training. Duration: 26.8349375 +2023-05-10 20:01:20,405 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0_sp1.1 from training. Duration: 22.1090625 +2023-05-10 20:01:27,955 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0_sp0.9 from training. Duration: 26.6166875 +2023-05-10 20:01:34,755 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer1.prob, batch_count=597370.0, ans=0.125 +2023-05-10 20:01:47,057 WARNING [train.py:1182] (1/2) Exclude cut with ID 2046-178027-0000-53705-0_sp0.9 from training. Duration: 20.3055625 +2023-05-10 20:01:56,583 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.balancer.min_positive, batch_count=597470.0, ans=0.05 +2023-05-10 20:01:57,713 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.952e+02 3.380e+02 3.930e+02 4.726e+02 7.032e+02, threshold=7.859e+02, percent-clipped=0.0 +2023-05-10 20:01:59,908 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=7.90 vs. limit=22.5 +2023-05-10 20:02:08,355 WARNING [train.py:1182] (1/2) Exclude cut with ID 7205-50138-0008-5373-0_sp0.9 from training. Duration: 20.7 +2023-05-10 20:02:11,099 INFO [train.py:1021] (1/2) Epoch 33, batch 3150, loss[loss=0.1983, simple_loss=0.2797, pruned_loss=0.05849, over 24728.00 frames. ], tot_loss[loss=0.1703, simple_loss=0.2575, pruned_loss=0.04155, over 7076595.45 frames. ], batch size: 234, lr: 3.30e-03, grad_scale: 16.0 +2023-05-10 20:02:19,713 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer2.prob, batch_count=597520.0, ans=0.125 +2023-05-10 20:02:35,329 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=597570.0, ans=0.1 +2023-05-10 20:02:39,788 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.const_attention_rate, batch_count=597570.0, ans=0.025 +2023-05-10 20:02:53,302 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0 from training. Duration: 22.48 +2023-05-10 20:02:58,121 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=597670.0, ans=0.0 +2023-05-10 20:03:10,132 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0_sp0.9 from training. Duration: 29.816625 +2023-05-10 20:03:16,029 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.const_attention_rate, batch_count=597720.0, ans=0.025 +2023-05-10 20:03:39,856 INFO [train.py:1021] (1/2) Epoch 33, batch 3200, loss[loss=0.1615, simple_loss=0.2497, pruned_loss=0.03662, over 36855.00 frames. ], tot_loss[loss=0.1699, simple_loss=0.2571, pruned_loss=0.0413, over 7086618.42 frames. ], batch size: 96, lr: 3.30e-03, grad_scale: 32.0 +2023-05-10 20:03:41,700 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 20:03:42,908 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0_sp1.1 from training. Duration: 22.7590625 +2023-05-10 20:03:49,023 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0 from training. Duration: 22.555 +2023-05-10 20:04:06,684 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=597820.0, ans=0.1 +2023-05-10 20:04:14,396 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 20:04:20,110 WARNING [train.py:1182] (1/2) Exclude cut with ID 1250-135782-0005-25975-0_sp0.9 from training. Duration: 21.688875 +2023-05-10 20:04:34,622 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=597870.0, ans=0.0 +2023-05-10 20:04:38,368 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.skip_rate, batch_count=597920.0, ans=0.04949747468305833 +2023-05-10 20:04:39,834 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.skip_rate, batch_count=597920.0, ans=0.09899494936611666 +2023-05-10 20:05:07,846 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.908e+02 3.606e+02 4.015e+02 4.755e+02 7.987e+02, threshold=8.030e+02, percent-clipped=2.0 +2023-05-10 20:05:10,228 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0038-41224-0_sp0.9 from training. Duration: 22.6 +2023-05-10 20:05:24,808 INFO [train.py:1021] (1/2) Epoch 33, batch 3250, loss[loss=0.1508, simple_loss=0.2328, pruned_loss=0.03444, over 37054.00 frames. ], tot_loss[loss=0.1704, simple_loss=0.2578, pruned_loss=0.04153, over 7075461.31 frames. ], batch size: 88, lr: 3.30e-03, grad_scale: 32.0 +2023-05-10 20:05:25,383 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer1.max_abs, batch_count=598020.0, ans=10.0 +2023-05-10 20:05:58,618 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0 from training. Duration: 24.32 +2023-05-10 20:06:54,398 INFO [train.py:1021] (1/2) Epoch 33, batch 3300, loss[loss=0.1755, simple_loss=0.2671, pruned_loss=0.04195, over 37000.00 frames. ], tot_loss[loss=0.1703, simple_loss=0.2576, pruned_loss=0.04153, over 7063737.15 frames. ], batch size: 104, lr: 3.30e-03, grad_scale: 32.0 +2023-05-10 20:07:05,813 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-276745-0093-13116-0_sp0.9 from training. Duration: 21.061125 +2023-05-10 20:07:10,858 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=598320.0, ans=0.1 +2023-05-10 20:07:21,094 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0024-15855-0_sp0.9 from training. Duration: 20.32225 +2023-05-10 20:07:32,831 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0_sp1.1 from training. Duration: 0.7545625 +2023-05-10 20:07:52,824 WARNING [train.py:1182] (1/2) Exclude cut with ID 4295-39940-0007-92567-0_sp0.9 from training. Duration: 23.9333125 +2023-05-10 20:08:09,121 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.970e+02 3.752e+02 4.174e+02 4.902e+02 6.909e+02, threshold=8.348e+02, percent-clipped=0.0 +2023-05-10 20:08:24,179 INFO [train.py:1021] (1/2) Epoch 33, batch 3350, loss[loss=0.1619, simple_loss=0.2505, pruned_loss=0.0367, over 36979.00 frames. ], tot_loss[loss=0.1696, simple_loss=0.2567, pruned_loss=0.04124, over 7086392.04 frames. ], batch size: 95, lr: 3.30e-03, grad_scale: 32.0 +2023-05-10 20:08:25,955 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer1.prob, batch_count=598520.0, ans=0.125 +2023-05-10 20:08:37,811 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0_sp1.1 from training. Duration: 20.17275 +2023-05-10 20:08:39,763 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass.skip_rate, batch_count=598570.0, ans=0.04949747468305833 +2023-05-10 20:08:45,133 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0_sp1.1 from training. Duration: 20.436375 +2023-05-10 20:08:48,517 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward3.hidden_balancer.prob, batch_count=598570.0, ans=0.125 +2023-05-10 20:08:51,867 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=598570.0, ans=0.1 +2023-05-10 20:08:58,956 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=598620.0, ans=0.125 +2023-05-10 20:09:23,151 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.bypass.scale_min, batch_count=598670.0, ans=0.2 +2023-05-10 20:09:29,205 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=598720.0, ans=0.0 +2023-05-10 20:09:33,979 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=598720.0, ans=0.125 +2023-05-10 20:09:43,165 INFO [train.py:1021] (1/2) Epoch 33, batch 3400, loss[loss=0.1786, simple_loss=0.2711, pruned_loss=0.04306, over 34713.00 frames. ], tot_loss[loss=0.1699, simple_loss=0.2571, pruned_loss=0.04142, over 7085772.07 frames. ], batch size: 145, lr: 3.30e-03, grad_scale: 32.0 +2023-05-10 20:10:03,459 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.0110, 4.1553, 4.5822, 4.6121], device='cuda:1') +2023-05-10 20:10:05,773 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=598820.0, ans=0.1 +2023-05-10 20:10:16,605 WARNING [train.py:1182] (1/2) Exclude cut with ID 4234-40345-0022-142709-0_sp0.9 from training. Duration: 23.1055625 +2023-05-10 20:10:18,202 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0_sp1.1 from training. Duration: 23.5 +2023-05-10 20:10:30,415 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0_sp0.9 from training. Duration: 26.62775 +2023-05-10 20:10:52,300 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0018-132285-0 from training. Duration: 21.105 +2023-05-10 20:11:00,215 WARNING [train.py:1182] (1/2) Exclude cut with ID 4511-76322-0006-80011-0_sp0.9 from training. Duration: 24.411125 +2023-05-10 20:11:03,658 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.scale_min, batch_count=598970.0, ans=0.2 +2023-05-10 20:11:04,772 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.828e+02 3.417e+02 3.807e+02 4.219e+02 5.955e+02, threshold=7.614e+02, percent-clipped=0.0 +2023-05-10 20:11:17,417 INFO [train.py:1021] (1/2) Epoch 33, batch 3450, loss[loss=0.1528, simple_loss=0.2367, pruned_loss=0.03441, over 36960.00 frames. ], tot_loss[loss=0.1699, simple_loss=0.2571, pruned_loss=0.04141, over 7108124.12 frames. ], batch size: 91, lr: 3.30e-03, grad_scale: 32.0 +2023-05-10 20:11:28,199 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0_sp1.1 from training. Duration: 21.263625 +2023-05-10 20:11:31,621 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.5026, 3.4446, 3.1872, 4.0623, 2.2558, 3.4555, 4.0866, 3.5146], + device='cuda:1') +2023-05-10 20:11:31,663 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer1.prob, batch_count=599070.0, ans=0.125 +2023-05-10 20:11:31,689 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer2.prob, batch_count=599070.0, ans=0.125 +2023-05-10 20:11:34,663 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.5740, 5.3568, 4.7567, 5.1579], device='cuda:1') +2023-05-10 20:12:16,196 WARNING [train.py:1182] (1/2) Exclude cut with ID 4234-40345-0022-142709-0 from training. Duration: 20.795 +2023-05-10 20:12:30,002 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0 from training. Duration: 24.76 +2023-05-10 20:12:30,019 WARNING [train.py:1182] (1/2) Exclude cut with ID 3867-173237-0077-144769-0_sp0.9 from training. Duration: 22.25 +2023-05-10 20:12:48,505 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=599220.0, ans=0.125 +2023-05-10 20:12:51,742 INFO [train.py:1021] (1/2) Epoch 33, batch 3500, loss[loss=0.17, simple_loss=0.265, pruned_loss=0.0375, over 36863.00 frames. ], tot_loss[loss=0.1698, simple_loss=0.2572, pruned_loss=0.04126, over 7106632.08 frames. ], batch size: 111, lr: 3.30e-03, grad_scale: 32.0 +2023-05-10 20:12:55,352 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=599270.0, ans=0.125 +2023-05-10 20:13:02,166 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0_sp1.1 from training. Duration: 20.5045625 +2023-05-10 20:13:20,250 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=192, metric=4.29 vs. limit=15.0 +2023-05-10 20:13:27,814 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.7666, 3.7481, 4.0873, 3.6708], device='cuda:1') +2023-05-10 20:13:29,293 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=599320.0, ans=0.125 +2023-05-10 20:13:41,426 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.const_attention_rate, batch_count=599370.0, ans=0.025 +2023-05-10 20:14:10,859 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 3.022e+02 3.425e+02 3.878e+02 4.369e+02 7.345e+02, threshold=7.755e+02, percent-clipped=0.0 +2023-05-10 20:14:26,417 INFO [train.py:1021] (1/2) Epoch 33, batch 3550, loss[loss=0.1676, simple_loss=0.2589, pruned_loss=0.03813, over 36945.00 frames. ], tot_loss[loss=0.1695, simple_loss=0.257, pruned_loss=0.04101, over 7129363.38 frames. ], batch size: 108, lr: 3.30e-03, grad_scale: 32.0 +2023-05-10 20:14:37,547 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.nonlin_attention.balancer.prob, batch_count=599520.0, ans=0.125 +2023-05-10 20:14:51,515 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=599570.0, ans=0.0 +2023-05-10 20:14:52,902 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.7393, 5.0586, 5.1003, 5.6021], device='cuda:1') +2023-05-10 20:14:53,610 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn1.whiten, num_groups=1, num_channels=192, metric=10.16 vs. limit=22.5 +2023-05-10 20:15:12,076 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer2.prob, batch_count=599620.0, ans=0.125 +2023-05-10 20:15:27,799 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.skip_rate, batch_count=599670.0, ans=0.07 +2023-05-10 20:15:33,828 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.const_attention_rate, batch_count=599720.0, ans=0.025 +2023-05-10 20:15:44,070 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff2_skip_rate, batch_count=599720.0, ans=0.0 +2023-05-10 20:15:46,446 INFO [train.py:1021] (1/2) Epoch 33, batch 3600, loss[loss=0.1803, simple_loss=0.2681, pruned_loss=0.04627, over 37064.00 frames. ], tot_loss[loss=0.1696, simple_loss=0.2573, pruned_loss=0.04096, over 7156538.44 frames. ], batch size: 110, lr: 3.29e-03, grad_scale: 32.0 +2023-05-10 20:15:57,385 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=599770.0, ans=0.0 +2023-05-10 20:16:56,669 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp1.1 from training. Duration: 22.2954375 +2023-05-10 20:17:05,119 INFO [train.py:1021] (1/2) Epoch 34, batch 0, loss[loss=0.1838, simple_loss=0.2783, pruned_loss=0.04466, over 36722.00 frames. ], tot_loss[loss=0.1838, simple_loss=0.2783, pruned_loss=0.04466, over 36722.00 frames. ], batch size: 122, lr: 3.24e-03, grad_scale: 32.0 +2023-05-10 20:17:05,120 INFO [train.py:1048] (1/2) Computing validation loss +2023-05-10 20:17:10,061 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([1.8939, 2.8218, 4.0517, 2.9930], device='cuda:1') +2023-05-10 20:17:16,986 INFO [train.py:1057] (1/2) Epoch 34, validation: loss=0.152, simple_loss=0.2531, pruned_loss=0.02546, over 944034.00 frames. +2023-05-10 20:17:16,987 INFO [train.py:1058] (1/2) Maximum memory allocated so far is 18883MB +2023-05-10 20:17:22,003 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.6122, 4.0042, 4.3291, 3.9827], device='cuda:1') +2023-05-10 20:17:23,374 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer2.prob, batch_count=599950.0, ans=0.125 +2023-05-10 20:17:26,012 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.727e+02 3.385e+02 3.825e+02 4.645e+02 7.439e+02, threshold=7.651e+02, percent-clipped=0.0 +2023-05-10 20:18:18,243 WARNING [train.py:1182] (1/2) Exclude cut with ID 298-126791-0067-24026-0_sp0.9 from training. Duration: 21.438875 +2023-05-10 20:18:22,920 WARNING [train.py:1182] (1/2) Exclude cut with ID 5652-39938-0025-23684-0_sp0.9 from training. Duration: 22.2055625 +2023-05-10 20:18:27,545 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.0.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 20:18:36,273 INFO [train.py:1021] (1/2) Epoch 34, batch 50, loss[loss=0.1588, simple_loss=0.2594, pruned_loss=0.02909, over 37064.00 frames. ], tot_loss[loss=0.1662, simple_loss=0.259, pruned_loss=0.03674, over 1625801.58 frames. ], batch size: 103, lr: 3.24e-03, grad_scale: 32.0 +2023-05-10 20:18:55,654 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.5798, 4.9112, 5.0402, 4.7255], device='cuda:1') +2023-05-10 20:19:29,134 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer1.prob, batch_count=600350.0, ans=0.125 +2023-05-10 20:19:29,631 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=7.71 vs. limit=15.0 +2023-05-10 20:19:48,832 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=600400.0, ans=0.0 +2023-05-10 20:19:56,768 INFO [train.py:1021] (1/2) Epoch 34, batch 100, loss[loss=0.1466, simple_loss=0.2341, pruned_loss=0.02956, over 37074.00 frames. ], tot_loss[loss=0.1634, simple_loss=0.256, pruned_loss=0.03538, over 2878829.23 frames. ], batch size: 88, lr: 3.24e-03, grad_scale: 32.0 +2023-05-10 20:20:00,093 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward3.hidden_balancer.prob, batch_count=600450.0, ans=0.125 +2023-05-10 20:20:06,513 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.411e+02 2.920e+02 3.228e+02 3.638e+02 6.148e+02, threshold=6.455e+02, percent-clipped=0.0 +2023-05-10 20:20:13,237 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer1.prob, batch_count=600500.0, ans=0.125 +2023-05-10 20:21:09,036 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer1.prob, batch_count=600600.0, ans=0.125 +2023-05-10 20:21:17,259 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass_mid.scale_min, batch_count=600650.0, ans=0.2 +2023-05-10 20:21:37,355 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=600700.0, ans=0.125 +2023-05-10 20:21:39,873 INFO [train.py:1021] (1/2) Epoch 34, batch 150, loss[loss=0.169, simple_loss=0.2649, pruned_loss=0.03651, over 36959.00 frames. ], tot_loss[loss=0.164, simple_loss=0.2565, pruned_loss=0.0357, over 3813094.08 frames. ], batch size: 108, lr: 3.24e-03, grad_scale: 32.0 +2023-05-10 20:22:09,226 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0 from training. Duration: 24.525 +2023-05-10 20:22:28,018 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=600800.0, ans=0.125 +2023-05-10 20:22:36,368 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([1.9534, 2.6430, 4.3651, 2.8938], device='cuda:1') +2023-05-10 20:22:46,479 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff2_skip_rate, batch_count=600850.0, ans=0.0 +2023-05-10 20:23:03,208 WARNING [train.py:1182] (1/2) Exclude cut with ID 3699-47246-0007-3408-0_sp0.9 from training. Duration: 20.26675 +2023-05-10 20:23:09,451 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.max_abs, batch_count=600900.0, ans=10.0 +2023-05-10 20:23:18,423 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp0.9 from training. Duration: 27.25 +2023-05-10 20:23:24,470 INFO [train.py:1021] (1/2) Epoch 34, batch 200, loss[loss=0.1584, simple_loss=0.2557, pruned_loss=0.03055, over 36913.00 frames. ], tot_loss[loss=0.1635, simple_loss=0.2559, pruned_loss=0.0355, over 4581371.63 frames. ], batch size: 105, lr: 3.24e-03, grad_scale: 32.0 +2023-05-10 20:23:34,258 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.548e+02 3.045e+02 3.563e+02 4.449e+02 6.161e+02, threshold=7.126e+02, percent-clipped=0.0 +2023-05-10 20:24:42,417 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=601150.0, ans=0.0 +2023-05-10 20:24:48,571 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff2_skip_rate, batch_count=601150.0, ans=0.0 +2023-05-10 20:24:57,706 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64292-0017-15984-0 from training. Duration: 21.68 +2023-05-10 20:24:58,311 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=3.17 vs. limit=15.0 +2023-05-10 20:25:04,570 INFO [train.py:1021] (1/2) Epoch 34, batch 250, loss[loss=0.1753, simple_loss=0.2737, pruned_loss=0.0384, over 36308.00 frames. ], tot_loss[loss=0.1631, simple_loss=0.2556, pruned_loss=0.03525, over 5136397.15 frames. ], batch size: 126, lr: 3.24e-03, grad_scale: 32.0 +2023-05-10 20:25:12,294 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0007-59342-0 from training. Duration: 21.6300625 +2023-05-10 20:25:21,813 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff2_skip_rate, batch_count=601250.0, ans=0.0 +2023-05-10 20:25:34,394 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.scale_min, batch_count=601250.0, ans=0.2 +2023-05-10 20:25:39,276 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0007-59342-0_sp0.9 from training. Duration: 24.033375 +2023-05-10 20:26:07,981 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=601350.0, ans=0.0 +2023-05-10 20:26:38,284 INFO [train.py:1021] (1/2) Epoch 34, batch 300, loss[loss=0.1477, simple_loss=0.2363, pruned_loss=0.02953, over 37164.00 frames. ], tot_loss[loss=0.1628, simple_loss=0.2552, pruned_loss=0.03517, over 5596187.53 frames. ], batch size: 93, lr: 3.24e-03, grad_scale: 32.0 +2023-05-10 20:26:47,205 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.470e+02 3.032e+02 3.664e+02 4.773e+02 9.187e+02, threshold=7.328e+02, percent-clipped=4.0 +2023-05-10 20:26:55,445 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0 from training. Duration: 22.905 +2023-05-10 20:26:55,508 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0_sp1.1 from training. Duration: 23.4318125 +2023-05-10 20:26:58,861 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.prob, batch_count=601500.0, ans=0.125 +2023-05-10 20:27:33,037 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.prob, batch_count=601600.0, ans=0.125 +2023-05-10 20:27:41,605 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward2.hidden_balancer.prob, batch_count=601650.0, ans=0.125 +2023-05-10 20:28:04,578 INFO [train.py:1021] (1/2) Epoch 34, batch 350, loss[loss=0.1782, simple_loss=0.2751, pruned_loss=0.04065, over 36781.00 frames. ], tot_loss[loss=0.1636, simple_loss=0.2564, pruned_loss=0.0354, over 5968365.34 frames. ], batch size: 118, lr: 3.24e-03, grad_scale: 16.0 +2023-05-10 20:28:30,757 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=601750.0, ans=0.125 +2023-05-10 20:28:39,952 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=601750.0, ans=0.125 +2023-05-10 20:28:52,419 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff2_skip_rate, batch_count=601800.0, ans=0.0 +2023-05-10 20:29:14,133 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.const_attention_rate, batch_count=601850.0, ans=0.025 +2023-05-10 20:29:19,859 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0_sp1.1 from training. Duration: 20.82275 +2023-05-10 20:29:21,401 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0_sp0.9 from training. Duration: 25.45 +2023-05-10 20:29:26,959 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=2.93 vs. limit=10.0 +2023-05-10 20:29:37,122 INFO [train.py:1021] (1/2) Epoch 34, batch 400, loss[loss=0.1772, simple_loss=0.2719, pruned_loss=0.04125, over 37001.00 frames. ], tot_loss[loss=0.164, simple_loss=0.2567, pruned_loss=0.0356, over 6226760.28 frames. ], batch size: 104, lr: 3.24e-03, grad_scale: 16.0 +2023-05-10 20:29:42,932 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff2_skip_rate, batch_count=601950.0, ans=0.0 +2023-05-10 20:29:59,677 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=4.87 vs. limit=15.0 +2023-05-10 20:30:01,646 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.474e+02 2.876e+02 3.299e+02 3.964e+02 6.221e+02, threshold=6.599e+02, percent-clipped=0.0 +2023-05-10 20:30:10,384 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.0.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 20:30:38,453 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=602100.0, ans=0.1 +2023-05-10 20:30:41,140 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0 from training. Duration: 25.775 +2023-05-10 20:30:50,687 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_skip_rate, batch_count=602150.0, ans=0.0 +2023-05-10 20:30:58,523 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass.skip_rate, batch_count=602150.0, ans=0.04949747468305833 +2023-05-10 20:31:07,894 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0071-62375-0_sp0.9 from training. Duration: 22.25 +2023-05-10 20:31:15,079 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=8.28 vs. limit=15.0 +2023-05-10 20:31:15,812 INFO [train.py:1021] (1/2) Epoch 34, batch 450, loss[loss=0.1845, simple_loss=0.2798, pruned_loss=0.04461, over 37047.00 frames. ], tot_loss[loss=0.1647, simple_loss=0.2579, pruned_loss=0.03576, over 6445198.71 frames. ], batch size: 116, lr: 3.24e-03, grad_scale: 16.0 +2023-05-10 20:32:08,082 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0 from training. Duration: 26.205 +2023-05-10 20:32:37,267 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer1.prob, batch_count=602300.0, ans=0.125 +2023-05-10 20:32:38,821 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=602300.0, ans=0.1 +2023-05-10 20:32:41,847 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0_sp0.9 from training. Duration: 30.1555625 +2023-05-10 20:32:48,290 WARNING [train.py:1182] (1/2) Exclude cut with ID 1265-135635-0050-6781-0_sp0.9 from training. Duration: 21.8333125 +2023-05-10 20:33:02,077 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0_sp1.1 from training. Duration: 20.6545625 +2023-05-10 20:33:18,662 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff3_skip_rate, batch_count=602400.0, ans=0.0 +2023-05-10 20:33:23,193 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer1.prob, batch_count=602450.0, ans=0.125 +2023-05-10 20:33:24,810 INFO [train.py:1021] (1/2) Epoch 34, batch 500, loss[loss=0.1467, simple_loss=0.2338, pruned_loss=0.02978, over 36987.00 frames. ], tot_loss[loss=0.1644, simple_loss=0.2573, pruned_loss=0.03574, over 6609264.21 frames. ], batch size: 86, lr: 3.24e-03, grad_scale: 16.0 +2023-05-10 20:33:31,372 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=602450.0, ans=0.0 +2023-05-10 20:33:31,450 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer1.prob, batch_count=602450.0, ans=0.125 +2023-05-10 20:33:37,211 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.528e+02 2.975e+02 3.217e+02 3.688e+02 5.922e+02, threshold=6.434e+02, percent-clipped=0.0 +2023-05-10 20:33:53,101 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.skip_rate, batch_count=602500.0, ans=0.04949747468305833 +2023-05-10 20:33:54,995 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0045-39920-0_sp0.9 from training. Duration: 20.52225 +2023-05-10 20:34:02,744 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer1.prob, batch_count=602500.0, ans=0.125 +2023-05-10 20:34:07,800 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.balancer.min_positive, batch_count=602550.0, ans=0.05 +2023-05-10 20:34:34,392 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0_sp0.9 from training. Duration: 29.1166875 +2023-05-10 20:34:49,427 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer2.prob, batch_count=602600.0, ans=0.125 +2023-05-10 20:35:02,551 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=192, metric=8.64 vs. limit=15.0 +2023-05-10 20:35:09,431 INFO [train.py:1021] (1/2) Epoch 34, batch 550, loss[loss=0.1777, simple_loss=0.2749, pruned_loss=0.04019, over 36377.00 frames. ], tot_loss[loss=0.1646, simple_loss=0.2576, pruned_loss=0.03578, over 6757772.40 frames. ], batch size: 126, lr: 3.24e-03, grad_scale: 8.0 +2023-05-10 20:35:27,602 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 20:35:40,857 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_whiten.whitening_limit, batch_count=602750.0, ans=15.0 +2023-05-10 20:35:48,408 WARNING [train.py:1182] (1/2) Exclude cut with ID 543-133211-0007-59831-0_sp0.9 from training. Duration: 21.388875 +2023-05-10 20:36:36,324 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=602900.0, ans=0.125 +2023-05-10 20:36:55,107 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=3.92 vs. limit=15.0 +2023-05-10 20:37:01,023 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0 from training. Duration: 22.72 +2023-05-10 20:37:01,089 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0031-94921-0_sp0.9 from training. Duration: 22.7444375 +2023-05-10 20:37:03,114 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=602900.0, ans=0.125 +2023-05-10 20:37:05,693 INFO [train.py:1021] (1/2) Epoch 34, batch 600, loss[loss=0.1611, simple_loss=0.2639, pruned_loss=0.02913, over 36968.00 frames. ], tot_loss[loss=0.1646, simple_loss=0.2577, pruned_loss=0.03579, over 6852442.25 frames. ], batch size: 108, lr: 3.24e-03, grad_scale: 8.0 +2023-05-10 20:37:12,493 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=8.49 vs. limit=22.5 +2023-05-10 20:37:19,410 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.561e+02 2.934e+02 3.218e+02 3.810e+02 5.356e+02, threshold=6.436e+02, percent-clipped=0.0 +2023-05-10 20:37:56,739 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.const_attention_rate, batch_count=603050.0, ans=0.025 +2023-05-10 20:37:59,526 WARNING [train.py:1182] (1/2) Exclude cut with ID 4133-6541-0027-40495-0_sp1.1 from training. Duration: 0.9681875 +2023-05-10 20:38:03,952 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62851-0022-91297-0_sp0.9 from training. Duration: 22.3166875 +2023-05-10 20:38:10,280 WARNING [train.py:1182] (1/2) Exclude cut with ID 543-133212-0015-59917-0_sp0.9 from training. Duration: 21.8166875 +2023-05-10 20:38:45,227 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.attention_skip_rate, batch_count=603200.0, ans=0.0 +2023-05-10 20:38:46,595 INFO [train.py:1021] (1/2) Epoch 34, batch 650, loss[loss=0.1806, simple_loss=0.2719, pruned_loss=0.04459, over 37027.00 frames. ], tot_loss[loss=0.1647, simple_loss=0.258, pruned_loss=0.03576, over 6933461.70 frames. ], batch size: 116, lr: 3.24e-03, grad_scale: 8.0 +2023-05-10 20:39:52,542 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=603350.0, ans=0.1 +2023-05-10 20:40:24,997 INFO [train.py:1021] (1/2) Epoch 34, batch 700, loss[loss=0.156, simple_loss=0.247, pruned_loss=0.03249, over 37022.00 frames. ], tot_loss[loss=0.1647, simple_loss=0.2578, pruned_loss=0.03576, over 6995282.65 frames. ], batch size: 99, lr: 3.24e-03, grad_scale: 8.0 +2023-05-10 20:40:42,440 WARNING [train.py:1182] (1/2) Exclude cut with ID 4957-30119-0041-23990-0_sp0.9 from training. Duration: 20.22775 +2023-05-10 20:40:47,853 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=603450.0, ans=0.125 +2023-05-10 20:40:52,412 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer2.prob, batch_count=603450.0, ans=0.125 +2023-05-10 20:40:54,121 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.375e+02 3.154e+02 3.542e+02 4.271e+02 8.697e+02, threshold=7.084e+02, percent-clipped=6.0 +2023-05-10 20:41:05,519 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=8.06 vs. limit=22.5 +2023-05-10 20:41:32,548 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=3.22 vs. limit=15.0 +2023-05-10 20:41:37,434 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer1.prob, batch_count=603600.0, ans=0.125 +2023-05-10 20:41:40,142 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0_sp1.1 from training. Duration: 24.67275 +2023-05-10 20:41:59,855 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer1.prob, batch_count=603650.0, ans=0.125 +2023-05-10 20:41:59,905 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.prob, batch_count=603650.0, ans=0.125 +2023-05-10 20:42:04,104 INFO [train.py:1021] (1/2) Epoch 34, batch 750, loss[loss=0.1471, simple_loss=0.2371, pruned_loss=0.02855, over 36959.00 frames. ], tot_loss[loss=0.165, simple_loss=0.258, pruned_loss=0.03599, over 7013506.29 frames. ], batch size: 91, lr: 3.23e-03, grad_scale: 8.0 +2023-05-10 20:42:07,462 WARNING [train.py:1182] (1/2) Exclude cut with ID 3082-165428-0081-50734-0_sp0.9 from training. Duration: 21.8055625 +2023-05-10 20:42:23,355 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=603750.0, ans=0.0 +2023-05-10 20:42:39,053 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer2.prob, batch_count=603800.0, ans=0.125 +2023-05-10 20:42:49,359 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0054-76830-0_sp0.9 from training. Duration: 22.6666875 +2023-05-10 20:43:03,467 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.prob, batch_count=603850.0, ans=0.125 +2023-05-10 20:43:21,707 INFO [train.py:1021] (1/2) Epoch 34, batch 800, loss[loss=0.1702, simple_loss=0.2671, pruned_loss=0.03668, over 37088.00 frames. ], tot_loss[loss=0.1646, simple_loss=0.2578, pruned_loss=0.03572, over 7073751.85 frames. ], batch size: 110, lr: 3.23e-03, grad_scale: 16.0 +2023-05-10 20:43:35,852 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.434e+02 3.077e+02 3.754e+02 4.899e+02 1.226e+03, threshold=7.508e+02, percent-clipped=6.0 +2023-05-10 20:43:37,958 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff2_skip_rate, batch_count=604000.0, ans=0.0 +2023-05-10 20:43:55,866 WARNING [train.py:1182] (1/2) Exclude cut with ID 2411-132532-0017-82279-0_sp1.1 from training. Duration: 0.9681875 +2023-05-10 20:44:00,966 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=7.25 vs. limit=15.0 +2023-05-10 20:44:02,225 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=604050.0, ans=0.0 +2023-05-10 20:44:08,956 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=604100.0, ans=0.1 +2023-05-10 20:44:17,986 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.attention_skip_rate, batch_count=604100.0, ans=0.0 +2023-05-10 20:44:24,574 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0 from training. Duration: 22.485 +2023-05-10 20:44:38,070 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=604200.0, ans=0.0 +2023-05-10 20:44:39,203 INFO [train.py:1021] (1/2) Epoch 34, batch 850, loss[loss=0.1683, simple_loss=0.2647, pruned_loss=0.03595, over 36741.00 frames. ], tot_loss[loss=0.1646, simple_loss=0.258, pruned_loss=0.03564, over 7106905.63 frames. ], batch size: 122, lr: 3.23e-03, grad_scale: 16.0 +2023-05-10 20:44:56,166 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.attention_skip_rate, batch_count=604250.0, ans=0.0 +2023-05-10 20:45:05,581 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0_sp1.1 from training. Duration: 23.82275 +2023-05-10 20:45:07,468 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=604250.0, ans=0.0 +2023-05-10 20:45:10,520 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=604300.0, ans=0.125 +2023-05-10 20:45:16,947 WARNING [train.py:1182] (1/2) Exclude cut with ID 4860-13185-0032-76709-0 from training. Duration: 20.77 +2023-05-10 20:45:25,789 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64292-0017-15984-0_sp0.9 from training. Duration: 24.088875 +2023-05-10 20:46:05,368 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.skip_rate, batch_count=604450.0, ans=0.09899494936611666 +2023-05-10 20:46:06,593 INFO [train.py:1021] (1/2) Epoch 34, batch 900, loss[loss=0.1638, simple_loss=0.26, pruned_loss=0.03379, over 37137.00 frames. ], tot_loss[loss=0.1649, simple_loss=0.2584, pruned_loss=0.03573, over 7134555.92 frames. ], batch size: 98, lr: 3.23e-03, grad_scale: 16.0 +2023-05-10 20:46:08,133 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0_sp1.1 from training. Duration: 20.4409375 +2023-05-10 20:46:20,099 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.504e+02 2.931e+02 3.315e+02 3.886e+02 5.873e+02, threshold=6.629e+02, percent-clipped=0.0 +2023-05-10 20:46:29,469 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=2.58 vs. limit=6.0 +2023-05-10 20:46:53,546 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer1.prob, batch_count=604600.0, ans=0.125 +2023-05-10 20:47:20,039 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer1.prob, batch_count=604650.0, ans=0.125 +2023-05-10 20:47:30,122 INFO [train.py:1021] (1/2) Epoch 34, batch 950, loss[loss=0.1561, simple_loss=0.2391, pruned_loss=0.03662, over 36793.00 frames. ], tot_loss[loss=0.1648, simple_loss=0.2583, pruned_loss=0.03568, over 7143086.12 frames. ], batch size: 89, lr: 3.23e-03, grad_scale: 16.0 +2023-05-10 20:47:34,928 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0062-62366-0_sp0.9 from training. Duration: 22.511125 +2023-05-10 20:47:35,027 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0031-39906-0 from training. Duration: 20.675 +2023-05-10 20:48:04,132 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=604750.0, ans=0.125 +2023-05-10 20:48:43,692 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=604900.0, ans=0.1 +2023-05-10 20:48:49,720 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=604900.0, ans=0.1 +2023-05-10 20:48:56,589 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=10.49 vs. limit=15.0 +2023-05-10 20:48:57,113 INFO [train.py:1021] (1/2) Epoch 34, batch 1000, loss[loss=0.1691, simple_loss=0.2673, pruned_loss=0.0354, over 36738.00 frames. ], tot_loss[loss=0.1645, simple_loss=0.2581, pruned_loss=0.03547, over 7172522.78 frames. ], batch size: 118, lr: 3.23e-03, grad_scale: 16.0 +2023-05-10 20:49:11,001 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.447e+02 3.081e+02 3.662e+02 4.788e+02 7.005e+02, threshold=7.325e+02, percent-clipped=3.0 +2023-05-10 20:49:20,691 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.hidden_balancer.prob, batch_count=605000.0, ans=0.125 +2023-05-10 20:49:28,823 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0_sp0.9 from training. Duration: 24.9833125 +2023-05-10 20:50:02,328 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0 from training. Duration: 27.14 +2023-05-10 20:50:04,842 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.min_positive, batch_count=605150.0, ans=0.025 +2023-05-10 20:50:14,876 INFO [train.py:1021] (1/2) Epoch 34, batch 1050, loss[loss=0.1364, simple_loss=0.2205, pruned_loss=0.02614, over 36983.00 frames. ], tot_loss[loss=0.1642, simple_loss=0.2576, pruned_loss=0.03539, over 7177625.34 frames. ], batch size: 86, lr: 3.23e-03, grad_scale: 16.0 +2023-05-10 20:50:21,623 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0 from training. Duration: 22.44 +2023-05-10 20:50:43,713 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff2_skip_rate, batch_count=605250.0, ans=0.0 +2023-05-10 20:50:45,165 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=605300.0, ans=0.1 +2023-05-10 20:51:33,277 INFO [train.py:1021] (1/2) Epoch 34, batch 1100, loss[loss=0.1545, simple_loss=0.2457, pruned_loss=0.03165, over 37139.00 frames. ], tot_loss[loss=0.1633, simple_loss=0.2566, pruned_loss=0.03502, over 7213938.72 frames. ], batch size: 93, lr: 3.23e-03, grad_scale: 16.0 +2023-05-10 20:51:38,443 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([3.1741, 4.2421, 4.7950, 4.9716], device='cuda:1') +2023-05-10 20:51:47,239 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.411e+02 2.964e+02 3.387e+02 4.057e+02 9.879e+02, threshold=6.773e+02, percent-clipped=1.0 +2023-05-10 20:51:47,457 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0060-62364-0_sp0.9 from training. Duration: 21.361125 +2023-05-10 20:51:48,054 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=3.63 vs. limit=15.0 +2023-05-10 20:51:55,064 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0_sp1.1 from training. Duration: 27.0318125 +2023-05-10 20:52:05,372 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0_sp0.9 from training. Duration: 28.638875 +2023-05-10 20:52:22,507 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0054-76830-0 from training. Duration: 20.4 +2023-05-10 20:52:53,118 INFO [train.py:1021] (1/2) Epoch 34, batch 1150, loss[loss=0.1513, simple_loss=0.2462, pruned_loss=0.02819, over 37033.00 frames. ], tot_loss[loss=0.1633, simple_loss=0.2566, pruned_loss=0.03506, over 7230187.31 frames. ], batch size: 99, lr: 3.23e-03, grad_scale: 16.0 +2023-05-10 20:52:58,513 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=605700.0, ans=0.125 +2023-05-10 20:52:59,710 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0071-62375-0 from training. Duration: 20.025 +2023-05-10 20:52:59,723 WARNING [train.py:1182] (1/2) Exclude cut with ID 2364-131735-0112-64612-0_sp0.9 from training. Duration: 20.488875 +2023-05-10 20:53:05,838 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0 from training. Duration: 29.735 +2023-05-10 20:53:15,817 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=605750.0, ans=0.125 +2023-05-10 20:53:40,131 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.const_attention_rate, batch_count=605850.0, ans=0.025 +2023-05-10 20:54:06,293 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn2.whiten, num_groups=1, num_channels=256, metric=8.17 vs. limit=22.5 +2023-05-10 20:54:14,468 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer2.prob, batch_count=605950.0, ans=0.125 +2023-05-10 20:54:15,564 INFO [train.py:1021] (1/2) Epoch 34, batch 1200, loss[loss=0.1687, simple_loss=0.2674, pruned_loss=0.03505, over 37099.00 frames. ], tot_loss[loss=0.1627, simple_loss=0.2557, pruned_loss=0.03484, over 7238435.21 frames. ], batch size: 110, lr: 3.23e-03, grad_scale: 32.0 +2023-05-10 20:54:31,051 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.502e+02 2.980e+02 3.310e+02 3.760e+02 5.356e+02, threshold=6.620e+02, percent-clipped=0.0 +2023-05-10 20:54:36,106 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=606000.0, ans=0.1 +2023-05-10 20:54:41,956 WARNING [train.py:1182] (1/2) Exclude cut with ID 7276-92427-0014-12983-0_sp0.9 from training. Duration: 21.3055625 +2023-05-10 20:54:43,504 WARNING [train.py:1182] (1/2) Exclude cut with ID 1025-75365-0008-79168-0_sp0.9 from training. Duration: 22.0666875 +2023-05-10 20:55:15,788 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=606100.0, ans=0.0 +2023-05-10 20:55:36,998 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer1.prob, batch_count=606150.0, ans=0.125 +2023-05-10 20:55:44,158 INFO [train.py:1021] (1/2) Epoch 34, batch 1250, loss[loss=0.182, simple_loss=0.2769, pruned_loss=0.04356, over 37019.00 frames. ], tot_loss[loss=0.1631, simple_loss=0.2563, pruned_loss=0.03496, over 7237792.64 frames. ], batch size: 116, lr: 3.23e-03, grad_scale: 32.0 +2023-05-10 20:56:09,093 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=8.89 vs. limit=15.0 +2023-05-10 20:56:10,263 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([1.8749, 2.8883, 2.7218, 2.7753, 2.6130, 2.3962, 2.7301, 2.4310], + device='cuda:1') +2023-05-10 20:56:15,690 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=6.30 vs. limit=15.0 +2023-05-10 20:56:18,189 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=606300.0, ans=0.0 +2023-05-10 20:56:21,519 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=6.39 vs. limit=15.0 +2023-05-10 20:56:33,334 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer1.prob, batch_count=606350.0, ans=0.125 +2023-05-10 20:56:49,855 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0062-62366-0 from training. Duration: 20.26 +2023-05-10 20:56:52,723 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=606400.0, ans=0.125 +2023-05-10 20:57:09,203 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=3.22 vs. limit=10.0 +2023-05-10 20:57:11,548 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0030-9324-0_sp0.9 from training. Duration: 21.3444375 +2023-05-10 20:57:12,991 INFO [train.py:1021] (1/2) Epoch 34, batch 1300, loss[loss=0.1621, simple_loss=0.2617, pruned_loss=0.03124, over 36933.00 frames. ], tot_loss[loss=0.1634, simple_loss=0.2566, pruned_loss=0.03514, over 7211698.22 frames. ], batch size: 108, lr: 3.23e-03, grad_scale: 32.0 +2023-05-10 20:57:23,281 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.2595, 4.4307, 2.1767, 2.4045], device='cuda:1') +2023-05-10 20:57:27,135 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.601e+02 3.003e+02 3.539e+02 4.501e+02 7.332e+02, threshold=7.078e+02, percent-clipped=2.0 +2023-05-10 20:57:41,236 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer2.prob, batch_count=606500.0, ans=0.125 +2023-05-10 20:58:02,555 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=256, metric=8.64 vs. limit=22.5 +2023-05-10 20:58:27,701 WARNING [train.py:1182] (1/2) Exclude cut with ID 497-129325-0061-62254-0_sp1.1 from training. Duration: 0.97725 +2023-05-10 20:58:28,718 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=606650.0, ans=0.125 +2023-05-10 20:58:59,049 INFO [train.py:1021] (1/2) Epoch 34, batch 1350, loss[loss=0.1427, simple_loss=0.2285, pruned_loss=0.02844, over 36783.00 frames. ], tot_loss[loss=0.1633, simple_loss=0.2563, pruned_loss=0.03513, over 7208114.63 frames. ], batch size: 89, lr: 3.23e-03, grad_scale: 32.0 +2023-05-10 20:59:08,789 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=606700.0, ans=0.125 +2023-05-10 20:59:10,404 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.7857, 4.2929, 3.9541, 4.3177, 3.6047, 3.3000, 3.7175, 3.2956], + device='cuda:1') +2023-05-10 20:59:26,900 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0031-39906-0_sp0.9 from training. Duration: 22.97225 +2023-05-10 20:59:45,422 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=606800.0, ans=0.125 +2023-05-10 21:00:02,841 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0047-39922-0_sp0.9 from training. Duration: 21.97775 +2023-05-10 21:00:16,754 WARNING [train.py:1182] (1/2) Exclude cut with ID 1112-1043-0006-89194-0_sp0.9 from training. Duration: 21.8333125 +2023-05-10 21:00:21,373 INFO [train.py:1021] (1/2) Epoch 34, batch 1400, loss[loss=0.1453, simple_loss=0.2311, pruned_loss=0.02974, over 36977.00 frames. ], tot_loss[loss=0.1633, simple_loss=0.2564, pruned_loss=0.03517, over 7188654.35 frames. ], batch size: 91, lr: 3.23e-03, grad_scale: 32.0 +2023-05-10 21:00:22,228 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=1.96 vs. limit=6.0 +2023-05-10 21:00:28,690 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0031-94921-0 from training. Duration: 20.47 +2023-05-10 21:00:38,387 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.593e+02 3.069e+02 3.546e+02 4.439e+02 9.608e+02, threshold=7.092e+02, percent-clipped=1.0 +2023-05-10 21:00:47,120 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.max_abs, batch_count=607000.0, ans=10.0 +2023-05-10 21:01:01,763 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward2.hidden_balancer.prob, batch_count=607050.0, ans=0.125 +2023-05-10 21:01:16,975 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.5825, 3.7219, 4.0437, 3.7868], device='cuda:1') +2023-05-10 21:01:35,555 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=3.78 vs. limit=6.0 +2023-05-10 21:01:49,448 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward2.hidden_balancer.prob, batch_count=607150.0, ans=0.125 +2023-05-10 21:02:00,656 INFO [train.py:1021] (1/2) Epoch 34, batch 1450, loss[loss=0.1509, simple_loss=0.2405, pruned_loss=0.03065, over 36952.00 frames. ], tot_loss[loss=0.1635, simple_loss=0.2565, pruned_loss=0.03525, over 7184428.83 frames. ], batch size: 95, lr: 3.23e-03, grad_scale: 32.0 +2023-05-10 21:02:00,758 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0037-39912-0_sp0.9 from training. Duration: 20.67225 +2023-05-10 21:02:22,308 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0_sp0.9 from training. Duration: 25.2444375 +2023-05-10 21:02:22,884 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=607250.0, ans=0.0 +2023-05-10 21:02:26,740 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.0.self_attn_weights, attn_weights_entropy = tensor([6.4086, 5.7186, 5.5783, 6.1417], device='cuda:1') +2023-05-10 21:02:32,852 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=607300.0, ans=0.0 +2023-05-10 21:02:49,094 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0021-76797-0_sp0.9 from training. Duration: 21.1445 +2023-05-10 21:03:23,603 INFO [train.py:1021] (1/2) Epoch 34, batch 1500, loss[loss=0.1564, simple_loss=0.2428, pruned_loss=0.03504, over 35425.00 frames. ], tot_loss[loss=0.1631, simple_loss=0.256, pruned_loss=0.03513, over 7188868.61 frames. ], batch size: 78, lr: 3.22e-03, grad_scale: 32.0 +2023-05-10 21:03:38,441 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.560e+02 3.054e+02 3.536e+02 4.717e+02 1.052e+03, threshold=7.072e+02, percent-clipped=5.0 +2023-05-10 21:03:42,379 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.prob, batch_count=607500.0, ans=0.125 +2023-05-10 21:04:25,460 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0_sp0.9 from training. Duration: 33.038875 +2023-05-10 21:04:58,318 INFO [train.py:1021] (1/2) Epoch 34, batch 1550, loss[loss=0.1954, simple_loss=0.282, pruned_loss=0.05442, over 24714.00 frames. ], tot_loss[loss=0.1634, simple_loss=0.2564, pruned_loss=0.03517, over 7180871.66 frames. ], batch size: 234, lr: 3.22e-03, grad_scale: 32.0 +2023-05-10 21:05:10,068 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64291-0000-16059-0_sp0.9 from training. Duration: 20.0944375 +2023-05-10 21:05:25,347 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0_sp1.1 from training. Duration: 20.4 +2023-05-10 21:05:30,507 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=6.37 vs. limit=15.0 +2023-05-10 21:05:34,314 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62851-0022-91297-0 from training. Duration: 20.085 +2023-05-10 21:05:39,712 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn1.whiten, num_groups=1, num_channels=192, metric=10.48 vs. limit=22.5 +2023-05-10 21:05:46,407 WARNING [train.py:1182] (1/2) Exclude cut with ID 4860-13185-0032-76709-0_sp0.9 from training. Duration: 23.07775 +2023-05-10 21:05:51,202 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer1.prob, batch_count=607850.0, ans=0.125 +2023-05-10 21:06:04,875 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=3.32 vs. limit=10.0 +2023-05-10 21:06:10,628 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer1.prob, batch_count=607900.0, ans=0.125 +2023-05-10 21:06:12,654 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=4.41 vs. limit=15.0 +2023-05-10 21:06:18,146 INFO [train.py:1021] (1/2) Epoch 34, batch 1600, loss[loss=0.1716, simple_loss=0.2636, pruned_loss=0.03976, over 35941.00 frames. ], tot_loss[loss=0.1636, simple_loss=0.2567, pruned_loss=0.03527, over 7194969.82 frames. ], batch size: 133, lr: 3.22e-03, grad_scale: 32.0 +2023-05-10 21:06:29,341 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=607950.0, ans=0.125 +2023-05-10 21:06:31,936 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.433e+02 2.969e+02 3.541e+02 4.268e+02 6.968e+02, threshold=7.083e+02, percent-clipped=0.0 +2023-05-10 21:06:38,298 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0_sp0.9 from training. Duration: 24.9333125 +2023-05-10 21:06:38,610 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([4.3544, 3.8886, 3.7478, 4.0402], device='cuda:1') +2023-05-10 21:06:41,630 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=608000.0, ans=0.1 +2023-05-10 21:06:47,528 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.scale_min, batch_count=608050.0, ans=0.2 +2023-05-10 21:07:28,708 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=608150.0, ans=0.1 +2023-05-10 21:07:29,953 WARNING [train.py:1182] (1/2) Exclude cut with ID 5118-111612-0016-124680-0_sp0.9 from training. Duration: 20.388875 +2023-05-10 21:07:35,429 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0_sp1.1 from training. Duration: 20.3590625 +2023-05-10 21:07:40,987 INFO [train.py:1021] (1/2) Epoch 34, batch 1650, loss[loss=0.1628, simple_loss=0.2544, pruned_loss=0.03555, over 36879.00 frames. ], tot_loss[loss=0.1633, simple_loss=0.2565, pruned_loss=0.03507, over 7193164.00 frames. ], batch size: 96, lr: 3.22e-03, grad_scale: 32.0 +2023-05-10 21:08:55,360 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.out_combiner.scale_min, batch_count=608350.0, ans=0.2 +2023-05-10 21:09:00,683 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=2.55 vs. limit=6.0 +2023-05-10 21:09:16,880 WARNING [train.py:1182] (1/2) Exclude cut with ID 3557-8342-0013-54691-0_sp1.1 from training. Duration: 0.836375 +2023-05-10 21:09:20,898 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.attention_skip_rate, batch_count=608400.0, ans=0.0 +2023-05-10 21:09:26,571 INFO [train.py:1021] (1/2) Epoch 34, batch 1700, loss[loss=0.1804, simple_loss=0.2718, pruned_loss=0.04454, over 35927.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.2572, pruned_loss=0.03568, over 7208400.64 frames. ], batch size: 133, lr: 3.22e-03, grad_scale: 32.0 +2023-05-10 21:09:40,101 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.659e+02 3.167e+02 3.611e+02 4.374e+02 7.091e+02, threshold=7.222e+02, percent-clipped=1.0 +2023-05-10 21:10:00,941 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.0.self_attn_weights, attn_weights_entropy = tensor([6.4478, 5.7191, 5.6016, 6.1860], device='cuda:1') +2023-05-10 21:10:07,980 WARNING [train.py:1182] (1/2) Exclude cut with ID 8565-290391-0049-67394-0_sp0.9 from training. Duration: 21.3166875 +2023-05-10 21:10:39,544 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0029-104863-0_sp0.9 from training. Duration: 22.1055625 +2023-05-10 21:10:50,252 INFO [train.py:1021] (1/2) Epoch 34, batch 1750, loss[loss=0.1763, simple_loss=0.2705, pruned_loss=0.04105, over 34805.00 frames. ], tot_loss[loss=0.165, simple_loss=0.2573, pruned_loss=0.03636, over 7195837.89 frames. ], batch size: 145, lr: 3.22e-03, grad_scale: 32.0 +2023-05-10 21:10:51,794 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0_sp1.1 from training. Duration: 21.77725 +2023-05-10 21:11:14,260 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0_sp0.9 from training. Duration: 27.8166875 +2023-05-10 21:11:16,045 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.out_combiner.scale_min, batch_count=608750.0, ans=0.2 +2023-05-10 21:11:36,326 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=3.38 vs. limit=6.0 +2023-05-10 21:11:40,239 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0_sp1.1 from training. Duration: 22.5090625 +2023-05-10 21:11:47,893 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0 from training. Duration: 25.035 +2023-05-10 21:11:49,881 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=608850.0, ans=0.0 +2023-05-10 21:12:05,660 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([4.7542, 4.0915, 2.4805, 2.7055], device='cuda:1') +2023-05-10 21:12:09,057 WARNING [train.py:1182] (1/2) Exclude cut with ID 774-127930-0014-10412-0_sp1.1 from training. Duration: 0.95 +2023-05-10 21:12:10,578 INFO [train.py:1021] (1/2) Epoch 34, batch 1800, loss[loss=0.1494, simple_loss=0.2303, pruned_loss=0.0343, over 36969.00 frames. ], tot_loss[loss=0.1659, simple_loss=0.2573, pruned_loss=0.03726, over 7192435.04 frames. ], batch size: 86, lr: 3.22e-03, grad_scale: 32.0 +2023-05-10 21:12:15,869 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.attention_skip_rate, batch_count=608950.0, ans=0.0 +2023-05-10 21:12:24,448 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.824e+02 3.682e+02 4.466e+02 5.170e+02 1.072e+03, threshold=8.931e+02, percent-clipped=4.0 +2023-05-10 21:12:25,105 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=7.21 vs. limit=15.0 +2023-05-10 21:12:29,426 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0_sp0.9 from training. Duration: 0.92225 +2023-05-10 21:12:47,550 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer1.prob, batch_count=609050.0, ans=0.125 +2023-05-10 21:12:50,939 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff2_skip_rate, batch_count=609050.0, ans=0.0 +2023-05-10 21:12:52,650 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=609050.0, ans=0.125 +2023-05-10 21:13:01,795 WARNING [train.py:1182] (1/2) Exclude cut with ID 4511-76322-0006-80011-0 from training. Duration: 21.97 +2023-05-10 21:13:22,643 WARNING [train.py:1182] (1/2) Exclude cut with ID 7492-105653-0055-62765-0_sp0.9 from training. Duration: 21.97225 +2023-05-10 21:13:24,157 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0_sp0.9 from training. Duration: 25.3333125 +2023-05-10 21:13:36,205 INFO [train.py:1021] (1/2) Epoch 34, batch 1850, loss[loss=0.186, simple_loss=0.2701, pruned_loss=0.05091, over 36783.00 frames. ], tot_loss[loss=0.1667, simple_loss=0.2569, pruned_loss=0.03818, over 7161475.79 frames. ], batch size: 113, lr: 3.22e-03, grad_scale: 32.0 +2023-05-10 21:13:36,320 WARNING [train.py:1182] (1/2) Exclude cut with ID 5172-29468-0015-19128-0_sp0.9 from training. Duration: 21.5055625 +2023-05-10 21:13:46,828 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0_sp1.1 from training. Duration: 20.72725 +2023-05-10 21:13:57,752 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=4.63 vs. limit=15.0 +2023-05-10 21:14:15,840 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=5.91 vs. limit=15.0 +2023-05-10 21:14:22,725 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0_sp0.9 from training. Duration: 26.32775 +2023-05-10 21:14:33,667 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff2_skip_rate, batch_count=609350.0, ans=0.0 +2023-05-10 21:14:39,579 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass_mid.scale_min, batch_count=609400.0, ans=0.2 +2023-05-10 21:14:44,061 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.0.self_attn_weights, attn_weights_entropy = tensor([6.2391, 5.5943, 5.4561, 6.0281], device='cuda:1') +2023-05-10 21:14:47,201 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.skip_rate, batch_count=609400.0, ans=0.07 +2023-05-10 21:14:53,012 INFO [train.py:1021] (1/2) Epoch 34, batch 1900, loss[loss=0.1771, simple_loss=0.2695, pruned_loss=0.04234, over 32520.00 frames. ], tot_loss[loss=0.1677, simple_loss=0.2574, pruned_loss=0.03898, over 7158504.71 frames. ], batch size: 170, lr: 3.22e-03, grad_scale: 32.0 +2023-05-10 21:14:54,579 WARNING [train.py:1182] (1/2) Exclude cut with ID 3867-173237-0077-144769-0 from training. Duration: 20.025 +2023-05-10 21:14:56,898 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer2.prob, batch_count=609450.0, ans=0.125 +2023-05-10 21:14:59,816 WARNING [train.py:1182] (1/2) Exclude cut with ID 6709-74022-0004-86860-0_sp1.1 from training. Duration: 0.9409375 +2023-05-10 21:15:01,201 WARNING [train.py:1182] (1/2) Exclude cut with ID 4757-1811-0023-62229-0_sp0.9 from training. Duration: 21.37775 +2023-05-10 21:15:04,636 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.2014, 3.1022, 4.7135, 3.2814], device='cuda:1') +2023-05-10 21:15:07,675 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 3.012e+02 3.564e+02 3.880e+02 4.723e+02 7.226e+02, threshold=7.760e+02, percent-clipped=0.0 +2023-05-10 21:15:18,762 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.scale_min, batch_count=609500.0, ans=0.2 +2023-05-10 21:15:21,466 WARNING [train.py:1182] (1/2) Exclude cut with ID 1250-135782-0004-25974-0_sp0.9 from training. Duration: 21.17225 +2023-05-10 21:15:22,966 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0_sp0.9 from training. Duration: 27.511125 +2023-05-10 21:15:24,834 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=609550.0, ans=0.1 +2023-05-10 21:15:26,443 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.9069, 3.8787, 4.4771, 4.6916], device='cuda:1') +2023-05-10 21:15:31,176 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.5868, 3.7731, 4.1668, 3.8282], device='cuda:1') +2023-05-10 21:15:45,348 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=609600.0, ans=0.125 +2023-05-10 21:15:55,975 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0 from training. Duration: 22.8 +2023-05-10 21:16:01,087 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0 from training. Duration: 22.585 +2023-05-10 21:16:07,660 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3017, 3.9900, 3.6841, 3.9964, 3.3360, 3.0225, 3.4269, 2.9424], + device='cuda:1') +2023-05-10 21:16:07,689 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=609650.0, ans=0.125 +2023-05-10 21:16:13,113 INFO [train.py:1021] (1/2) Epoch 34, batch 1950, loss[loss=0.1507, simple_loss=0.2307, pruned_loss=0.03532, over 37049.00 frames. ], tot_loss[loss=0.1683, simple_loss=0.2574, pruned_loss=0.03962, over 7167137.59 frames. ], batch size: 88, lr: 3.22e-03, grad_scale: 32.0 +2023-05-10 21:16:15,084 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.max_abs, batch_count=609700.0, ans=10.0 +2023-05-10 21:16:37,070 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0001-146967-0_sp0.9 from training. Duration: 22.0166875 +2023-05-10 21:16:50,700 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=5.34 vs. limit=10.0 +2023-05-10 21:16:56,290 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0_sp1.1 from training. Duration: 24.395375 +2023-05-10 21:16:58,483 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=11.14 vs. limit=15.0 +2023-05-10 21:17:04,329 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0_sp0.9 from training. Duration: 27.47775 +2023-05-10 21:17:07,541 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.scale_min, batch_count=609850.0, ans=0.2 +2023-05-10 21:17:08,798 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0_sp0.9 from training. Duration: 24.8833125 +2023-05-10 21:17:11,636 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0 from training. Duration: 23.39 +2023-05-10 21:17:18,011 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0_sp0.9 from training. Duration: 28.72225 +2023-05-10 21:17:28,042 WARNING [train.py:1182] (1/2) Exclude cut with ID 585-294811-0110-133686-0_sp0.9 from training. Duration: 20.8944375 +2023-05-10 21:17:37,554 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff3_skip_rate, batch_count=609950.0, ans=0.0 +2023-05-10 21:17:38,539 INFO [train.py:1021] (1/2) Epoch 34, batch 2000, loss[loss=0.2084, simple_loss=0.2824, pruned_loss=0.06723, over 23623.00 frames. ], tot_loss[loss=0.1694, simple_loss=0.2581, pruned_loss=0.04031, over 7147287.38 frames. ], batch size: 233, lr: 3.22e-03, grad_scale: 32.0 +2023-05-10 21:17:44,821 WARNING [train.py:1182] (1/2) Exclude cut with ID 5796-66357-0007-116447-0_sp0.9 from training. Duration: 23.8444375 +2023-05-10 21:17:52,999 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.943e+02 3.647e+02 4.147e+02 4.742e+02 6.641e+02, threshold=8.294e+02, percent-clipped=0.0 +2023-05-10 21:17:53,456 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 21:17:55,674 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff2_skip_rate, batch_count=610000.0, ans=0.0 +2023-05-10 21:18:05,301 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=610000.0, ans=0.125 +2023-05-10 21:18:08,417 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.4770, 4.7237, 2.3913, 2.5967], device='cuda:1') +2023-05-10 21:18:12,583 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0 from training. Duration: 25.85 +2023-05-10 21:18:12,604 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0023-13010-0 from training. Duration: 21.39 +2023-05-10 21:18:24,630 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0 from training. Duration: 27.92 +2023-05-10 21:18:25,041 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.4240, 3.5583, 3.9224, 3.6267], device='cuda:1') +2023-05-10 21:18:56,862 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0039-130165-0_sp0.9 from training. Duration: 20.661125 +2023-05-10 21:18:58,285 INFO [train.py:1021] (1/2) Epoch 34, batch 2050, loss[loss=0.1594, simple_loss=0.247, pruned_loss=0.03587, over 36925.00 frames. ], tot_loss[loss=0.1693, simple_loss=0.2577, pruned_loss=0.04049, over 7183069.39 frames. ], batch size: 100, lr: 3.22e-03, grad_scale: 32.0 +2023-05-10 21:19:07,915 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer2.prob, batch_count=610200.0, ans=0.125 +2023-05-10 21:19:11,125 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=610200.0, ans=0.125 +2023-05-10 21:19:16,258 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=7.68 vs. limit=15.0 +2023-05-10 21:19:20,053 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0043-15874-0_sp0.9 from training. Duration: 20.07225 +2023-05-10 21:19:29,335 WARNING [train.py:1182] (1/2) Exclude cut with ID 1085-156170-0017-128270-0 from training. Duration: 21.01 +2023-05-10 21:19:46,098 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=192, metric=9.53 vs. limit=15.0 +2023-05-10 21:20:14,403 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass_mid.scale_min, batch_count=610450.0, ans=0.2 +2023-05-10 21:20:15,513 INFO [train.py:1021] (1/2) Epoch 34, batch 2100, loss[loss=0.1561, simple_loss=0.2379, pruned_loss=0.0371, over 37029.00 frames. ], tot_loss[loss=0.1693, simple_loss=0.2574, pruned_loss=0.04062, over 7179699.47 frames. ], batch size: 88, lr: 3.22e-03, grad_scale: 32.0 +2023-05-10 21:20:29,109 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.818e+02 3.538e+02 3.931e+02 4.395e+02 7.746e+02, threshold=7.862e+02, percent-clipped=0.0 +2023-05-10 21:20:30,927 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer1.min_positive, batch_count=610500.0, ans=0.025 +2023-05-10 21:20:40,897 WARNING [train.py:1182] (1/2) Exclude cut with ID 2195-150901-0045-59933-0 from training. Duration: 20.65 +2023-05-10 21:20:43,060 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=2.86 vs. limit=12.0 +2023-05-10 21:20:47,135 WARNING [train.py:1182] (1/2) Exclude cut with ID 5796-66357-0007-116447-0 from training. Duration: 21.46 +2023-05-10 21:21:04,062 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer1.min_positive, batch_count=610600.0, ans=0.025 +2023-05-10 21:21:11,372 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=610600.0, ans=0.0 +2023-05-10 21:21:17,692 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([1.6945, 2.9115, 4.4533, 2.8436], device='cuda:1') +2023-05-10 21:21:19,110 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff3_skip_rate, batch_count=610650.0, ans=0.0 +2023-05-10 21:21:20,655 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 21:21:20,674 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass_mid.scale_min, batch_count=610650.0, ans=0.2 +2023-05-10 21:21:20,728 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff3_skip_rate, batch_count=610650.0, ans=0.0 +2023-05-10 21:21:32,820 INFO [train.py:1021] (1/2) Epoch 34, batch 2150, loss[loss=0.1583, simple_loss=0.2434, pruned_loss=0.03662, over 37149.00 frames. ], tot_loss[loss=0.1695, simple_loss=0.2571, pruned_loss=0.04094, over 7158489.13 frames. ], batch size: 98, lr: 3.22e-03, grad_scale: 32.0 +2023-05-10 21:21:36,489 WARNING [train.py:1182] (1/2) Exclude cut with ID 3557-8342-0013-54691-0 from training. Duration: 0.92 +2023-05-10 21:21:44,263 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.5456, 4.8729, 5.0307, 4.7100], device='cuda:1') +2023-05-10 21:21:47,052 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0023-13010-0_sp0.9 from training. Duration: 23.7666875 +2023-05-10 21:22:25,954 WARNING [train.py:1182] (1/2) Exclude cut with ID 8544-281189-0060-101339-0_sp0.9 from training. Duration: 20.861125 +2023-05-10 21:22:37,427 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-65654-0031-41259-0_sp0.9 from training. Duration: 22.711125 +2023-05-10 21:22:42,341 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff3_skip_rate, batch_count=610900.0, ans=0.0 +2023-05-10 21:22:52,908 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 21:22:53,953 INFO [train.py:1021] (1/2) Epoch 34, batch 2200, loss[loss=0.1733, simple_loss=0.2655, pruned_loss=0.04054, over 32519.00 frames. ], tot_loss[loss=0.1702, simple_loss=0.2578, pruned_loss=0.04125, over 7159503.04 frames. ], batch size: 170, lr: 3.22e-03, grad_scale: 32.0 +2023-05-10 21:22:57,468 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module2.balancer1.prob, batch_count=610950.0, ans=0.125 +2023-05-10 21:22:58,945 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=610950.0, ans=0.125 +2023-05-10 21:23:07,666 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.873e+02 3.660e+02 4.191e+02 4.524e+02 6.713e+02, threshold=8.383e+02, percent-clipped=0.0 +2023-05-10 21:23:14,100 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=611000.0, ans=0.0 +2023-05-10 21:23:19,358 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer1.prob, batch_count=611000.0, ans=0.125 +2023-05-10 21:23:23,521 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0_sp1.1 from training. Duration: 22.986375 +2023-05-10 21:23:25,404 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.1078, 4.4872, 3.3102, 3.2250], device='cuda:1') +2023-05-10 21:23:40,775 WARNING [train.py:1182] (1/2) Exclude cut with ID 8040-260924-0003-80960-0_sp0.9 from training. Duration: 22.07225 +2023-05-10 21:23:41,372 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=6.32 vs. limit=15.0 +2023-05-10 21:23:46,800 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0045-26330-0_sp0.9 from training. Duration: 20.3055625 +2023-05-10 21:23:49,742 WARNING [train.py:1182] (1/2) Exclude cut with ID 6356-271890-0060-94317-0_sp0.9 from training. Duration: 20.72225 +2023-05-10 21:24:03,345 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.balancer.max_positive, batch_count=611150.0, ans=0.95 +2023-05-10 21:24:08,521 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0_sp1.1 from training. Duration: 22.4818125 +2023-05-10 21:24:12,386 INFO [train.py:1021] (1/2) Epoch 34, batch 2250, loss[loss=0.1768, simple_loss=0.2641, pruned_loss=0.04479, over 34355.00 frames. ], tot_loss[loss=0.1698, simple_loss=0.2574, pruned_loss=0.04112, over 7184615.45 frames. ], batch size: 144, lr: 3.21e-03, grad_scale: 32.0 +2023-05-10 21:24:42,977 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0_sp0.9 from training. Duration: 25.0944375 +2023-05-10 21:24:46,131 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0047-104881-0 from training. Duration: 21.515 +2023-05-10 21:24:46,423 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([4.6985, 3.7899, 4.2574, 4.2741], device='cuda:1') +2023-05-10 21:24:53,621 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0_sp0.9 from training. Duration: 27.02225 +2023-05-10 21:24:58,216 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0010-62480-0_sp0.9 from training. Duration: 22.22225 +2023-05-10 21:25:07,079 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0085-44554-0_sp0.9 from training. Duration: 20.85 +2023-05-10 21:25:10,292 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=611350.0, ans=0.0 +2023-05-10 21:25:18,766 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=611400.0, ans=0.125 +2023-05-10 21:25:36,330 INFO [train.py:1021] (1/2) Epoch 34, batch 2300, loss[loss=0.1544, simple_loss=0.2425, pruned_loss=0.03311, over 37162.00 frames. ], tot_loss[loss=0.1703, simple_loss=0.2578, pruned_loss=0.04139, over 7172429.63 frames. ], batch size: 93, lr: 3.21e-03, grad_scale: 32.0 +2023-05-10 21:25:42,198 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=611450.0, ans=0.125 +2023-05-10 21:25:43,519 WARNING [train.py:1182] (1/2) Exclude cut with ID 4295-39940-0007-92567-0 from training. Duration: 21.54 +2023-05-10 21:25:49,489 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.3949, 3.5116, 3.1802, 4.0356, 2.1414, 3.4351, 4.0480, 3.4907], + device='cuda:1') +2023-05-10 21:25:51,278 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0_sp1.1 from training. Duration: 20.5318125 +2023-05-10 21:25:52,918 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.912e+02 3.351e+02 3.673e+02 4.167e+02 6.199e+02, threshold=7.347e+02, percent-clipped=0.0 +2023-05-10 21:25:53,565 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=611500.0, ans=0.125 +2023-05-10 21:26:03,752 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0012-134311-0_sp0.9 from training. Duration: 21.9333125 +2023-05-10 21:26:07,045 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module1.balancer2.prob, batch_count=611500.0, ans=0.125 +2023-05-10 21:26:10,109 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=611500.0, ans=0.125 +2023-05-10 21:26:17,229 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=611550.0, ans=0.0 +2023-05-10 21:26:24,955 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.1.self_attn_weights, attn_weights_entropy = tensor([6.2482, 5.4705, 5.5268, 6.0961], device='cuda:1') +2023-05-10 21:26:25,097 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=611550.0, ans=0.1 +2023-05-10 21:26:53,116 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=611650.0, ans=0.125 +2023-05-10 21:26:57,340 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0025-130151-0_sp0.9 from training. Duration: 21.7944375 +2023-05-10 21:26:57,530 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=611650.0, ans=0.125 +2023-05-10 21:27:00,211 INFO [train.py:1021] (1/2) Epoch 34, batch 2350, loss[loss=0.163, simple_loss=0.2565, pruned_loss=0.03481, over 37092.00 frames. ], tot_loss[loss=0.1699, simple_loss=0.2574, pruned_loss=0.04122, over 7189142.69 frames. ], batch size: 103, lr: 3.21e-03, grad_scale: 32.0 +2023-05-10 21:27:09,976 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0002-12989-0_sp0.9 from training. Duration: 22.4666875 +2023-05-10 21:27:15,904 WARNING [train.py:1182] (1/2) Exclude cut with ID 6121-9014-0076-24124-0 from training. Duration: 21.635 +2023-05-10 21:27:20,966 WARNING [train.py:1182] (1/2) Exclude cut with ID 6121-9014-0076-24124-0_sp0.9 from training. Duration: 24.038875 +2023-05-10 21:27:21,434 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=256, metric=8.65 vs. limit=22.5 +2023-05-10 21:27:27,010 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass.skip_rate, batch_count=611750.0, ans=0.04949747468305833 +2023-05-10 21:27:37,801 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_skip_rate, batch_count=611800.0, ans=0.0 +2023-05-10 21:27:39,149 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward3.hidden_balancer.prob, batch_count=611800.0, ans=0.125 +2023-05-10 21:27:41,579 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.conv_module2.whiten, num_groups=1, num_channels=192, metric=6.88 vs. limit=15.0 +2023-05-10 21:27:46,846 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.hidden_balancer.prob, batch_count=611850.0, ans=0.125 +2023-05-10 21:27:48,987 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=192, metric=13.12 vs. limit=15.0 +2023-05-10 21:28:06,845 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0_sp1.1 from training. Duration: 21.786375 +2023-05-10 21:28:13,578 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=256, metric=8.03 vs. limit=22.5 +2023-05-10 21:28:17,997 INFO [train.py:1021] (1/2) Epoch 34, batch 2400, loss[loss=0.1611, simple_loss=0.2431, pruned_loss=0.03957, over 36965.00 frames. ], tot_loss[loss=0.1704, simple_loss=0.2576, pruned_loss=0.04157, over 7141882.35 frames. ], batch size: 91, lr: 3.21e-03, grad_scale: 32.0 +2023-05-10 21:28:19,526 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0002-12989-0 from training. Duration: 20.22 +2023-05-10 21:28:31,160 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=192, metric=9.18 vs. limit=15.0 +2023-05-10 21:28:31,594 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.883e+02 3.487e+02 3.845e+02 4.460e+02 8.111e+02, threshold=7.690e+02, percent-clipped=1.0 +2023-05-10 21:28:37,078 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=4.63 vs. limit=15.0 +2023-05-10 21:28:48,570 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.4784, 4.8213, 4.9853, 4.6514], device='cuda:1') +2023-05-10 21:29:10,693 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward3.hidden_balancer.prob, batch_count=612100.0, ans=0.125 +2023-05-10 21:29:34,623 INFO [train.py:1021] (1/2) Epoch 34, batch 2450, loss[loss=0.1585, simple_loss=0.2348, pruned_loss=0.04109, over 37051.00 frames. ], tot_loss[loss=0.1705, simple_loss=0.2575, pruned_loss=0.04172, over 7096385.75 frames. ], batch size: 88, lr: 3.21e-03, grad_scale: 16.0 +2023-05-10 21:30:27,873 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0 from training. Duration: 25.285 +2023-05-10 21:30:29,641 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer2.prob, batch_count=612350.0, ans=0.125 +2023-05-10 21:30:40,549 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=3.44 vs. limit=15.0 +2023-05-10 21:31:12,216 INFO [train.py:1021] (1/2) Epoch 34, batch 2500, loss[loss=0.1745, simple_loss=0.2602, pruned_loss=0.04438, over 32330.00 frames. ], tot_loss[loss=0.1707, simple_loss=0.2578, pruned_loss=0.04175, over 7108254.01 frames. ], batch size: 170, lr: 3.21e-03, grad_scale: 16.0 +2023-05-10 21:31:29,874 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=5.77 vs. limit=15.0 +2023-05-10 21:31:32,220 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.722e+02 3.613e+02 4.324e+02 5.537e+02 9.031e+02, threshold=8.649e+02, percent-clipped=5.0 +2023-05-10 21:31:45,011 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=4.67 vs. limit=15.0 +2023-05-10 21:31:52,675 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.0.self_attn_weights, attn_weights_entropy = tensor([6.2517, 5.6187, 5.4153, 6.0333], device='cuda:1') +2023-05-10 21:32:00,925 WARNING [train.py:1182] (1/2) Exclude cut with ID 811-130148-0001-63453-0_sp0.9 from training. Duration: 20.861125 +2023-05-10 21:32:06,350 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=192, metric=4.40 vs. limit=15.0 +2023-05-10 21:32:27,287 WARNING [train.py:1182] (1/2) Exclude cut with ID 6010-56788-0055-90261-0 from training. Duration: 20.88 +2023-05-10 21:32:40,997 INFO [train.py:1021] (1/2) Epoch 34, batch 2550, loss[loss=0.1942, simple_loss=0.2718, pruned_loss=0.05831, over 24533.00 frames. ], tot_loss[loss=0.1704, simple_loss=0.2576, pruned_loss=0.04154, over 7103916.22 frames. ], batch size: 233, lr: 3.21e-03, grad_scale: 16.0 +2023-05-10 21:33:11,848 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0045-15876-0_sp0.9 from training. Duration: 23.4166875 +2023-05-10 21:33:34,007 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass_mid.scale_min, batch_count=612800.0, ans=0.2 +2023-05-10 21:34:11,458 INFO [train.py:1021] (1/2) Epoch 34, batch 2600, loss[loss=0.1856, simple_loss=0.273, pruned_loss=0.04907, over 35827.00 frames. ], tot_loss[loss=0.1706, simple_loss=0.2577, pruned_loss=0.04178, over 7101851.78 frames. ], batch size: 133, lr: 3.21e-03, grad_scale: 16.0 +2023-05-10 21:34:27,619 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.balancer2.prob, batch_count=613000.0, ans=0.125 +2023-05-10 21:34:28,935 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.783e+02 3.641e+02 4.131e+02 5.132e+02 9.178e+02, threshold=8.262e+02, percent-clipped=2.0 +2023-05-10 21:34:38,329 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0006-134305-0 from training. Duration: 21.24 +2023-05-10 21:34:39,799 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0047-104881-0_sp0.9 from training. Duration: 23.9055625 +2023-05-10 21:34:40,156 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer2.prob, batch_count=613000.0, ans=0.125 +2023-05-10 21:34:46,211 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.balancer2.prob, batch_count=613050.0, ans=0.125 +2023-05-10 21:34:46,944 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=192, metric=6.87 vs. limit=15.0 +2023-05-10 21:35:07,989 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.ff2_skip_rate, batch_count=613100.0, ans=0.0 +2023-05-10 21:35:14,142 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff2_skip_rate, batch_count=613100.0, ans=0.0 +2023-05-10 21:35:15,288 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0_sp0.9 from training. Duration: 25.988875 +2023-05-10 21:35:25,162 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0001-134300-0_sp0.9 from training. Duration: 20.67225 +2023-05-10 21:35:25,406 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.ff2_skip_rate, batch_count=613150.0, ans=0.0 +2023-05-10 21:35:36,630 INFO [train.py:1021] (1/2) Epoch 34, batch 2650, loss[loss=0.1808, simple_loss=0.2648, pruned_loss=0.0484, over 36314.00 frames. ], tot_loss[loss=0.1706, simple_loss=0.2577, pruned_loss=0.04178, over 7095990.94 frames. ], batch size: 126, lr: 3.21e-03, grad_scale: 16.0 +2023-05-10 21:35:54,036 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.balancer2.prob, batch_count=613200.0, ans=0.125 +2023-05-10 21:36:31,456 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer1.prob, batch_count=613300.0, ans=0.125 +2023-05-10 21:36:35,988 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 21:36:37,596 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0038-41224-0 from training. Duration: 20.34 +2023-05-10 21:36:39,884 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.max_abs, batch_count=613300.0, ans=10.0 +2023-05-10 21:36:50,844 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.scale_min, batch_count=613350.0, ans=0.2 +2023-05-10 21:37:06,563 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder_embed.conv.2.prob, batch_count=613400.0, ans=0.125 +2023-05-10 21:37:08,460 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=613400.0, ans=0.0 +2023-05-10 21:37:24,590 INFO [train.py:1021] (1/2) Epoch 34, batch 2700, loss[loss=0.1809, simple_loss=0.2708, pruned_loss=0.0455, over 37041.00 frames. ], tot_loss[loss=0.1699, simple_loss=0.2571, pruned_loss=0.04134, over 7120594.41 frames. ], batch size: 116, lr: 3.21e-03, grad_scale: 16.0 +2023-05-10 21:37:42,240 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.659e+02 3.556e+02 4.055e+02 4.683e+02 6.600e+02, threshold=8.110e+02, percent-clipped=0.0 +2023-05-10 21:37:48,581 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.prob, batch_count=613500.0, ans=0.125 +2023-05-10 21:38:11,648 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0_sp0.9 from training. Duration: 25.061125 +2023-05-10 21:38:27,764 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=6.47 vs. limit=15.0 +2023-05-10 21:38:28,970 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0 from training. Duration: 0.83 +2023-05-10 21:38:40,305 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=613650.0, ans=0.125 +2023-05-10 21:38:41,897 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 21:38:49,867 INFO [train.py:1021] (1/2) Epoch 34, batch 2750, loss[loss=0.1554, simple_loss=0.2416, pruned_loss=0.03464, over 36951.00 frames. ], tot_loss[loss=0.17, simple_loss=0.2574, pruned_loss=0.04128, over 7138033.86 frames. ], batch size: 95, lr: 3.21e-03, grad_scale: 16.0 +2023-05-10 21:38:55,857 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0 from training. Duration: 24.73 +2023-05-10 21:39:04,261 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=5.04 vs. limit=15.0 +2023-05-10 21:39:09,433 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0 from training. Duration: 23.965 +2023-05-10 21:39:16,312 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer1.prob, batch_count=613750.0, ans=0.125 +2023-05-10 21:39:17,612 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0030-146996-0_sp0.9 from training. Duration: 22.088875 +2023-05-10 21:39:35,530 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0006-134305-0_sp0.9 from training. Duration: 23.6 +2023-05-10 21:39:44,866 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=4.68 vs. limit=15.0 +2023-05-10 21:40:08,579 INFO [train.py:1021] (1/2) Epoch 34, batch 2800, loss[loss=0.1594, simple_loss=0.2389, pruned_loss=0.03996, over 37067.00 frames. ], tot_loss[loss=0.1696, simple_loss=0.2568, pruned_loss=0.04119, over 7105292.73 frames. ], batch size: 88, lr: 3.21e-03, grad_scale: 32.0 +2023-05-10 21:40:18,830 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=4.33 vs. limit=15.0 +2023-05-10 21:40:24,225 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.992e+02 3.575e+02 4.005e+02 4.691e+02 7.512e+02, threshold=8.010e+02, percent-clipped=0.0 +2023-05-10 21:40:42,677 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer2.min_positive, batch_count=614050.0, ans=0.05 +2023-05-10 21:41:14,136 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=614150.0, ans=0.1 +2023-05-10 21:41:25,220 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=5.52 vs. limit=10.0 +2023-05-10 21:41:25,827 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0 from training. Duration: 23.795 +2023-05-10 21:41:26,017 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=614150.0, ans=0.0 +2023-05-10 21:41:30,212 INFO [train.py:1021] (1/2) Epoch 34, batch 2850, loss[loss=0.1695, simple_loss=0.2657, pruned_loss=0.03667, over 37167.00 frames. ], tot_loss[loss=0.1694, simple_loss=0.2565, pruned_loss=0.04112, over 7122568.12 frames. ], batch size: 112, lr: 3.21e-03, grad_scale: 32.0 +2023-05-10 21:41:46,356 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0_sp1.1 from training. Duration: 21.5409375 +2023-05-10 21:41:46,904 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3898, 4.6474, 2.3149, 2.5044], device='cuda:1') +2023-05-10 21:41:48,420 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0_sp0.9 from training. Duration: 24.97775 +2023-05-10 21:41:58,565 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=4.11 vs. limit=10.0 +2023-05-10 21:42:03,160 WARNING [train.py:1182] (1/2) Exclude cut with ID 1085-156170-0017-128270-0_sp0.9 from training. Duration: 23.3444375 +2023-05-10 21:42:03,295 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder_embed.convnext.hidden_balancer.prob, batch_count=614250.0, ans=0.125 +2023-05-10 21:42:14,300 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn1.whiten, num_groups=1, num_channels=256, metric=8.51 vs. limit=22.5 +2023-05-10 21:42:34,308 WARNING [train.py:1182] (1/2) Exclude cut with ID 6010-56788-0055-90261-0_sp0.9 from training. Duration: 23.2 +2023-05-10 21:42:40,268 WARNING [train.py:1182] (1/2) Exclude cut with ID 5653-46179-0060-117930-0_sp0.9 from training. Duration: 21.17225 +2023-05-10 21:42:40,433 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.0399, 4.6019, 4.4148, 4.9839], device='cuda:1') +2023-05-10 21:42:53,623 INFO [train.py:1021] (1/2) Epoch 34, batch 2900, loss[loss=0.1793, simple_loss=0.2672, pruned_loss=0.04569, over 36725.00 frames. ], tot_loss[loss=0.17, simple_loss=0.2572, pruned_loss=0.04134, over 7115814.33 frames. ], batch size: 122, lr: 3.21e-03, grad_scale: 32.0 +2023-05-10 21:43:01,738 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0_sp0.9 from training. Duration: 24.6555625 +2023-05-10 21:43:09,170 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.759e+02 3.560e+02 4.015e+02 4.654e+02 7.519e+02, threshold=8.031e+02, percent-clipped=0.0 +2023-05-10 21:43:24,731 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer1.prob, batch_count=614550.0, ans=0.125 +2023-05-10 21:44:01,251 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn2.whiten, num_groups=1, num_channels=256, metric=6.29 vs. limit=22.5 +2023-05-10 21:44:02,060 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-65654-0031-41259-0 from training. Duration: 20.44 +2023-05-10 21:44:10,934 INFO [train.py:1021] (1/2) Epoch 34, batch 2950, loss[loss=0.1779, simple_loss=0.2735, pruned_loss=0.04116, over 36908.00 frames. ], tot_loss[loss=0.1701, simple_loss=0.2575, pruned_loss=0.04137, over 7122727.21 frames. ], batch size: 105, lr: 3.21e-03, grad_scale: 32.0 +2023-05-10 21:44:16,750 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0018-132285-0_sp0.9 from training. Duration: 23.45 +2023-05-10 21:44:29,566 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer2.prob, batch_count=614750.0, ans=0.125 +2023-05-10 21:44:49,597 WARNING [train.py:1182] (1/2) Exclude cut with ID 6945-60535-0076-12784-0_sp0.9 from training. Duration: 20.52225 +2023-05-10 21:44:58,754 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0 from training. Duration: 22.19 +2023-05-10 21:45:06,508 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_skip_rate, batch_count=614850.0, ans=0.0 +2023-05-10 21:45:06,810 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_whiten.whitening_limit, batch_count=614850.0, ans=15.0 +2023-05-10 21:45:09,213 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0_sp1.1 from training. Duration: 25.3818125 +2023-05-10 21:45:09,451 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.skip_rate, batch_count=614850.0, ans=0.04949747468305833 +2023-05-10 21:45:23,874 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer2.prob, batch_count=614900.0, ans=0.125 +2023-05-10 21:45:24,059 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=4.22 vs. limit=12.0 +2023-05-10 21:45:26,543 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0_sp0.9 from training. Duration: 28.0944375 +2023-05-10 21:45:26,935 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=614950.0, ans=0.125 +2023-05-10 21:45:27,977 INFO [train.py:1021] (1/2) Epoch 34, batch 3000, loss[loss=0.1589, simple_loss=0.2498, pruned_loss=0.03401, over 37052.00 frames. ], tot_loss[loss=0.1696, simple_loss=0.2569, pruned_loss=0.04112, over 7137695.43 frames. ], batch size: 94, lr: 3.20e-03, grad_scale: 32.0 +2023-05-10 21:45:27,978 INFO [train.py:1048] (1/2) Computing validation loss +2023-05-10 21:45:36,784 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.2273, 3.4851, 2.1237, 2.2495], device='cuda:1') +2023-05-10 21:45:39,172 INFO [train.py:1057] (1/2) Epoch 34, validation: loss=0.1518, simple_loss=0.2525, pruned_loss=0.02558, over 944034.00 frames. +2023-05-10 21:45:39,173 INFO [train.py:1058] (1/2) Maximum memory allocated so far is 18883MB +2023-05-10 21:45:45,117 WARNING [train.py:1182] (1/2) Exclude cut with ID 2195-150901-0045-59933-0_sp0.9 from training. Duration: 22.9444375 +2023-05-10 21:45:45,545 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.4611, 4.8221, 2.4741, 2.6624], device='cuda:1') +2023-05-10 21:45:54,507 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.886e+02 3.364e+02 3.825e+02 4.564e+02 7.849e+02, threshold=7.649e+02, percent-clipped=0.0 +2023-05-10 21:45:54,664 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0_sp1.1 from training. Duration: 21.6318125 +2023-05-10 21:45:54,962 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=615000.0, ans=0.125 +2023-05-10 21:46:11,840 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0 from training. Duration: 23.695 +2023-05-10 21:46:16,483 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.const_attention_rate, batch_count=615050.0, ans=0.025 +2023-05-10 21:46:21,172 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=3.58 vs. limit=12.0 +2023-05-10 21:46:23,012 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=9.31 vs. limit=12.0 +2023-05-10 21:46:24,633 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.6048, 4.9355, 5.1198, 4.8080], device='cuda:1') +2023-05-10 21:46:50,447 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0 from training. Duration: 23.955 +2023-05-10 21:47:15,820 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass_mid.scale_min, batch_count=615150.0, ans=0.2 +2023-05-10 21:47:15,920 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer2.prob, batch_count=615150.0, ans=0.125 +2023-05-10 21:47:20,664 INFO [train.py:1021] (1/2) Epoch 34, batch 3050, loss[loss=0.143, simple_loss=0.2209, pruned_loss=0.03256, over 36949.00 frames. ], tot_loss[loss=0.1694, simple_loss=0.2568, pruned_loss=0.04096, over 7130619.56 frames. ], batch size: 86, lr: 3.20e-03, grad_scale: 32.0 +2023-05-10 21:47:38,800 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0_sp0.9 from training. Duration: 26.438875 +2023-05-10 21:47:46,015 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer2.min_positive, batch_count=615250.0, ans=0.05 +2023-05-10 21:48:31,123 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0021-26306-0_sp0.9 from training. Duration: 21.2444375 +2023-05-10 21:48:31,236 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0_sp0.9 from training. Duration: 31.02225 +2023-05-10 21:48:36,091 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer_ff3.min_abs, batch_count=615400.0, ans=0.2 +2023-05-10 21:48:46,178 INFO [train.py:1021] (1/2) Epoch 34, batch 3100, loss[loss=0.1466, simple_loss=0.2266, pruned_loss=0.03328, over 36931.00 frames. ], tot_loss[loss=0.1692, simple_loss=0.2567, pruned_loss=0.0409, over 7105998.37 frames. ], batch size: 86, lr: 3.20e-03, grad_scale: 32.0 +2023-05-10 21:48:48,754 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0 from training. Duration: 22.395 +2023-05-10 21:49:08,410 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.940e+02 3.414e+02 3.891e+02 4.479e+02 7.608e+02, threshold=7.782e+02, percent-clipped=0.0 +2023-05-10 21:49:10,353 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0045-15876-0 from training. Duration: 21.075 +2023-05-10 21:49:10,795 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer2.prob, batch_count=615500.0, ans=0.125 +2023-05-10 21:49:17,325 WARNING [train.py:1182] (1/2) Exclude cut with ID 6482-98857-0025-147532-0_sp0.9 from training. Duration: 20.0055625 +2023-05-10 21:49:17,973 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=10.67 vs. limit=15.0 +2023-05-10 21:49:18,849 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0037-132304-0_sp0.9 from training. Duration: 22.05 +2023-05-10 21:49:18,864 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0 from training. Duration: 26.8349375 +2023-05-10 21:49:19,135 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=615500.0, ans=0.1 +2023-05-10 21:49:22,071 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0_sp1.1 from training. Duration: 22.1090625 +2023-05-10 21:49:26,084 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass.skip_rate, batch_count=615550.0, ans=0.035 +2023-05-10 21:49:26,238 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([1.9844, 3.0287, 4.5091, 3.1828], device='cuda:1') +2023-05-10 21:49:30,491 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0_sp0.9 from training. Duration: 26.6166875 +2023-05-10 21:49:32,355 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=615550.0, ans=0.1 +2023-05-10 21:49:47,357 WARNING [train.py:1182] (1/2) Exclude cut with ID 2046-178027-0000-53705-0_sp0.9 from training. Duration: 20.3055625 +2023-05-10 21:50:27,418 INFO [train.py:1021] (1/2) Epoch 34, batch 3150, loss[loss=0.1636, simple_loss=0.2484, pruned_loss=0.03938, over 37188.00 frames. ], tot_loss[loss=0.1691, simple_loss=0.2566, pruned_loss=0.04079, over 7119965.63 frames. ], batch size: 93, lr: 3.20e-03, grad_scale: 32.0 +2023-05-10 21:50:27,553 WARNING [train.py:1182] (1/2) Exclude cut with ID 7205-50138-0008-5373-0_sp0.9 from training. Duration: 20.7 +2023-05-10 21:50:28,006 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3396, 3.0291, 3.0900, 2.8724], device='cuda:1') +2023-05-10 21:50:44,145 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer1.prob, batch_count=615750.0, ans=0.125 +2023-05-10 21:50:53,419 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.out_combiner.scale_min, batch_count=615750.0, ans=0.2 +2023-05-10 21:51:04,927 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=3.47 vs. limit=10.0 +2023-05-10 21:51:13,225 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0 from training. Duration: 22.48 +2023-05-10 21:51:17,552 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3714, 4.3510, 2.1593, 2.4384], device='cuda:1') +2023-05-10 21:51:34,671 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0_sp0.9 from training. Duration: 29.816625 +2023-05-10 21:51:49,704 INFO [train.py:1021] (1/2) Epoch 34, batch 3200, loss[loss=0.1567, simple_loss=0.2375, pruned_loss=0.03796, over 35721.00 frames. ], tot_loss[loss=0.1688, simple_loss=0.2561, pruned_loss=0.04074, over 7116346.38 frames. ], batch size: 79, lr: 3.20e-03, grad_scale: 32.0 +2023-05-10 21:51:57,340 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0_sp1.1 from training. Duration: 22.7590625 +2023-05-10 21:52:06,565 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0 from training. Duration: 22.555 +2023-05-10 21:52:07,109 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.4.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 21:52:11,510 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.811e+02 3.464e+02 3.885e+02 4.612e+02 8.813e+02, threshold=7.771e+02, percent-clipped=1.0 +2023-05-10 21:52:27,091 WARNING [train.py:1182] (1/2) Exclude cut with ID 1250-135782-0005-25975-0_sp0.9 from training. Duration: 21.688875 +2023-05-10 21:52:52,104 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.0.self_attn_weights, attn_weights_entropy = tensor([6.0822, 5.5039, 5.2981, 5.8769], device='cuda:1') +2023-05-10 21:52:55,791 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff3_skip_rate, batch_count=616100.0, ans=0.0 +2023-05-10 21:53:00,342 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.2479, 5.0459, 4.5404, 4.8482], device='cuda:1') +2023-05-10 21:53:06,057 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0038-41224-0_sp0.9 from training. Duration: 22.6 +2023-05-10 21:53:06,382 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff3_skip_rate, batch_count=616150.0, ans=0.0 +2023-05-10 21:53:13,576 INFO [train.py:1021] (1/2) Epoch 34, batch 3250, loss[loss=0.1633, simple_loss=0.2531, pruned_loss=0.0367, over 37034.00 frames. ], tot_loss[loss=0.1691, simple_loss=0.2564, pruned_loss=0.04087, over 7106409.03 frames. ], batch size: 99, lr: 3.20e-03, grad_scale: 32.0 +2023-05-10 21:53:36,128 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=3.22 vs. limit=15.0 +2023-05-10 21:53:37,295 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.out_combiner.scale_min, batch_count=616250.0, ans=0.2 +2023-05-10 21:53:44,548 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0 from training. Duration: 24.32 +2023-05-10 21:54:35,609 INFO [train.py:1021] (1/2) Epoch 34, batch 3300, loss[loss=0.1863, simple_loss=0.2766, pruned_loss=0.048, over 36738.00 frames. ], tot_loss[loss=0.1689, simple_loss=0.2564, pruned_loss=0.04074, over 7128925.49 frames. ], batch size: 118, lr: 3.20e-03, grad_scale: 32.0 +2023-05-10 21:54:39,789 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer1.min_positive, batch_count=616450.0, ans=0.025 +2023-05-10 21:54:58,804 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.973e+02 3.526e+02 3.995e+02 4.526e+02 6.923e+02, threshold=7.990e+02, percent-clipped=0.0 +2023-05-10 21:54:58,977 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-276745-0093-13116-0_sp0.9 from training. Duration: 21.061125 +2023-05-10 21:54:59,513 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer1.prob, batch_count=616500.0, ans=0.125 +2023-05-10 21:55:22,161 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0024-15855-0_sp0.9 from training. Duration: 20.32225 +2023-05-10 21:55:25,400 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.const_attention_rate, batch_count=616550.0, ans=0.025 +2023-05-10 21:55:28,447 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=616550.0, ans=0.1 +2023-05-10 21:55:34,890 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0_sp1.1 from training. Duration: 0.7545625 +2023-05-10 21:55:45,333 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.ff2_skip_rate, batch_count=616600.0, ans=0.0 +2023-05-10 21:55:46,858 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.const_attention_rate, batch_count=616600.0, ans=0.025 +2023-05-10 21:55:49,678 WARNING [train.py:1182] (1/2) Exclude cut with ID 4295-39940-0007-92567-0_sp0.9 from training. Duration: 23.9333125 +2023-05-10 21:56:12,943 INFO [train.py:1021] (1/2) Epoch 34, batch 3350, loss[loss=0.1916, simple_loss=0.2776, pruned_loss=0.05277, over 36801.00 frames. ], tot_loss[loss=0.1697, simple_loss=0.2575, pruned_loss=0.04098, over 7104995.73 frames. ], batch size: 122, lr: 3.20e-03, grad_scale: 32.0 +2023-05-10 21:56:27,406 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer1.prob, batch_count=616700.0, ans=0.125 +2023-05-10 21:56:49,743 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0_sp1.1 from training. Duration: 20.17275 +2023-05-10 21:56:57,688 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0_sp1.1 from training. Duration: 20.436375 +2023-05-10 21:57:03,045 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=616800.0, ans=0.1 +2023-05-10 21:57:10,290 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.const_attention_rate, batch_count=616800.0, ans=0.025 +2023-05-10 21:57:54,046 INFO [train.py:1021] (1/2) Epoch 34, batch 3400, loss[loss=0.1721, simple_loss=0.2618, pruned_loss=0.04117, over 37172.00 frames. ], tot_loss[loss=0.1695, simple_loss=0.2573, pruned_loss=0.04081, over 7115012.31 frames. ], batch size: 102, lr: 3.20e-03, grad_scale: 32.0 +2023-05-10 21:57:59,521 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.min_abs, batch_count=616950.0, ans=0.5 +2023-05-10 21:58:14,862 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.771e+02 3.443e+02 3.781e+02 4.308e+02 5.922e+02, threshold=7.562e+02, percent-clipped=0.0 +2023-05-10 21:58:36,101 WARNING [train.py:1182] (1/2) Exclude cut with ID 4234-40345-0022-142709-0_sp0.9 from training. Duration: 23.1055625 +2023-05-10 21:58:37,643 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0_sp1.1 from training. Duration: 23.5 +2023-05-10 21:58:37,941 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=617050.0, ans=0.125 +2023-05-10 21:58:50,210 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0_sp0.9 from training. Duration: 26.62775 +2023-05-10 21:59:06,958 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0018-132285-0 from training. Duration: 21.105 +2023-05-10 21:59:10,797 WARNING [train.py:1182] (1/2) Exclude cut with ID 4511-76322-0006-80011-0_sp0.9 from training. Duration: 24.411125 +2023-05-10 21:59:19,645 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.attention_skip_rate, batch_count=617150.0, ans=0.0 +2023-05-10 21:59:29,279 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff3_skip_rate, batch_count=617150.0, ans=0.0 +2023-05-10 21:59:31,978 INFO [train.py:1021] (1/2) Epoch 34, batch 3450, loss[loss=0.1598, simple_loss=0.2422, pruned_loss=0.03875, over 36960.00 frames. ], tot_loss[loss=0.1699, simple_loss=0.2577, pruned_loss=0.04109, over 7114378.48 frames. ], batch size: 91, lr: 3.20e-03, grad_scale: 32.0 +2023-05-10 21:59:33,921 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=617200.0, ans=0.125 +2023-05-10 21:59:44,256 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.scale_min, batch_count=617200.0, ans=0.2 +2023-05-10 21:59:46,869 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0_sp1.1 from training. Duration: 21.263625 +2023-05-10 22:00:08,986 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=617300.0, ans=0.1 +2023-05-10 22:00:18,118 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=617300.0, ans=0.125 +2023-05-10 22:00:24,171 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=192, metric=10.10 vs. limit=15.0 +2023-05-10 22:00:26,871 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.8935, 3.3676, 3.8142, 3.7436], device='cuda:1') +2023-05-10 22:00:32,098 WARNING [train.py:1182] (1/2) Exclude cut with ID 4234-40345-0022-142709-0 from training. Duration: 20.795 +2023-05-10 22:00:46,385 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0 from training. Duration: 24.76 +2023-05-10 22:00:47,832 WARNING [train.py:1182] (1/2) Exclude cut with ID 3867-173237-0077-144769-0_sp0.9 from training. Duration: 22.25 +2023-05-10 22:01:01,272 INFO [train.py:1021] (1/2) Epoch 34, batch 3500, loss[loss=0.1647, simple_loss=0.255, pruned_loss=0.03719, over 36941.00 frames. ], tot_loss[loss=0.1695, simple_loss=0.2572, pruned_loss=0.04088, over 7117023.64 frames. ], batch size: 108, lr: 3.20e-03, grad_scale: 32.0 +2023-05-10 22:01:14,201 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0_sp1.1 from training. Duration: 20.5045625 +2023-05-10 22:01:16,966 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.764e+02 3.492e+02 3.847e+02 4.374e+02 7.573e+02, threshold=7.694e+02, percent-clipped=1.0 +2023-05-10 22:01:30,128 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer1.prob, batch_count=617500.0, ans=0.125 +2023-05-10 22:01:58,254 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=617600.0, ans=0.125 +2023-05-10 22:02:01,010 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_skip_rate, batch_count=617650.0, ans=0.0 +2023-05-10 22:02:17,124 INFO [train.py:1021] (1/2) Epoch 34, batch 3550, loss[loss=0.151, simple_loss=0.2384, pruned_loss=0.03179, over 36978.00 frames. ], tot_loss[loss=0.1701, simple_loss=0.2578, pruned_loss=0.04121, over 7092936.67 frames. ], batch size: 91, lr: 3.20e-03, grad_scale: 32.0 +2023-05-10 22:02:22,007 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=617700.0, ans=0.125 +2023-05-10 22:02:31,720 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.const_attention_rate, batch_count=617750.0, ans=0.025 +2023-05-10 22:02:34,943 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer2.prob, batch_count=617750.0, ans=0.125 +2023-05-10 22:02:35,878 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.whiten.whitening_limit, batch_count=617750.0, ans=12.0 +2023-05-10 22:03:38,690 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff3_skip_rate, batch_count=617900.0, ans=0.0 +2023-05-10 22:03:41,272 INFO [train.py:1021] (1/2) Epoch 34, batch 3600, loss[loss=0.1565, simple_loss=0.2461, pruned_loss=0.03349, over 37090.00 frames. ], tot_loss[loss=0.1695, simple_loss=0.2571, pruned_loss=0.04095, over 7090154.47 frames. ], batch size: 94, lr: 3.20e-03, grad_scale: 32.0 +2023-05-10 22:03:54,686 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=5.48 vs. limit=15.0 +2023-05-10 22:03:56,955 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.699e+02 3.401e+02 3.724e+02 4.208e+02 5.378e+02, threshold=7.447e+02, percent-clipped=0.0 +2023-05-10 22:03:59,449 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer2.prob, batch_count=618000.0, ans=0.125 +2023-05-10 22:04:08,087 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=1.96 vs. limit=6.0 +2023-05-10 22:05:08,374 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp1.1 from training. Duration: 22.2954375 +2023-05-10 22:05:14,294 INFO [train.py:1021] (1/2) Epoch 35, batch 0, loss[loss=0.1733, simple_loss=0.2692, pruned_loss=0.03874, over 36927.00 frames. ], tot_loss[loss=0.1733, simple_loss=0.2692, pruned_loss=0.03874, over 36927.00 frames. ], batch size: 108, lr: 3.15e-03, grad_scale: 32.0 +2023-05-10 22:05:14,295 INFO [train.py:1048] (1/2) Computing validation loss +2023-05-10 22:05:25,302 INFO [train.py:1057] (1/2) Epoch 35, validation: loss=0.1521, simple_loss=0.2529, pruned_loss=0.02563, over 944034.00 frames. +2023-05-10 22:05:25,303 INFO [train.py:1058] (1/2) Maximum memory allocated so far is 18883MB +2023-05-10 22:05:28,682 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=618130.0, ans=0.1 +2023-05-10 22:05:33,348 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.attention_skip_rate, batch_count=618130.0, ans=0.0 +2023-05-10 22:05:33,881 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=4.90 vs. limit=15.0 +2023-05-10 22:05:49,877 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=618180.0, ans=0.125 +2023-05-10 22:06:21,224 WARNING [train.py:1182] (1/2) Exclude cut with ID 298-126791-0067-24026-0_sp0.9 from training. Duration: 21.438875 +2023-05-10 22:06:23,318 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=6.27 vs. limit=15.0 +2023-05-10 22:06:24,473 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 22:06:27,195 WARNING [train.py:1182] (1/2) Exclude cut with ID 5652-39938-0025-23684-0_sp0.9 from training. Duration: 22.2055625 +2023-05-10 22:06:47,115 INFO [train.py:1021] (1/2) Epoch 35, batch 50, loss[loss=0.1612, simple_loss=0.2604, pruned_loss=0.03098, over 36832.00 frames. ], tot_loss[loss=0.1668, simple_loss=0.2605, pruned_loss=0.03657, over 1630153.03 frames. ], batch size: 111, lr: 3.15e-03, grad_scale: 32.0 +2023-05-10 22:06:50,739 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=3.56 vs. limit=15.0 +2023-05-10 22:07:12,517 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.4139, 4.7078, 2.4113, 2.5047], device='cuda:1') +2023-05-10 22:07:26,237 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.612e+02 3.507e+02 3.965e+02 5.093e+02 8.891e+02, threshold=7.929e+02, percent-clipped=5.0 +2023-05-10 22:07:28,080 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward3.hidden_balancer.prob, batch_count=618480.0, ans=0.125 +2023-05-10 22:07:40,390 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 22:08:06,081 INFO [train.py:1021] (1/2) Epoch 35, batch 100, loss[loss=0.1685, simple_loss=0.266, pruned_loss=0.03557, over 34543.00 frames. ], tot_loss[loss=0.1624, simple_loss=0.2548, pruned_loss=0.03505, over 2897637.19 frames. ], batch size: 144, lr: 3.15e-03, grad_scale: 32.0 +2023-05-10 22:08:41,460 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.2625, 3.2561, 4.5924, 3.3086], device='cuda:1') +2023-05-10 22:09:13,984 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn1.whiten, num_groups=1, num_channels=192, metric=11.16 vs. limit=22.5 +2023-05-10 22:09:33,153 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer1.prob, batch_count=618830.0, ans=0.125 +2023-05-10 22:09:37,408 INFO [train.py:1021] (1/2) Epoch 35, batch 150, loss[loss=0.1494, simple_loss=0.2355, pruned_loss=0.03162, over 36859.00 frames. ], tot_loss[loss=0.1626, simple_loss=0.2552, pruned_loss=0.035, over 3862570.04 frames. ], batch size: 84, lr: 3.15e-03, grad_scale: 32.0 +2023-05-10 22:09:54,684 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.7644, 5.0415, 5.1957, 4.8915], device='cuda:1') +2023-05-10 22:10:02,272 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward2.hidden_balancer.prob, batch_count=618930.0, ans=0.125 +2023-05-10 22:10:05,057 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0 from training. Duration: 24.525 +2023-05-10 22:10:12,907 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.0.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 22:10:13,037 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.5243, 3.5417, 3.8035, 3.3987], device='cuda:1') +2023-05-10 22:10:14,179 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.536e+02 3.038e+02 3.427e+02 4.333e+02 7.080e+02, threshold=6.854e+02, percent-clipped=0.0 +2023-05-10 22:10:18,238 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.skip_rate, batch_count=618980.0, ans=0.07 +2023-05-10 22:10:43,419 WARNING [train.py:1182] (1/2) Exclude cut with ID 3699-47246-0007-3408-0_sp0.9 from training. Duration: 20.26675 +2023-05-10 22:10:47,274 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=5.11 vs. limit=15.0 +2023-05-10 22:10:54,355 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=619130.0, ans=0.0 +2023-05-10 22:10:55,484 INFO [train.py:1021] (1/2) Epoch 35, batch 200, loss[loss=0.1676, simple_loss=0.2647, pruned_loss=0.03531, over 36735.00 frames. ], tot_loss[loss=0.1617, simple_loss=0.2541, pruned_loss=0.03459, over 4595283.77 frames. ], batch size: 122, lr: 3.15e-03, grad_scale: 32.0 +2023-05-10 22:10:57,182 WARNING [train.py:1182] (1/2) Exclude cut with ID 7859-102521-0017-7548-0_sp0.9 from training. Duration: 27.25 +2023-05-10 22:11:08,497 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.attention_skip_rate, batch_count=619130.0, ans=0.0 +2023-05-10 22:11:28,178 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.scale_min, batch_count=619230.0, ans=0.2 +2023-05-10 22:11:30,059 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=5.80 vs. limit=15.0 +2023-05-10 22:11:32,818 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=619230.0, ans=0.125 +2023-05-10 22:11:47,975 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.ff3_skip_rate, batch_count=619280.0, ans=0.0 +2023-05-10 22:11:48,027 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer2.prob, batch_count=619280.0, ans=0.125 +2023-05-10 22:11:54,368 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=1.86 vs. limit=6.0 +2023-05-10 22:12:14,067 INFO [train.py:1021] (1/2) Epoch 35, batch 250, loss[loss=0.162, simple_loss=0.259, pruned_loss=0.03252, over 32641.00 frames. ], tot_loss[loss=0.1611, simple_loss=0.2536, pruned_loss=0.0343, over 5177815.22 frames. ], batch size: 170, lr: 3.15e-03, grad_scale: 32.0 +2023-05-10 22:12:17,158 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64292-0017-15984-0 from training. Duration: 21.68 +2023-05-10 22:12:31,557 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0007-59342-0 from training. Duration: 21.6300625 +2023-05-10 22:12:34,790 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.nonlin_attention.balancer.prob, batch_count=619430.0, ans=0.125 +2023-05-10 22:12:48,701 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=6.59 vs. limit=15.0 +2023-05-10 22:12:48,775 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=6.13 vs. limit=15.0 +2023-05-10 22:12:51,002 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.356e+02 2.953e+02 3.213e+02 3.926e+02 7.160e+02, threshold=6.425e+02, percent-clipped=1.0 +2023-05-10 22:12:54,561 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff3_skip_rate, batch_count=619480.0, ans=0.0 +2023-05-10 22:12:57,264 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0007-59342-0_sp0.9 from training. Duration: 24.033375 +2023-05-10 22:13:02,121 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([3.0732, 4.1631, 4.7036, 4.8907], device='cuda:1') +2023-05-10 22:13:08,766 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.prob, batch_count=619530.0, ans=0.125 +2023-05-10 22:13:11,882 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=619530.0, ans=0.0 +2023-05-10 22:13:31,850 INFO [train.py:1021] (1/2) Epoch 35, batch 300, loss[loss=0.1601, simple_loss=0.2598, pruned_loss=0.03019, over 37124.00 frames. ], tot_loss[loss=0.1608, simple_loss=0.2535, pruned_loss=0.03403, over 5624062.63 frames. ], batch size: 107, lr: 3.15e-03, grad_scale: 32.0 +2023-05-10 22:13:48,065 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.whiten_keys.whitening_limit, batch_count=619680.0, ans=6.0 +2023-05-10 22:14:00,266 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0 from training. Duration: 22.905 +2023-05-10 22:14:01,775 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0_sp1.1 from training. Duration: 23.4318125 +2023-05-10 22:14:08,490 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=1.79 vs. limit=6.0 +2023-05-10 22:14:14,177 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.balancer2.prob, batch_count=619730.0, ans=0.125 +2023-05-10 22:14:33,437 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=5.20 vs. limit=15.0 +2023-05-10 22:14:42,362 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_module1.balancer1.prob, batch_count=619830.0, ans=0.125 +2023-05-10 22:14:49,661 INFO [train.py:1021] (1/2) Epoch 35, batch 350, loss[loss=0.1845, simple_loss=0.2796, pruned_loss=0.04474, over 37035.00 frames. ], tot_loss[loss=0.1607, simple_loss=0.2533, pruned_loss=0.034, over 5961283.71 frames. ], batch size: 116, lr: 3.15e-03, grad_scale: 32.0 +2023-05-10 22:14:55,055 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([1.8326, 3.1611, 4.4797, 2.9519], device='cuda:1') +2023-05-10 22:15:07,779 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.const_attention_rate, batch_count=619930.0, ans=0.025 +2023-05-10 22:15:31,034 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.482e+02 2.919e+02 3.451e+02 4.329e+02 8.370e+02, threshold=6.903e+02, percent-clipped=0.0 +2023-05-10 22:15:46,758 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.prob, batch_count=620030.0, ans=0.125 +2023-05-10 22:15:48,174 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.const_attention_rate, batch_count=620030.0, ans=0.025 +2023-05-10 22:16:09,255 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0_sp1.1 from training. Duration: 20.82275 +2023-05-10 22:16:09,502 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=620080.0, ans=0.0 +2023-05-10 22:16:13,622 WARNING [train.py:1182] (1/2) Exclude cut with ID 4278-13270-0009-59344-0_sp0.9 from training. Duration: 25.45 +2023-05-10 22:16:16,117 INFO [train.py:1021] (1/2) Epoch 35, batch 400, loss[loss=0.1556, simple_loss=0.2479, pruned_loss=0.03163, over 37145.00 frames. ], tot_loss[loss=0.1621, simple_loss=0.2549, pruned_loss=0.03463, over 6236168.06 frames. ], batch size: 98, lr: 3.14e-03, grad_scale: 32.0 +2023-05-10 22:16:31,942 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=5.60 vs. limit=12.0 +2023-05-10 22:17:11,524 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer2.prob, batch_count=620280.0, ans=0.125 +2023-05-10 22:17:14,478 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module1.balancer1.min_positive, batch_count=620280.0, ans=0.025 +2023-05-10 22:17:30,584 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0 from training. Duration: 25.775 +2023-05-10 22:17:46,024 INFO [train.py:1021] (1/2) Epoch 35, batch 450, loss[loss=0.147, simple_loss=0.2364, pruned_loss=0.02878, over 36793.00 frames. ], tot_loss[loss=0.1623, simple_loss=0.2553, pruned_loss=0.03463, over 6462853.53 frames. ], batch size: 89, lr: 3.14e-03, grad_scale: 32.0 +2023-05-10 22:17:58,048 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0071-62375-0_sp0.9 from training. Duration: 22.25 +2023-05-10 22:18:22,291 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.const_attention_rate, batch_count=620480.0, ans=0.025 +2023-05-10 22:18:27,077 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.655e+02 3.018e+02 3.361e+02 3.981e+02 7.453e+02, threshold=6.722e+02, percent-clipped=3.0 +2023-05-10 22:18:28,713 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0 from training. Duration: 26.205 +2023-05-10 22:18:41,432 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.skip_rate, batch_count=620530.0, ans=0.04949747468305833 +2023-05-10 22:18:45,626 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0_sp0.9 from training. Duration: 30.1555625 +2023-05-10 22:18:50,177 WARNING [train.py:1182] (1/2) Exclude cut with ID 1265-135635-0050-6781-0_sp0.9 from training. Duration: 21.8333125 +2023-05-10 22:18:52,171 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=6.77 vs. limit=15.0 +2023-05-10 22:18:58,909 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer2.prob, batch_count=620580.0, ans=0.125 +2023-05-10 22:19:00,000 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0_sp1.1 from training. Duration: 20.6545625 +2023-05-10 22:19:07,437 INFO [train.py:1021] (1/2) Epoch 35, batch 500, loss[loss=0.1724, simple_loss=0.2673, pruned_loss=0.03879, over 37199.00 frames. ], tot_loss[loss=0.1632, simple_loss=0.2562, pruned_loss=0.03506, over 6625955.87 frames. ], batch size: 102, lr: 3.14e-03, grad_scale: 32.0 +2023-05-10 22:19:20,507 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=620630.0, ans=0.125 +2023-05-10 22:19:25,524 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=6.08 vs. limit=15.0 +2023-05-10 22:19:31,376 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff2_skip_rate, batch_count=620680.0, ans=0.0 +2023-05-10 22:19:44,712 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0045-39920-0_sp0.9 from training. Duration: 20.52225 +2023-05-10 22:19:49,543 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=620730.0, ans=0.0 +2023-05-10 22:20:08,870 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0_sp0.9 from training. Duration: 29.1166875 +2023-05-10 22:20:29,316 INFO [train.py:1021] (1/2) Epoch 35, batch 550, loss[loss=0.1498, simple_loss=0.2372, pruned_loss=0.03123, over 36821.00 frames. ], tot_loss[loss=0.1642, simple_loss=0.2575, pruned_loss=0.03548, over 6739415.18 frames. ], batch size: 89, lr: 3.14e-03, grad_scale: 16.0 +2023-05-10 22:20:56,685 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([4.5991, 3.9536, 2.4757, 2.6582], device='cuda:1') +2023-05-10 22:21:04,463 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3101, 4.1913, 3.9061, 4.1795, 3.5375, 3.3031, 3.7001, 3.1740], + device='cuda:1') +2023-05-10 22:21:10,219 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.543e+02 3.197e+02 3.774e+02 4.826e+02 6.909e+02, threshold=7.548e+02, percent-clipped=2.0 +2023-05-10 22:21:14,940 WARNING [train.py:1182] (1/2) Exclude cut with ID 543-133211-0007-59831-0_sp0.9 from training. Duration: 21.388875 +2023-05-10 22:21:53,976 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.1926, 4.1399, 3.8408, 4.1503, 3.4618, 3.0908, 3.5180, 3.0853], + device='cuda:1') +2023-05-10 22:21:54,958 INFO [train.py:1021] (1/2) Epoch 35, batch 600, loss[loss=0.1546, simple_loss=0.2527, pruned_loss=0.02828, over 37150.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.2575, pruned_loss=0.0356, over 6843635.27 frames. ], batch size: 98, lr: 3.14e-03, grad_scale: 16.0 +2023-05-10 22:21:59,570 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0 from training. Duration: 22.72 +2023-05-10 22:22:01,126 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0031-94921-0_sp0.9 from training. Duration: 22.7444375 +2023-05-10 22:22:28,474 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer1.prob, batch_count=621230.0, ans=0.125 +2023-05-10 22:22:36,156 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass_mid.scale_min, batch_count=621230.0, ans=0.2 +2023-05-10 22:22:47,239 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff3_skip_rate, batch_count=621280.0, ans=0.0 +2023-05-10 22:22:51,673 WARNING [train.py:1182] (1/2) Exclude cut with ID 4133-6541-0027-40495-0_sp1.1 from training. Duration: 0.9681875 +2023-05-10 22:22:56,614 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62851-0022-91297-0_sp0.9 from training. Duration: 22.3166875 +2023-05-10 22:23:01,360 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer1.prob, batch_count=621330.0, ans=0.125 +2023-05-10 22:23:02,725 WARNING [train.py:1182] (1/2) Exclude cut with ID 543-133212-0015-59917-0_sp0.9 from training. Duration: 21.8166875 +2023-05-10 22:23:16,427 INFO [train.py:1021] (1/2) Epoch 35, batch 650, loss[loss=0.1672, simple_loss=0.2652, pruned_loss=0.03458, over 37153.00 frames. ], tot_loss[loss=0.1645, simple_loss=0.2576, pruned_loss=0.03567, over 6931907.46 frames. ], batch size: 102, lr: 3.14e-03, grad_scale: 16.0 +2023-05-10 22:23:52,583 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.skip_rate, batch_count=621480.0, ans=0.04949747468305833 +2023-05-10 22:23:55,222 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.584e+02 3.029e+02 3.293e+02 3.986e+02 6.168e+02, threshold=6.586e+02, percent-clipped=0.0 +2023-05-10 22:24:34,062 INFO [train.py:1021] (1/2) Epoch 35, batch 700, loss[loss=0.1407, simple_loss=0.2252, pruned_loss=0.02813, over 36823.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.2574, pruned_loss=0.03563, over 6974533.25 frames. ], batch size: 89, lr: 3.14e-03, grad_scale: 16.0 +2023-05-10 22:24:40,811 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.hidden_balancer.prob, batch_count=621630.0, ans=0.125 +2023-05-10 22:24:51,830 WARNING [train.py:1182] (1/2) Exclude cut with ID 4957-30119-0041-23990-0_sp0.9 from training. Duration: 20.22775 +2023-05-10 22:24:58,119 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer1.prob, batch_count=621680.0, ans=0.125 +2023-05-10 22:25:18,475 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=621730.0, ans=0.125 +2023-05-10 22:25:29,727 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.conv_module1.whiten, num_groups=1, num_channels=192, metric=12.22 vs. limit=15.0 +2023-05-10 22:25:36,397 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0_sp1.1 from training. Duration: 24.67275 +2023-05-10 22:25:44,685 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder_embed.convnext.layerdrop_rate, batch_count=621830.0, ans=0.015 +2023-05-10 22:25:47,957 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer2.prob, batch_count=621830.0, ans=0.125 +2023-05-10 22:25:49,275 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.bypass_mid.scale_min, batch_count=621830.0, ans=0.2 +2023-05-10 22:25:51,991 INFO [train.py:1021] (1/2) Epoch 35, batch 750, loss[loss=0.1559, simple_loss=0.2478, pruned_loss=0.03202, over 37036.00 frames. ], tot_loss[loss=0.1644, simple_loss=0.2574, pruned_loss=0.03563, over 7033102.07 frames. ], batch size: 99, lr: 3.14e-03, grad_scale: 16.0 +2023-05-10 22:25:53,951 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=621880.0, ans=0.125 +2023-05-10 22:25:57,002 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=621880.0, ans=0.1 +2023-05-10 22:25:58,706 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.conv_module2.balancer1.prob, batch_count=621880.0, ans=0.125 +2023-05-10 22:26:07,773 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.out_combiner.scale_min, batch_count=621930.0, ans=0.2 +2023-05-10 22:26:09,492 WARNING [train.py:1182] (1/2) Exclude cut with ID 3082-165428-0081-50734-0_sp0.9 from training. Duration: 21.8055625 +2023-05-10 22:26:14,552 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([3.2896, 4.1649, 4.7104, 4.9047], device='cuda:1') +2023-05-10 22:26:31,005 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.592e+02 3.025e+02 3.381e+02 3.861e+02 6.597e+02, threshold=6.762e+02, percent-clipped=1.0 +2023-05-10 22:26:41,059 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=622030.0, ans=0.1 +2023-05-10 22:26:42,560 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.const_attention_rate, batch_count=622030.0, ans=0.025 +2023-05-10 22:26:49,930 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0054-76830-0_sp0.9 from training. Duration: 22.6666875 +2023-05-10 22:26:54,740 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_skip_rate, batch_count=622080.0, ans=0.0 +2023-05-10 22:26:54,755 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=622080.0, ans=0.1 +2023-05-10 22:27:10,007 INFO [train.py:1021] (1/2) Epoch 35, batch 800, loss[loss=0.162, simple_loss=0.2585, pruned_loss=0.03275, over 37120.00 frames. ], tot_loss[loss=0.1641, simple_loss=0.2571, pruned_loss=0.03557, over 7094148.03 frames. ], batch size: 107, lr: 3.14e-03, grad_scale: 32.0 +2023-05-10 22:27:27,118 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.scale_min, batch_count=622180.0, ans=0.2 +2023-05-10 22:27:29,276 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.9884, 3.3389, 3.8127, 3.6858], device='cuda:1') +2023-05-10 22:27:39,676 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module1.balancer1.prob, batch_count=622230.0, ans=0.125 +2023-05-10 22:27:39,737 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.ff2_skip_rate, batch_count=622230.0, ans=0.0 +2023-05-10 22:27:54,363 WARNING [train.py:1182] (1/2) Exclude cut with ID 2411-132532-0017-82279-0_sp1.1 from training. Duration: 0.9681875 +2023-05-10 22:27:54,628 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass.skip_rate, batch_count=622280.0, ans=0.04949747468305833 +2023-05-10 22:28:21,144 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0 from training. Duration: 22.485 +2023-05-10 22:28:27,641 INFO [train.py:1021] (1/2) Epoch 35, batch 850, loss[loss=0.1501, simple_loss=0.2391, pruned_loss=0.03059, over 36941.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.2574, pruned_loss=0.03559, over 7123649.09 frames. ], batch size: 95, lr: 3.14e-03, grad_scale: 32.0 +2023-05-10 22:28:34,056 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass.skip_rate, batch_count=622380.0, ans=0.07 +2023-05-10 22:29:03,235 WARNING [train.py:1182] (1/2) Exclude cut with ID 3972-170212-0014-23379-0_sp1.1 from training. Duration: 23.82275 +2023-05-10 22:29:07,752 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.543e+02 3.190e+02 3.716e+02 4.505e+02 7.597e+02, threshold=7.432e+02, percent-clipped=3.0 +2023-05-10 22:29:09,608 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module1.balancer1.prob, batch_count=622480.0, ans=0.125 +2023-05-10 22:29:12,561 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.attention_skip_rate, batch_count=622530.0, ans=0.0 +2023-05-10 22:29:17,076 WARNING [train.py:1182] (1/2) Exclude cut with ID 4860-13185-0032-76709-0 from training. Duration: 20.77 +2023-05-10 22:29:19,680 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.5.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 22:29:25,127 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64292-0017-15984-0_sp0.9 from training. Duration: 24.088875 +2023-05-10 22:29:27,329 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=5.96 vs. limit=15.0 +2023-05-10 22:29:44,755 INFO [train.py:1021] (1/2) Epoch 35, batch 900, loss[loss=0.1719, simple_loss=0.2687, pruned_loss=0.03757, over 36719.00 frames. ], tot_loss[loss=0.1642, simple_loss=0.2574, pruned_loss=0.03551, over 7156721.53 frames. ], batch size: 118, lr: 3.14e-03, grad_scale: 16.0 +2023-05-10 22:29:58,291 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0_sp1.1 from training. Duration: 20.4409375 +2023-05-10 22:30:16,040 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=622730.0, ans=0.125 +2023-05-10 22:31:03,642 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.balancer2.prob, batch_count=622830.0, ans=0.125 +2023-05-10 22:31:03,743 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=622830.0, ans=0.125 +2023-05-10 22:31:06,745 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff2_skip_rate, batch_count=622880.0, ans=0.0 +2023-05-10 22:31:07,774 INFO [train.py:1021] (1/2) Epoch 35, batch 950, loss[loss=0.1819, simple_loss=0.2788, pruned_loss=0.04248, over 36311.00 frames. ], tot_loss[loss=0.1637, simple_loss=0.2571, pruned_loss=0.03517, over 7187642.54 frames. ], batch size: 126, lr: 3.14e-03, grad_scale: 16.0 +2023-05-10 22:31:08,137 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=622880.0, ans=0.125 +2023-05-10 22:31:11,340 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass.skip_rate, batch_count=622880.0, ans=0.07 +2023-05-10 22:31:12,821 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module1.balancer1.prob, batch_count=622880.0, ans=0.125 +2023-05-10 22:31:23,813 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0062-62366-0_sp0.9 from training. Duration: 22.511125 +2023-05-10 22:31:25,362 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0031-39906-0 from training. Duration: 20.675 +2023-05-10 22:31:49,014 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.347e+02 2.984e+02 3.271e+02 3.753e+02 7.974e+02, threshold=6.542e+02, percent-clipped=1.0 +2023-05-10 22:32:34,081 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.bypass.scale_min, batch_count=623080.0, ans=0.2 +2023-05-10 22:32:37,614 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([1.7969, 3.2380, 4.5693, 3.1531], device='cuda:1') +2023-05-10 22:32:40,829 INFO [train.py:1021] (1/2) Epoch 35, batch 1000, loss[loss=0.1688, simple_loss=0.267, pruned_loss=0.03525, over 37185.00 frames. ], tot_loss[loss=0.1637, simple_loss=0.2571, pruned_loss=0.03516, over 7188881.17 frames. ], batch size: 102, lr: 3.14e-03, grad_scale: 16.0 +2023-05-10 22:32:50,675 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3533, 4.7098, 2.3290, 2.4295], device='cuda:1') +2023-05-10 22:33:03,415 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.0.self_attn_weights, attn_weights_entropy = tensor([6.0360, 5.3728, 5.1983, 5.7940], device='cuda:1') +2023-05-10 22:33:09,573 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.1478, 4.9713, 4.3221, 4.7245], device='cuda:1') +2023-05-10 22:33:21,039 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.0307, 2.9576, 4.3768, 2.8737], device='cuda:1') +2023-05-10 22:33:28,683 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62850-0007-91323-0_sp0.9 from training. Duration: 24.9833125 +2023-05-10 22:34:01,815 INFO [train.py:1021] (1/2) Epoch 35, batch 1050, loss[loss=0.1669, simple_loss=0.2596, pruned_loss=0.03709, over 37196.00 frames. ], tot_loss[loss=0.164, simple_loss=0.2573, pruned_loss=0.03542, over 7186285.42 frames. ], batch size: 102, lr: 3.14e-03, grad_scale: 16.0 +2023-05-10 22:34:01,913 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0047-9341-0 from training. Duration: 27.14 +2023-05-10 22:34:02,970 INFO [scaling.py:969] (1/2) Whitening: name=encoder_embed.convnext.out_whiten, num_groups=1, num_channels=128, metric=4.52 vs. limit=5.0 +2023-05-10 22:34:17,203 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0 from training. Duration: 22.44 +2023-05-10 22:34:42,197 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.413e+02 3.051e+02 3.517e+02 4.482e+02 8.018e+02, threshold=7.034e+02, percent-clipped=4.0 +2023-05-10 22:34:53,135 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward3.hidden_balancer.prob, batch_count=623530.0, ans=0.125 +2023-05-10 22:35:19,707 INFO [train.py:1021] (1/2) Epoch 35, batch 1100, loss[loss=0.1734, simple_loss=0.2735, pruned_loss=0.03666, over 37057.00 frames. ], tot_loss[loss=0.1643, simple_loss=0.2575, pruned_loss=0.03558, over 7172992.66 frames. ], batch size: 110, lr: 3.14e-03, grad_scale: 16.0 +2023-05-10 22:35:23,211 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.4667, 3.9857, 3.7008, 3.9994, 3.3147, 2.9814, 3.4258, 3.0089], + device='cuda:1') +2023-05-10 22:35:39,558 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0060-62364-0_sp0.9 from training. Duration: 21.361125 +2023-05-10 22:35:42,846 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer_ff2.min_abs, batch_count=623680.0, ans=0.1 +2023-05-10 22:35:47,230 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0_sp1.1 from training. Duration: 27.0318125 +2023-05-10 22:35:59,976 WARNING [train.py:1182] (1/2) Exclude cut with ID 5622-44585-0006-90525-0_sp0.9 from training. Duration: 28.638875 +2023-05-10 22:36:19,412 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0054-76830-0 from training. Duration: 20.4 +2023-05-10 22:36:28,987 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=623830.0, ans=0.125 +2023-05-10 22:36:38,825 INFO [train.py:1021] (1/2) Epoch 35, batch 1150, loss[loss=0.1658, simple_loss=0.2667, pruned_loss=0.03247, over 36957.00 frames. ], tot_loss[loss=0.1646, simple_loss=0.2578, pruned_loss=0.03572, over 7152738.91 frames. ], batch size: 108, lr: 3.14e-03, grad_scale: 16.0 +2023-05-10 22:36:42,371 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff3_skip_rate, batch_count=623880.0, ans=0.0 +2023-05-10 22:36:53,057 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0071-62375-0 from training. Duration: 20.025 +2023-05-10 22:36:53,071 WARNING [train.py:1182] (1/2) Exclude cut with ID 2364-131735-0112-64612-0_sp0.9 from training. Duration: 20.488875 +2023-05-10 22:37:01,112 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0 from training. Duration: 29.735 +2023-05-10 22:37:03,620 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=623930.0, ans=0.1 +2023-05-10 22:37:25,024 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.551e+02 3.047e+02 3.312e+02 3.711e+02 4.995e+02, threshold=6.625e+02, percent-clipped=0.0 +2023-05-10 22:37:42,548 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=624030.0, ans=0.0 +2023-05-10 22:38:02,379 INFO [train.py:1021] (1/2) Epoch 35, batch 1200, loss[loss=0.1729, simple_loss=0.2739, pruned_loss=0.0359, over 32526.00 frames. ], tot_loss[loss=0.164, simple_loss=0.2571, pruned_loss=0.0354, over 7173247.37 frames. ], batch size: 170, lr: 3.13e-03, grad_scale: 32.0 +2023-05-10 22:38:20,148 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.prob, batch_count=624180.0, ans=0.125 +2023-05-10 22:38:21,992 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=3.21 vs. limit=6.0 +2023-05-10 22:38:33,548 WARNING [train.py:1182] (1/2) Exclude cut with ID 7276-92427-0014-12983-0_sp0.9 from training. Duration: 21.3055625 +2023-05-10 22:38:35,135 WARNING [train.py:1182] (1/2) Exclude cut with ID 1025-75365-0008-79168-0_sp0.9 from training. Duration: 22.0666875 +2023-05-10 22:38:41,650 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer2.prob, batch_count=624230.0, ans=0.125 +2023-05-10 22:38:43,328 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer2.prob, batch_count=624230.0, ans=0.125 +2023-05-10 22:38:58,033 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.self_attn_weights.whiten_keys.whitening_limit, batch_count=624280.0, ans=6.0 +2023-05-10 22:39:20,447 INFO [train.py:1021] (1/2) Epoch 35, batch 1250, loss[loss=0.1687, simple_loss=0.2638, pruned_loss=0.03676, over 36917.00 frames. ], tot_loss[loss=0.1642, simple_loss=0.2574, pruned_loss=0.03549, over 7156527.51 frames. ], batch size: 105, lr: 3.13e-03, grad_scale: 32.0 +2023-05-10 22:39:25,170 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_module2.balancer2.prob, batch_count=624380.0, ans=0.125 +2023-05-10 22:39:44,806 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.ff3_skip_rate, batch_count=624430.0, ans=0.0 +2023-05-10 22:39:46,985 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer1.prob, batch_count=624430.0, ans=0.125 +2023-05-10 22:40:00,948 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.559e+02 3.166e+02 3.795e+02 4.494e+02 7.559e+02, threshold=7.589e+02, percent-clipped=2.0 +2023-05-10 22:40:01,404 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=624480.0, ans=0.125 +2023-05-10 22:40:16,637 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([4.8805, 4.1686, 2.8650, 3.1484], device='cuda:1') +2023-05-10 22:40:25,478 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0062-62366-0 from training. Duration: 20.26 +2023-05-10 22:40:37,291 INFO [train.py:1021] (1/2) Epoch 35, batch 1300, loss[loss=0.151, simple_loss=0.2346, pruned_loss=0.03374, over 37095.00 frames. ], tot_loss[loss=0.1649, simple_loss=0.2582, pruned_loss=0.03581, over 7145467.67 frames. ], batch size: 88, lr: 3.13e-03, grad_scale: 32.0 +2023-05-10 22:40:40,456 WARNING [train.py:1182] (1/2) Exclude cut with ID 5239-32139-0030-9324-0_sp0.9 from training. Duration: 21.3444375 +2023-05-10 22:40:52,467 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.3.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 22:41:22,771 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.scale_min, batch_count=624780.0, ans=0.2 +2023-05-10 22:41:41,452 WARNING [train.py:1182] (1/2) Exclude cut with ID 497-129325-0061-62254-0_sp1.1 from training. Duration: 0.97725 +2023-05-10 22:41:41,949 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.max_abs, batch_count=624830.0, ans=10.0 +2023-05-10 22:42:01,355 INFO [train.py:1021] (1/2) Epoch 35, batch 1350, loss[loss=0.1481, simple_loss=0.2319, pruned_loss=0.03217, over 36931.00 frames. ], tot_loss[loss=0.1649, simple_loss=0.2583, pruned_loss=0.0358, over 7155042.09 frames. ], batch size: 86, lr: 3.13e-03, grad_scale: 32.0 +2023-05-10 22:42:03,248 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=624880.0, ans=0.125 +2023-05-10 22:42:04,711 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.0.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 22:42:14,378 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=5.28 vs. limit=15.0 +2023-05-10 22:42:31,821 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0031-39906-0_sp0.9 from training. Duration: 22.97225 +2023-05-10 22:42:32,170 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.bypass.skip_rate, batch_count=624980.0, ans=0.07 +2023-05-10 22:42:41,729 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.282e+02 3.020e+02 3.543e+02 3.991e+02 6.876e+02, threshold=7.086e+02, percent-clipped=0.0 +2023-05-10 22:42:46,046 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=624980.0, ans=0.1 +2023-05-10 22:43:06,786 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0047-39922-0_sp0.9 from training. Duration: 21.97775 +2023-05-10 22:43:34,094 INFO [train.py:1021] (1/2) Epoch 35, batch 1400, loss[loss=0.1605, simple_loss=0.2596, pruned_loss=0.03074, over 32567.00 frames. ], tot_loss[loss=0.164, simple_loss=0.2573, pruned_loss=0.03534, over 7167770.75 frames. ], batch size: 170, lr: 3.13e-03, grad_scale: 32.0 +2023-05-10 22:43:34,193 WARNING [train.py:1182] (1/2) Exclude cut with ID 1112-1043-0006-89194-0_sp0.9 from training. Duration: 21.8333125 +2023-05-10 22:43:53,935 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=3.27 vs. limit=15.0 +2023-05-10 22:43:55,873 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0031-94921-0 from training. Duration: 20.47 +2023-05-10 22:44:12,104 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.conv_module2.balancer2.prob, batch_count=625180.0, ans=0.125 +2023-05-10 22:44:24,107 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.scale_min, batch_count=625230.0, ans=0.2 +2023-05-10 22:44:25,421 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.const_attention_rate, batch_count=625230.0, ans=0.025 +2023-05-10 22:44:52,797 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3247, 4.5456, 2.3428, 2.4397], device='cuda:1') +2023-05-10 22:45:11,931 INFO [train.py:1021] (1/2) Epoch 35, batch 1450, loss[loss=0.1897, simple_loss=0.2841, pruned_loss=0.0477, over 36702.00 frames. ], tot_loss[loss=0.1639, simple_loss=0.2572, pruned_loss=0.03528, over 7179923.27 frames. ], batch size: 118, lr: 3.13e-03, grad_scale: 32.0 +2023-05-10 22:45:18,872 WARNING [train.py:1182] (1/2) Exclude cut with ID 7395-89880-0037-39912-0_sp0.9 from training. Duration: 20.67225 +2023-05-10 22:45:35,442 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=6.30 vs. limit=15.0 +2023-05-10 22:45:39,345 WARNING [train.py:1182] (1/2) Exclude cut with ID 1914-133440-0024-94914-0_sp0.9 from training. Duration: 25.2444375 +2023-05-10 22:45:52,922 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.506e+02 3.326e+02 3.792e+02 4.453e+02 6.896e+02, threshold=7.584e+02, percent-clipped=0.0 +2023-05-10 22:45:53,327 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.prob, batch_count=625480.0, ans=0.125 +2023-05-10 22:46:03,755 WARNING [train.py:1182] (1/2) Exclude cut with ID 3340-169293-0021-76797-0_sp0.9 from training. Duration: 21.1445 +2023-05-10 22:46:37,877 INFO [train.py:1021] (1/2) Epoch 35, batch 1500, loss[loss=0.1512, simple_loss=0.2334, pruned_loss=0.03454, over 35268.00 frames. ], tot_loss[loss=0.1633, simple_loss=0.2566, pruned_loss=0.03498, over 7208124.81 frames. ], batch size: 78, lr: 3.13e-03, grad_scale: 32.0 +2023-05-10 22:46:49,521 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder_embed.convnext.layerdrop_rate, batch_count=625630.0, ans=0.015 +2023-05-10 22:46:51,961 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=625630.0, ans=0.0 +2023-05-10 22:46:54,090 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.0360, 4.3926, 3.0044, 3.2723], device='cuda:1') +2023-05-10 22:47:10,635 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=192, metric=9.23 vs. limit=15.0 +2023-05-10 22:47:13,042 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.hidden_balancer.prob, batch_count=625730.0, ans=0.125 +2023-05-10 22:47:38,669 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0079-62383-0_sp0.9 from training. Duration: 33.038875 +2023-05-10 22:47:39,000 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.5318, 3.8191, 4.2127, 3.8525], device='cuda:1') +2023-05-10 22:47:40,311 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_skip_rate, batch_count=625780.0, ans=0.0 +2023-05-10 22:48:01,152 INFO [train.py:1021] (1/2) Epoch 35, batch 1550, loss[loss=0.1656, simple_loss=0.2616, pruned_loss=0.03483, over 36896.00 frames. ], tot_loss[loss=0.1635, simple_loss=0.2566, pruned_loss=0.03516, over 7190336.47 frames. ], batch size: 105, lr: 3.13e-03, grad_scale: 32.0 +2023-05-10 22:48:15,976 WARNING [train.py:1182] (1/2) Exclude cut with ID 6426-64291-0000-16059-0_sp0.9 from training. Duration: 20.0944375 +2023-05-10 22:48:16,327 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=625930.0, ans=0.0 +2023-05-10 22:48:32,815 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0_sp1.1 from training. Duration: 20.4 +2023-05-10 22:48:40,363 WARNING [train.py:1182] (1/2) Exclude cut with ID 6330-62851-0022-91297-0 from training. Duration: 20.085 +2023-05-10 22:48:41,666 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.585e+02 3.042e+02 3.545e+02 4.069e+02 6.142e+02, threshold=7.089e+02, percent-clipped=0.0 +2023-05-10 22:48:44,298 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.nonlin_attention.whiten1, num_groups=1, num_channels=144, metric=8.87 vs. limit=10.0 +2023-05-10 22:48:50,936 WARNING [train.py:1182] (1/2) Exclude cut with ID 4860-13185-0032-76709-0_sp0.9 from training. Duration: 23.07775 +2023-05-10 22:49:18,770 INFO [train.py:1021] (1/2) Epoch 35, batch 1600, loss[loss=0.176, simple_loss=0.2717, pruned_loss=0.04017, over 36806.00 frames. ], tot_loss[loss=0.1638, simple_loss=0.2571, pruned_loss=0.03528, over 7174075.17 frames. ], batch size: 113, lr: 3.13e-03, grad_scale: 32.0 +2023-05-10 22:49:26,847 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3720, 4.1930, 3.8872, 4.1824, 3.4775, 3.1800, 3.5630, 3.1290], + device='cuda:1') +2023-05-10 22:49:31,405 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.bypass.skip_rate, batch_count=626130.0, ans=0.04949747468305833 +2023-05-10 22:49:38,642 WARNING [train.py:1182] (1/2) Exclude cut with ID 2929-85685-0044-62348-0_sp0.9 from training. Duration: 24.9333125 +2023-05-10 22:49:40,478 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass_mid.scale_min, batch_count=626180.0, ans=0.2 +2023-05-10 22:49:55,460 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.9651, 3.5696, 3.3964, 4.2969, 2.3288, 3.6723, 4.2789, 3.6037], + device='cuda:1') +2023-05-10 22:49:57,246 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=8.61 vs. limit=15.0 +2023-05-10 22:50:01,032 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward1.out_proj.dropout_p, batch_count=626230.0, ans=0.1 +2023-05-10 22:50:10,263 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.out_combiner.scale_min, batch_count=626280.0, ans=0.2 +2023-05-10 22:50:11,881 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=626280.0, ans=0.0 +2023-05-10 22:50:20,390 INFO [scaling.py:969] (1/2) Whitening: name=encoder_embed.out_whiten, num_groups=1, num_channels=192, metric=7.31 vs. limit=8.0 +2023-05-10 22:50:26,784 WARNING [train.py:1182] (1/2) Exclude cut with ID 5118-111612-0016-124680-0_sp0.9 from training. Duration: 20.388875 +2023-05-10 22:50:34,436 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0_sp1.1 from training. Duration: 20.3590625 +2023-05-10 22:50:35,922 INFO [train.py:1021] (1/2) Epoch 35, batch 1650, loss[loss=0.1747, simple_loss=0.2648, pruned_loss=0.04234, over 37033.00 frames. ], tot_loss[loss=0.1631, simple_loss=0.2563, pruned_loss=0.03495, over 7200114.33 frames. ], batch size: 116, lr: 3.13e-03, grad_scale: 32.0 +2023-05-10 22:51:17,483 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.476e+02 3.192e+02 3.611e+02 4.191e+02 9.204e+02, threshold=7.223e+02, percent-clipped=1.0 +2023-05-10 22:51:31,421 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.bypass.skip_rate, batch_count=626530.0, ans=0.09899494936611666 +2023-05-10 22:51:49,306 WARNING [train.py:1182] (1/2) Exclude cut with ID 3557-8342-0013-54691-0_sp1.1 from training. Duration: 0.836375 +2023-05-10 22:51:51,148 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.balancer1.prob, batch_count=626580.0, ans=0.125 +2023-05-10 22:51:55,929 INFO [train.py:1021] (1/2) Epoch 35, batch 1700, loss[loss=0.1823, simple_loss=0.2795, pruned_loss=0.04258, over 34748.00 frames. ], tot_loss[loss=0.1641, simple_loss=0.2573, pruned_loss=0.03545, over 7200853.05 frames. ], batch size: 145, lr: 3.13e-03, grad_scale: 32.0 +2023-05-10 22:52:02,524 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=626630.0, ans=0.0 +2023-05-10 22:52:07,140 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.1.encoder.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 22:52:20,764 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward2.hidden_balancer.prob, batch_count=626680.0, ans=0.125 +2023-05-10 22:52:30,458 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff2_skip_rate, batch_count=626730.0, ans=0.0 +2023-05-10 22:52:36,468 WARNING [train.py:1182] (1/2) Exclude cut with ID 8565-290391-0049-67394-0_sp0.9 from training. Duration: 21.3166875 +2023-05-10 22:52:37,014 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.const_attention_rate, batch_count=626730.0, ans=0.025 +2023-05-10 22:52:37,868 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=6.07 vs. limit=15.0 +2023-05-10 22:52:49,862 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module2.balancer1.prob, batch_count=626780.0, ans=0.125 +2023-05-10 22:53:21,124 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0029-104863-0_sp0.9 from training. Duration: 22.1055625 +2023-05-10 22:53:25,598 INFO [train.py:1021] (1/2) Epoch 35, batch 1750, loss[loss=0.1682, simple_loss=0.2622, pruned_loss=0.0371, over 37092.00 frames. ], tot_loss[loss=0.1648, simple_loss=0.2573, pruned_loss=0.03615, over 7194693.93 frames. ], batch size: 103, lr: 3.13e-03, grad_scale: 32.0 +2023-05-10 22:53:32,055 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0_sp1.1 from training. Duration: 21.77725 +2023-05-10 22:53:42,900 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=626930.0, ans=0.0 +2023-05-10 22:53:53,269 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0_sp0.9 from training. Duration: 27.8166875 +2023-05-10 22:54:05,543 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.nonlin_attention.whiten1.whitening_limit, batch_count=626980.0, ans=10.0 +2023-05-10 22:54:06,156 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.755e+02 3.385e+02 3.814e+02 4.532e+02 6.162e+02, threshold=7.629e+02, percent-clipped=0.0 +2023-05-10 22:54:14,082 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module1.balancer2.prob, batch_count=627030.0, ans=0.125 +2023-05-10 22:54:21,443 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0_sp1.1 from training. Duration: 22.5090625 +2023-05-10 22:54:25,881 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0 from training. Duration: 25.035 +2023-05-10 22:54:35,225 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.0.layers.0.self_attn_weights, attn_weights_entropy = tensor([6.4258, 5.7183, 5.5284, 6.1832], device='cuda:1') +2023-05-10 22:54:38,439 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.conv_skip_rate, batch_count=627080.0, ans=0.0 +2023-05-10 22:54:42,480 INFO [train.py:1021] (1/2) Epoch 35, batch 1800, loss[loss=0.1541, simple_loss=0.2391, pruned_loss=0.03449, over 36777.00 frames. ], tot_loss[loss=0.1654, simple_loss=0.257, pruned_loss=0.03686, over 7211219.46 frames. ], batch size: 89, lr: 3.13e-03, grad_scale: 32.0 +2023-05-10 22:54:45,528 WARNING [train.py:1182] (1/2) Exclude cut with ID 774-127930-0014-10412-0_sp1.1 from training. Duration: 0.95 +2023-05-10 22:54:54,627 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.prob, batch_count=627130.0, ans=0.125 +2023-05-10 22:55:06,276 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0_sp0.9 from training. Duration: 0.92225 +2023-05-10 22:55:29,468 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward3.hidden_balancer.prob, batch_count=627280.0, ans=0.125 +2023-05-10 22:55:33,668 WARNING [train.py:1182] (1/2) Exclude cut with ID 4511-76322-0006-80011-0 from training. Duration: 21.97 +2023-05-10 22:55:37,255 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.5921, 4.8041, 2.5710, 2.6618], device='cuda:1') +2023-05-10 22:55:40,878 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=192, metric=10.46 vs. limit=15.0 +2023-05-10 22:55:42,107 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=2.92 vs. limit=15.0 +2023-05-10 22:55:53,533 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward1.out_whiten, num_groups=1, num_channels=192, metric=4.08 vs. limit=15.0 +2023-05-10 22:55:54,061 WARNING [train.py:1182] (1/2) Exclude cut with ID 7492-105653-0055-62765-0_sp0.9 from training. Duration: 21.97225 +2023-05-10 22:55:54,111 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0_sp0.9 from training. Duration: 25.3333125 +2023-05-10 22:56:00,021 INFO [train.py:1021] (1/2) Epoch 35, batch 1850, loss[loss=0.1959, simple_loss=0.2845, pruned_loss=0.0537, over 24176.00 frames. ], tot_loss[loss=0.1662, simple_loss=0.2571, pruned_loss=0.03765, over 7210022.39 frames. ], batch size: 234, lr: 3.13e-03, grad_scale: 32.0 +2023-05-10 22:56:03,067 WARNING [train.py:1182] (1/2) Exclude cut with ID 5172-29468-0015-19128-0_sp0.9 from training. Duration: 21.5055625 +2023-05-10 22:56:12,313 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.const_attention_rate, batch_count=627380.0, ans=0.025 +2023-05-10 22:56:13,435 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0_sp1.1 from training. Duration: 20.72725 +2023-05-10 22:56:29,169 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_skip_rate, batch_count=627480.0, ans=0.0 +2023-05-10 22:56:40,827 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.874e+02 3.407e+02 3.671e+02 4.058e+02 1.037e+03, threshold=7.342e+02, percent-clipped=1.0 +2023-05-10 22:56:50,160 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0_sp0.9 from training. Duration: 26.32775 +2023-05-10 22:56:58,234 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=627530.0, ans=0.1 +2023-05-10 22:57:05,802 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_skip_rate, batch_count=627580.0, ans=0.0 +2023-05-10 22:57:15,189 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=4.64 vs. limit=15.0 +2023-05-10 22:57:17,395 INFO [train.py:1021] (1/2) Epoch 35, batch 1900, loss[loss=0.1607, simple_loss=0.2451, pruned_loss=0.03817, over 36792.00 frames. ], tot_loss[loss=0.1675, simple_loss=0.2577, pruned_loss=0.03864, over 7184632.73 frames. ], batch size: 89, lr: 3.13e-03, grad_scale: 32.0 +2023-05-10 22:57:22,173 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.ff3_skip_rate, batch_count=627630.0, ans=0.0 +2023-05-10 22:57:23,519 WARNING [train.py:1182] (1/2) Exclude cut with ID 3867-173237-0077-144769-0 from training. Duration: 20.025 +2023-05-10 22:57:29,585 WARNING [train.py:1182] (1/2) Exclude cut with ID 6709-74022-0004-86860-0_sp1.1 from training. Duration: 0.9409375 +2023-05-10 22:57:30,316 WARNING [train.py:1182] (1/2) Exclude cut with ID 4757-1811-0023-62229-0_sp0.9 from training. Duration: 21.37775 +2023-05-10 22:57:50,269 WARNING [train.py:1182] (1/2) Exclude cut with ID 1250-135782-0004-25974-0_sp0.9 from training. Duration: 21.17225 +2023-05-10 22:57:50,281 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0_sp0.9 from training. Duration: 27.511125 +2023-05-10 22:58:05,802 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.const_attention_rate, batch_count=627780.0, ans=0.025 +2023-05-10 22:58:21,280 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.prob, batch_count=627830.0, ans=0.125 +2023-05-10 22:58:22,487 WARNING [train.py:1182] (1/2) Exclude cut with ID 453-131332-0000-47844-0 from training. Duration: 22.8 +2023-05-10 22:58:26,944 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0 from training. Duration: 22.585 +2023-05-10 22:58:35,614 INFO [train.py:1021] (1/2) Epoch 35, batch 1950, loss[loss=0.1984, simple_loss=0.2824, pruned_loss=0.05719, over 24792.00 frames. ], tot_loss[loss=0.1687, simple_loss=0.2582, pruned_loss=0.03958, over 7121283.63 frames. ], batch size: 233, lr: 3.13e-03, grad_scale: 32.0 +2023-05-10 22:58:40,279 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward1.hidden_balancer.prob, batch_count=627880.0, ans=0.125 +2023-05-10 22:59:05,768 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=627930.0, ans=0.125 +2023-05-10 22:59:08,410 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0001-146967-0_sp0.9 from training. Duration: 22.0166875 +2023-05-10 22:59:19,543 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.balancer1.prob, batch_count=627980.0, ans=0.125 +2023-05-10 22:59:24,089 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.859e+02 3.372e+02 3.804e+02 4.466e+02 6.688e+02, threshold=7.609e+02, percent-clipped=0.0 +2023-05-10 22:59:27,208 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0_sp1.1 from training. Duration: 24.395375 +2023-05-10 22:59:27,444 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=627980.0, ans=0.1 +2023-05-10 22:59:35,383 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0_sp0.9 from training. Duration: 27.47775 +2023-05-10 22:59:39,990 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0_sp0.9 from training. Duration: 24.8833125 +2023-05-10 22:59:43,154 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0 from training. Duration: 23.39 +2023-05-10 22:59:49,759 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0_sp0.9 from training. Duration: 28.72225 +2023-05-10 22:59:50,486 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=3.48 vs. limit=15.0 +2023-05-10 22:59:56,498 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass.scale_min, batch_count=628080.0, ans=0.2 +2023-05-10 23:00:01,306 WARNING [train.py:1182] (1/2) Exclude cut with ID 585-294811-0110-133686-0_sp0.9 from training. Duration: 20.8944375 +2023-05-10 23:00:02,747 INFO [train.py:1021] (1/2) Epoch 35, batch 2000, loss[loss=0.164, simple_loss=0.2557, pruned_loss=0.03619, over 37185.00 frames. ], tot_loss[loss=0.1686, simple_loss=0.258, pruned_loss=0.03962, over 7162936.09 frames. ], batch size: 102, lr: 3.12e-03, grad_scale: 32.0 +2023-05-10 23:00:17,794 WARNING [train.py:1182] (1/2) Exclude cut with ID 5796-66357-0007-116447-0_sp0.9 from training. Duration: 23.8444375 +2023-05-10 23:00:18,029 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module1.balancer1.prob, batch_count=628180.0, ans=0.125 +2023-05-10 23:00:38,227 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=628230.0, ans=0.1 +2023-05-10 23:00:44,414 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0 from training. Duration: 25.85 +2023-05-10 23:00:44,605 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=628230.0, ans=0.0 +2023-05-10 23:00:45,837 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0023-13010-0 from training. Duration: 21.39 +2023-05-10 23:00:57,227 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0 from training. Duration: 27.92 +2023-05-10 23:01:00,656 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.1047, 4.4584, 3.0561, 2.9294], device='cuda:1') +2023-05-10 23:01:05,111 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.out_combiner.scale_min, batch_count=628330.0, ans=0.2 +2023-05-10 23:01:14,182 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.attention_skip_rate, batch_count=628330.0, ans=0.0 +2023-05-10 23:01:22,539 INFO [train.py:1021] (1/2) Epoch 35, batch 2050, loss[loss=0.1837, simple_loss=0.2754, pruned_loss=0.04599, over 34652.00 frames. ], tot_loss[loss=0.1689, simple_loss=0.2578, pruned_loss=0.03997, over 7172725.48 frames. ], batch size: 145, lr: 3.12e-03, grad_scale: 32.0 +2023-05-10 23:01:24,147 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0039-130165-0_sp0.9 from training. Duration: 20.661125 +2023-05-10 23:01:42,982 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass_mid.scale_min, batch_count=628430.0, ans=0.2 +2023-05-10 23:01:48,880 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0043-15874-0_sp0.9 from training. Duration: 20.07225 +2023-05-10 23:01:58,515 WARNING [train.py:1182] (1/2) Exclude cut with ID 1085-156170-0017-128270-0 from training. Duration: 21.01 +2023-05-10 23:01:59,699 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=9.19 vs. limit=15.0 +2023-05-10 23:02:04,130 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.960e+02 3.545e+02 3.908e+02 4.409e+02 6.951e+02, threshold=7.816e+02, percent-clipped=0.0 +2023-05-10 23:02:32,273 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=628580.0, ans=0.1 +2023-05-10 23:02:44,026 INFO [train.py:1021] (1/2) Epoch 35, batch 2100, loss[loss=0.1455, simple_loss=0.2294, pruned_loss=0.0308, over 37212.00 frames. ], tot_loss[loss=0.1688, simple_loss=0.2574, pruned_loss=0.04012, over 7167195.99 frames. ], batch size: 93, lr: 3.12e-03, grad_scale: 32.0 +2023-05-10 23:02:50,499 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=628630.0, ans=0.125 +2023-05-10 23:03:07,495 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.balancer1.prob, batch_count=628680.0, ans=0.125 +2023-05-10 23:03:14,777 WARNING [train.py:1182] (1/2) Exclude cut with ID 2195-150901-0045-59933-0 from training. Duration: 20.65 +2023-05-10 23:03:23,244 WARNING [train.py:1182] (1/2) Exclude cut with ID 5796-66357-0007-116447-0 from training. Duration: 21.46 +2023-05-10 23:03:43,902 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.1498, 2.6391, 3.3714, 2.6211], device='cuda:1') +2023-05-10 23:03:58,113 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3590, 3.3648, 3.5505, 3.2927], device='cuda:1') +2023-05-10 23:04:07,559 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.const_attention_rate, batch_count=628830.0, ans=0.025 +2023-05-10 23:04:12,823 INFO [train.py:1021] (1/2) Epoch 35, batch 2150, loss[loss=0.1699, simple_loss=0.2627, pruned_loss=0.03861, over 37090.00 frames. ], tot_loss[loss=0.1691, simple_loss=0.2573, pruned_loss=0.04041, over 7141874.99 frames. ], batch size: 103, lr: 3.12e-03, grad_scale: 32.0 +2023-05-10 23:04:18,890 WARNING [train.py:1182] (1/2) Exclude cut with ID 3557-8342-0013-54691-0 from training. Duration: 0.92 +2023-05-10 23:04:25,667 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0023-13010-0_sp0.9 from training. Duration: 23.7666875 +2023-05-10 23:04:25,906 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.bypass_mid.scale_min, batch_count=628880.0, ans=0.2 +2023-05-10 23:04:31,836 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder_embed.convnext.out_balancer.prob, batch_count=628930.0, ans=0.125 +2023-05-10 23:04:38,184 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 23:04:38,212 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer1.min_positive, batch_count=628930.0, ans=0.025 +2023-05-10 23:04:38,639 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=3.53 vs. limit=12.0 +2023-05-10 23:04:53,205 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.854e+02 3.614e+02 4.420e+02 5.153e+02 7.465e+02, threshold=8.841e+02, percent-clipped=0.0 +2023-05-10 23:04:55,045 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer2.prob, batch_count=628980.0, ans=0.125 +2023-05-10 23:05:04,672 WARNING [train.py:1182] (1/2) Exclude cut with ID 8544-281189-0060-101339-0_sp0.9 from training. Duration: 20.861125 +2023-05-10 23:05:17,389 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-65654-0031-41259-0_sp0.9 from training. Duration: 22.711125 +2023-05-10 23:05:20,789 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.ff2_skip_rate, batch_count=629080.0, ans=0.0 +2023-05-10 23:05:25,230 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=629080.0, ans=0.125 +2023-05-10 23:05:30,869 INFO [train.py:1021] (1/2) Epoch 35, batch 2200, loss[loss=0.1853, simple_loss=0.2728, pruned_loss=0.04885, over 36917.00 frames. ], tot_loss[loss=0.1696, simple_loss=0.2575, pruned_loss=0.04081, over 7119646.31 frames. ], batch size: 105, lr: 3.12e-03, grad_scale: 32.0 +2023-05-10 23:05:37,730 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=17.45 vs. limit=22.5 +2023-05-10 23:05:39,203 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=5.00 vs. limit=15.0 +2023-05-10 23:05:43,427 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.out_combiner.scale_min, batch_count=629130.0, ans=0.2 +2023-05-10 23:06:00,536 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0_sp1.1 from training. Duration: 22.986375 +2023-05-10 23:06:19,182 WARNING [train.py:1182] (1/2) Exclude cut with ID 8040-260924-0003-80960-0_sp0.9 from training. Duration: 22.07225 +2023-05-10 23:06:23,699 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0045-26330-0_sp0.9 from training. Duration: 20.3055625 +2023-05-10 23:06:26,751 WARNING [train.py:1182] (1/2) Exclude cut with ID 6356-271890-0060-94317-0_sp0.9 from training. Duration: 20.72225 +2023-05-10 23:06:45,118 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0_sp1.1 from training. Duration: 22.4818125 +2023-05-10 23:06:48,235 INFO [train.py:1021] (1/2) Epoch 35, batch 2250, loss[loss=0.1856, simple_loss=0.2696, pruned_loss=0.05084, over 32507.00 frames. ], tot_loss[loss=0.1702, simple_loss=0.258, pruned_loss=0.04116, over 7128570.09 frames. ], batch size: 170, lr: 3.12e-03, grad_scale: 32.0 +2023-05-10 23:06:55,224 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass.skip_rate, batch_count=629380.0, ans=0.07 +2023-05-10 23:07:13,445 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0_sp0.9 from training. Duration: 25.0944375 +2023-05-10 23:07:16,530 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0047-104881-0 from training. Duration: 21.515 +2023-05-10 23:07:22,463 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0_sp0.9 from training. Duration: 27.02225 +2023-05-10 23:07:28,259 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.920e+02 3.783e+02 4.267e+02 5.012e+02 8.448e+02, threshold=8.533e+02, percent-clipped=0.0 +2023-05-10 23:07:28,430 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0010-62480-0_sp0.9 from training. Duration: 22.22225 +2023-05-10 23:07:28,670 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=629480.0, ans=0.125 +2023-05-10 23:07:37,516 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0085-44554-0_sp0.9 from training. Duration: 20.85 +2023-05-10 23:08:05,661 INFO [train.py:1021] (1/2) Epoch 35, batch 2300, loss[loss=0.142, simple_loss=0.2237, pruned_loss=0.03012, over 36951.00 frames. ], tot_loss[loss=0.1698, simple_loss=0.2577, pruned_loss=0.04097, over 7136224.12 frames. ], batch size: 86, lr: 3.12e-03, grad_scale: 32.0 +2023-05-10 23:08:10,384 WARNING [train.py:1182] (1/2) Exclude cut with ID 4295-39940-0007-92567-0 from training. Duration: 21.54 +2023-05-10 23:08:16,404 WARNING [train.py:1182] (1/2) Exclude cut with ID 4964-30587-0040-44509-0_sp1.1 from training. Duration: 20.5318125 +2023-05-10 23:08:16,963 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=7.45 vs. limit=15.0 +2023-05-10 23:08:19,601 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer1.prob, batch_count=629680.0, ans=0.125 +2023-05-10 23:08:23,894 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0012-134311-0_sp0.9 from training. Duration: 21.9333125 +2023-05-10 23:08:31,762 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.balancer_ff2.min_abs, batch_count=629680.0, ans=0.1 +2023-05-10 23:08:32,026 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.nonlin_attention.whiten2, num_groups=1, num_channels=256, metric=3.75 vs. limit=15.0 +2023-05-10 23:08:44,689 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.4.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.6565, 3.8294, 4.2120, 3.7712], device='cuda:1') +2023-05-10 23:09:13,846 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0025-130151-0_sp0.9 from training. Duration: 21.7944375 +2023-05-10 23:09:23,011 INFO [train.py:1021] (1/2) Epoch 35, batch 2350, loss[loss=0.1538, simple_loss=0.238, pruned_loss=0.03476, over 37033.00 frames. ], tot_loss[loss=0.1702, simple_loss=0.258, pruned_loss=0.04121, over 7105658.72 frames. ], batch size: 94, lr: 3.12e-03, grad_scale: 16.0 +2023-05-10 23:09:26,837 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0002-12989-0_sp0.9 from training. Duration: 22.4666875 +2023-05-10 23:09:27,399 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=3.39 vs. limit=12.0 +2023-05-10 23:09:35,946 WARNING [train.py:1182] (1/2) Exclude cut with ID 6121-9014-0076-24124-0 from training. Duration: 21.635 +2023-05-10 23:09:40,557 WARNING [train.py:1182] (1/2) Exclude cut with ID 6121-9014-0076-24124-0_sp0.9 from training. Duration: 24.038875 +2023-05-10 23:09:43,871 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.skip_rate, batch_count=629930.0, ans=0.04949747468305833 +2023-05-10 23:10:01,491 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.hidden_balancer.prob, batch_count=629980.0, ans=0.125 +2023-05-10 23:10:05,524 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.729e+02 3.361e+02 3.913e+02 4.786e+02 7.040e+02, threshold=7.826e+02, percent-clipped=0.0 +2023-05-10 23:10:10,698 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([5.1308, 4.9046, 4.3684, 4.6835], device='cuda:1') +2023-05-10 23:10:12,564 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=5.25 vs. limit=15.0 +2023-05-10 23:10:19,452 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=12.16 vs. limit=15.0 +2023-05-10 23:10:27,557 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0_sp1.1 from training. Duration: 21.786375 +2023-05-10 23:10:38,432 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0002-12989-0 from training. Duration: 20.22 +2023-05-10 23:10:41,864 INFO [train.py:1021] (1/2) Epoch 35, batch 2400, loss[loss=0.1724, simple_loss=0.2615, pruned_loss=0.04171, over 37143.00 frames. ], tot_loss[loss=0.1706, simple_loss=0.2584, pruned_loss=0.04143, over 7093427.24 frames. ], batch size: 107, lr: 3.12e-03, grad_scale: 32.0 +2023-05-10 23:11:25,643 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.const_attention_rate, batch_count=630230.0, ans=0.025 +2023-05-10 23:11:33,328 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.6946, 4.9724, 5.1559, 4.8449], device='cuda:1') +2023-05-10 23:11:58,982 INFO [train.py:1021] (1/2) Epoch 35, batch 2450, loss[loss=0.1839, simple_loss=0.2755, pruned_loss=0.04612, over 37089.00 frames. ], tot_loss[loss=0.1697, simple_loss=0.2571, pruned_loss=0.04115, over 7111547.54 frames. ], batch size: 116, lr: 3.12e-03, grad_scale: 32.0 +2023-05-10 23:12:00,889 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.ff3_skip_rate, batch_count=630380.0, ans=0.0 +2023-05-10 23:12:02,358 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=630380.0, ans=0.1 +2023-05-10 23:12:14,270 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=5.55 vs. limit=15.0 +2023-05-10 23:12:24,375 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.conv_module2.balancer1.prob, batch_count=630430.0, ans=0.125 +2023-05-10 23:12:30,557 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.ff3_skip_rate, batch_count=630480.0, ans=0.0 +2023-05-10 23:12:32,419 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=8.78 vs. limit=12.0 +2023-05-10 23:12:34,263 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward3.hidden_balancer.prob, batch_count=630480.0, ans=0.125 +2023-05-10 23:12:41,210 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.818e+02 3.455e+02 3.820e+02 4.344e+02 5.837e+02, threshold=7.640e+02, percent-clipped=0.0 +2023-05-10 23:12:41,609 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_skip_rate, batch_count=630480.0, ans=0.0 +2023-05-10 23:12:51,977 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0 from training. Duration: 25.285 +2023-05-10 23:12:52,285 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.1.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.4730, 4.8328, 4.9445, 4.6275], device='cuda:1') +2023-05-10 23:12:53,836 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=630530.0, ans=0.125 +2023-05-10 23:13:03,679 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.prob, batch_count=630580.0, ans=0.125 +2023-05-10 23:13:08,175 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.ff2_skip_rate, batch_count=630580.0, ans=0.0 +2023-05-10 23:13:16,667 INFO [train.py:1021] (1/2) Epoch 35, batch 2500, loss[loss=0.168, simple_loss=0.2629, pruned_loss=0.03658, over 36925.00 frames. ], tot_loss[loss=0.1694, simple_loss=0.2568, pruned_loss=0.04103, over 7107087.12 frames. ], batch size: 108, lr: 3.12e-03, grad_scale: 32.0 +2023-05-10 23:13:23,209 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3933, 4.5657, 2.3477, 2.5243], device='cuda:1') +2023-05-10 23:13:32,657 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.2.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([5.0796, 4.2448, 4.6133, 4.6676], device='cuda:1') +2023-05-10 23:13:59,895 WARNING [train.py:1182] (1/2) Exclude cut with ID 811-130148-0001-63453-0_sp0.9 from training. Duration: 20.861125 +2023-05-10 23:14:04,836 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module2.balancer1.prob, batch_count=630780.0, ans=0.125 +2023-05-10 23:14:26,889 WARNING [train.py:1182] (1/2) Exclude cut with ID 6010-56788-0055-90261-0 from training. Duration: 20.88 +2023-05-10 23:14:30,096 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.prob, batch_count=630830.0, ans=0.125 +2023-05-10 23:14:34,482 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=630830.0, ans=0.1 +2023-05-10 23:14:38,801 INFO [train.py:1021] (1/2) Epoch 35, batch 2550, loss[loss=0.1758, simple_loss=0.2683, pruned_loss=0.04162, over 37195.00 frames. ], tot_loss[loss=0.1698, simple_loss=0.2573, pruned_loss=0.04117, over 7127676.27 frames. ], batch size: 102, lr: 3.12e-03, grad_scale: 32.0 +2023-05-10 23:14:40,735 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.bypass.scale_min, batch_count=630880.0, ans=0.2 +2023-05-10 23:14:54,703 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=630930.0, ans=0.1 +2023-05-10 23:15:00,507 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0045-15876-0_sp0.9 from training. Duration: 23.4166875 +2023-05-10 23:15:16,349 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=630980.0, ans=0.1 +2023-05-10 23:15:20,905 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.930e+02 3.487e+02 3.809e+02 4.410e+02 6.349e+02, threshold=7.619e+02, percent-clipped=0.0 +2023-05-10 23:15:34,556 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=9.66 vs. limit=15.0 +2023-05-10 23:15:52,635 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_module2.balancer2.prob, batch_count=631080.0, ans=0.125 +2023-05-10 23:15:59,282 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.balancer1.prob, batch_count=631130.0, ans=0.125 +2023-05-10 23:16:00,532 INFO [train.py:1021] (1/2) Epoch 35, batch 2600, loss[loss=0.1743, simple_loss=0.2683, pruned_loss=0.0402, over 37076.00 frames. ], tot_loss[loss=0.1694, simple_loss=0.2569, pruned_loss=0.04097, over 7153494.38 frames. ], batch size: 103, lr: 3.12e-03, grad_scale: 32.0 +2023-05-10 23:16:08,058 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=4, num_channels=128, metric=4.02 vs. limit=6.0 +2023-05-10 23:16:21,049 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.whiten, num_groups=1, num_channels=256, metric=5.38 vs. limit=12.0 +2023-05-10 23:16:27,775 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0006-134305-0 from training. Duration: 21.24 +2023-05-10 23:16:29,272 WARNING [train.py:1182] (1/2) Exclude cut with ID 6533-399-0047-104881-0_sp0.9 from training. Duration: 23.9055625 +2023-05-10 23:16:39,966 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder_embed.convnext.out_balancer.prob, batch_count=631230.0, ans=0.125 +2023-05-10 23:16:43,346 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.out_combiner.scale_min, batch_count=631230.0, ans=0.2 +2023-05-10 23:17:04,727 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0_sp0.9 from training. Duration: 25.988875 +2023-05-10 23:17:14,202 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0001-134300-0_sp0.9 from training. Duration: 20.67225 +2023-05-10 23:17:16,763 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn1.whiten, num_groups=1, num_channels=192, metric=10.99 vs. limit=22.5 +2023-05-10 23:17:18,662 INFO [train.py:1021] (1/2) Epoch 35, batch 2650, loss[loss=0.1604, simple_loss=0.2493, pruned_loss=0.03574, over 36835.00 frames. ], tot_loss[loss=0.169, simple_loss=0.2566, pruned_loss=0.04074, over 7149097.23 frames. ], batch size: 96, lr: 3.12e-03, grad_scale: 32.0 +2023-05-10 23:17:38,719 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=631430.0, ans=0.125 +2023-05-10 23:17:48,507 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.conv_skip_rate, batch_count=631480.0, ans=0.0 +2023-05-10 23:17:53,242 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=631480.0, ans=0.0 +2023-05-10 23:17:55,258 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=12.80 vs. limit=15.0 +2023-05-10 23:18:00,081 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.740e+02 3.483e+02 3.932e+02 4.452e+02 6.699e+02, threshold=7.863e+02, percent-clipped=0.0 +2023-05-10 23:18:03,225 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0038-41224-0 from training. Duration: 20.34 +2023-05-10 23:18:16,725 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.feed_forward3.out_whiten, num_groups=1, num_channels=192, metric=9.61 vs. limit=15.0 +2023-05-10 23:18:31,126 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=631580.0, ans=0.0 +2023-05-10 23:18:35,967 INFO [train.py:1021] (1/2) Epoch 35, batch 2700, loss[loss=0.1383, simple_loss=0.2213, pruned_loss=0.02759, over 35335.00 frames. ], tot_loss[loss=0.1691, simple_loss=0.2565, pruned_loss=0.04086, over 7125347.94 frames. ], batch size: 78, lr: 3.12e-03, grad_scale: 32.0 +2023-05-10 23:18:50,027 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.prob, batch_count=631680.0, ans=0.125 +2023-05-10 23:19:16,526 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.1.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=3.32 vs. limit=6.0 +2023-05-10 23:19:21,725 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0_sp0.9 from training. Duration: 25.061125 +2023-05-10 23:19:34,176 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0 from training. Duration: 0.83 +2023-05-10 23:19:42,096 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward3.hidden_balancer.prob, batch_count=631830.0, ans=0.125 +2023-05-10 23:19:45,039 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.const_attention_rate, batch_count=631830.0, ans=0.025 +2023-05-10 23:19:52,176 INFO [train.py:1021] (1/2) Epoch 35, batch 2750, loss[loss=0.1847, simple_loss=0.2703, pruned_loss=0.04951, over 36725.00 frames. ], tot_loss[loss=0.1689, simple_loss=0.2563, pruned_loss=0.04082, over 7120058.50 frames. ], batch size: 118, lr: 3.12e-03, grad_scale: 16.0 +2023-05-10 23:19:52,816 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.conv_module1.whiten, num_groups=1, num_channels=256, metric=3.09 vs. limit=15.0 +2023-05-10 23:19:57,592 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.balancer2.prob, batch_count=631880.0, ans=0.125 +2023-05-10 23:20:00,306 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0017-41203-0 from training. Duration: 24.73 +2023-05-10 23:20:15,565 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0 from training. Duration: 23.965 +2023-05-10 23:20:25,104 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0030-146996-0_sp0.9 from training. Duration: 22.088875 +2023-05-10 23:20:28,896 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=5.68 vs. limit=15.0 +2023-05-10 23:20:35,727 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.763e+02 3.488e+02 3.826e+02 4.512e+02 6.591e+02, threshold=7.652e+02, percent-clipped=0.0 +2023-05-10 23:20:40,301 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0006-134305-0_sp0.9 from training. Duration: 23.6 +2023-05-10 23:20:50,169 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.const_attention_rate, batch_count=632030.0, ans=0.025 +2023-05-10 23:21:09,362 INFO [train.py:1021] (1/2) Epoch 35, batch 2800, loss[loss=0.1717, simple_loss=0.2637, pruned_loss=0.03983, over 36813.00 frames. ], tot_loss[loss=0.1688, simple_loss=0.2561, pruned_loss=0.04072, over 7129033.99 frames. ], batch size: 111, lr: 3.11e-03, grad_scale: 32.0 +2023-05-10 23:21:22,346 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.bypass.scale_min, batch_count=632130.0, ans=0.2 +2023-05-10 23:21:25,331 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.bypass.skip_rate, batch_count=632180.0, ans=0.04949747468305833 +2023-05-10 23:21:26,884 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_module1.balancer2.prob, batch_count=632180.0, ans=0.125 +2023-05-10 23:21:34,621 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.0.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 23:21:34,725 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.9851, 3.5184, 3.2430, 4.1654, 2.6730, 3.5878, 4.1596, 3.5670], + device='cuda:1') +2023-05-10 23:22:14,892 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.nonlin_attention.balancer.max_positive, batch_count=632330.0, ans=0.95 +2023-05-10 23:22:20,029 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.2.encoder.layers.0.feed_forward1.out_whiten, num_groups=1, num_channels=256, metric=4.66 vs. limit=15.0 +2023-05-10 23:22:26,972 INFO [train.py:1021] (1/2) Epoch 35, batch 2850, loss[loss=0.1774, simple_loss=0.269, pruned_loss=0.04284, over 35856.00 frames. ], tot_loss[loss=0.168, simple_loss=0.2555, pruned_loss=0.04023, over 7139586.66 frames. ], batch size: 133, lr: 3.11e-03, grad_scale: 32.0 +2023-05-10 23:22:28,649 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0 from training. Duration: 23.795 +2023-05-10 23:22:34,754 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.feed_forward2.hidden_balancer.prob, batch_count=632380.0, ans=0.125 +2023-05-10 23:22:45,711 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0_sp1.1 from training. Duration: 21.5409375 +2023-05-10 23:22:47,219 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0_sp0.9 from training. Duration: 24.97775 +2023-05-10 23:22:54,861 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 23:22:59,528 WARNING [train.py:1182] (1/2) Exclude cut with ID 1085-156170-0017-128270-0_sp0.9 from training. Duration: 23.3444375 +2023-05-10 23:23:02,843 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward2.hidden_balancer.prob, batch_count=632480.0, ans=0.125 +2023-05-10 23:23:11,267 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.902e+02 3.424e+02 3.787e+02 4.327e+02 7.177e+02, threshold=7.573e+02, percent-clipped=0.0 +2023-05-10 23:23:28,202 WARNING [train.py:1182] (1/2) Exclude cut with ID 6010-56788-0055-90261-0_sp0.9 from training. Duration: 23.2 +2023-05-10 23:23:34,803 WARNING [train.py:1182] (1/2) Exclude cut with ID 5653-46179-0060-117930-0_sp0.9 from training. Duration: 21.17225 +2023-05-10 23:23:43,637 INFO [train.py:1021] (1/2) Epoch 35, batch 2900, loss[loss=0.173, simple_loss=0.2654, pruned_loss=0.04027, over 32115.00 frames. ], tot_loss[loss=0.1684, simple_loss=0.256, pruned_loss=0.04035, over 7127439.16 frames. ], batch size: 170, lr: 3.11e-03, grad_scale: 16.0 +2023-05-10 23:23:45,466 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.0.layers.0.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 23:23:54,782 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0_sp0.9 from training. Duration: 24.6555625 +2023-05-10 23:24:19,374 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.conv_module1.balancer1.max_abs, batch_count=632730.0, ans=10.0 +2023-05-10 23:24:34,116 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.0.self_attn2.whiten, num_groups=1, num_channels=192, metric=10.30 vs. limit=22.5 +2023-05-10 23:24:46,155 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.const_attention_rate, batch_count=632830.0, ans=0.025 +2023-05-10 23:24:47,785 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.3819, 4.5613, 2.1488, 2.5261], device='cuda:1') +2023-05-10 23:24:51,909 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-65654-0031-41259-0 from training. Duration: 20.44 +2023-05-10 23:25:00,981 INFO [train.py:1021] (1/2) Epoch 35, batch 2950, loss[loss=0.1527, simple_loss=0.2419, pruned_loss=0.0318, over 36863.00 frames. ], tot_loss[loss=0.1685, simple_loss=0.2561, pruned_loss=0.04043, over 7117326.42 frames. ], batch size: 96, lr: 3.11e-03, grad_scale: 16.0 +2023-05-10 23:25:02,927 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=632880.0, ans=0.1 +2023-05-10 23:25:02,933 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.nonlin_attention.balancer.prob, batch_count=632880.0, ans=0.125 +2023-05-10 23:25:08,600 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0018-132285-0_sp0.9 from training. Duration: 23.45 +2023-05-10 23:25:18,648 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.8229, 3.4943, 3.1782, 4.1142, 2.1984, 3.5272, 4.1096, 3.5719], + device='cuda:1') +2023-05-10 23:25:26,067 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.nonlin_attention.balancer.prob, batch_count=632930.0, ans=0.125 +2023-05-10 23:25:40,191 WARNING [train.py:1182] (1/2) Exclude cut with ID 6945-60535-0076-12784-0_sp0.9 from training. Duration: 20.52225 +2023-05-10 23:25:45,922 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.888e+02 3.409e+02 3.645e+02 4.101e+02 8.092e+02, threshold=7.289e+02, percent-clipped=1.0 +2023-05-10 23:25:47,557 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0 from training. Duration: 22.19 +2023-05-10 23:25:58,176 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0_sp1.1 from training. Duration: 25.3818125 +2023-05-10 23:26:01,963 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.self_attn1.whiten, num_groups=1, num_channels=256, metric=8.37 vs. limit=22.5 +2023-05-10 23:26:17,011 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0043-132310-0_sp0.9 from training. Duration: 28.0944375 +2023-05-10 23:26:18,469 INFO [train.py:1021] (1/2) Epoch 35, batch 3000, loss[loss=0.1509, simple_loss=0.2311, pruned_loss=0.03539, over 36962.00 frames. ], tot_loss[loss=0.1687, simple_loss=0.2562, pruned_loss=0.04054, over 7106682.22 frames. ], batch size: 86, lr: 3.11e-03, grad_scale: 16.0 +2023-05-10 23:26:18,470 INFO [train.py:1048] (1/2) Computing validation loss +2023-05-10 23:26:29,671 INFO [train.py:1057] (1/2) Epoch 35, validation: loss=0.1514, simple_loss=0.2521, pruned_loss=0.02528, over 944034.00 frames. +2023-05-10 23:26:29,671 INFO [train.py:1058] (1/2) Maximum memory allocated so far is 18883MB +2023-05-10 23:26:33,091 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.feed_forward1.out_proj.dropout_p, batch_count=633130.0, ans=0.1 +2023-05-10 23:26:35,792 WARNING [train.py:1182] (1/2) Exclude cut with ID 2195-150901-0045-59933-0_sp0.9 from training. Duration: 22.9444375 +2023-05-10 23:26:43,122 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0_sp1.1 from training. Duration: 21.6318125 +2023-05-10 23:26:56,141 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.bypass.scale_min, batch_count=633180.0, ans=0.2 +2023-05-10 23:27:02,043 WARNING [train.py:1182] (1/2) Exclude cut with ID 8631-249866-0030-130156-0 from training. Duration: 23.695 +2023-05-10 23:27:22,793 INFO [scaling.py:1059] (1/2) WithLoss: name=encoder.encoders.2.encoder.layers.1.self_attn_weights, loss-sum=0.000e+00 +2023-05-10 23:27:30,011 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0 from training. Duration: 23.955 +2023-05-10 23:27:35,219 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.0.conv_module2.whiten, num_groups=1, num_channels=256, metric=6.42 vs. limit=15.0 +2023-05-10 23:27:46,480 INFO [train.py:1021] (1/2) Epoch 35, batch 3050, loss[loss=0.1741, simple_loss=0.269, pruned_loss=0.03963, over 36830.00 frames. ], tot_loss[loss=0.1687, simple_loss=0.2563, pruned_loss=0.04053, over 7115374.41 frames. ], batch size: 111, lr: 3.11e-03, grad_scale: 16.0 +2023-05-10 23:27:52,025 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=633380.0, ans=0.0 +2023-05-10 23:27:55,017 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module1.balancer2.min_abs, batch_count=633380.0, ans=0.5 +2023-05-10 23:27:59,584 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=633380.0, ans=0.0 +2023-05-10 23:28:02,232 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0024-13011-0_sp0.9 from training. Duration: 26.438875 +2023-05-10 23:28:18,037 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.const_attention_rate, batch_count=633480.0, ans=0.025 +2023-05-10 23:28:26,143 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=8.38 vs. limit=15.0 +2023-05-10 23:28:27,056 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.self_attn_weights.pos_emb_skip_rate, batch_count=633480.0, ans=0.0 +2023-05-10 23:28:31,207 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.859e+02 3.541e+02 4.043e+02 4.790e+02 7.054e+02, threshold=8.086e+02, percent-clipped=0.0 +2023-05-10 23:28:36,287 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.1.conv_skip_rate, batch_count=633530.0, ans=0.0 +2023-05-10 23:28:51,729 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0021-26306-0_sp0.9 from training. Duration: 21.2444375 +2023-05-10 23:28:53,272 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0014-15845-0_sp0.9 from training. Duration: 31.02225 +2023-05-10 23:29:02,347 WARNING [train.py:1182] (1/2) Exclude cut with ID 432-122774-0017-62487-0 from training. Duration: 22.395 +2023-05-10 23:29:03,777 INFO [train.py:1021] (1/2) Epoch 35, batch 3100, loss[loss=0.1626, simple_loss=0.2572, pruned_loss=0.03397, over 37078.00 frames. ], tot_loss[loss=0.1683, simple_loss=0.2559, pruned_loss=0.0404, over 7116735.35 frames. ], batch size: 103, lr: 3.11e-03, grad_scale: 16.0 +2023-05-10 23:29:04,956 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.balancer.prob, batch_count=633630.0, ans=0.125 +2023-05-10 23:29:09,484 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.conv_skip_rate, batch_count=633630.0, ans=0.0 +2023-05-10 23:29:21,115 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0045-15876-0 from training. Duration: 21.075 +2023-05-10 23:29:27,210 WARNING [train.py:1182] (1/2) Exclude cut with ID 6482-98857-0025-147532-0_sp0.9 from training. Duration: 20.0055625 +2023-05-10 23:29:27,222 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0037-132304-0_sp0.9 from training. Duration: 22.05 +2023-05-10 23:29:28,673 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0 from training. Duration: 26.8349375 +2023-05-10 23:29:30,324 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0_sp1.1 from training. Duration: 22.1090625 +2023-05-10 23:29:38,530 WARNING [train.py:1182] (1/2) Exclude cut with ID 7699-105389-0094-26379-0_sp0.9 from training. Duration: 26.6166875 +2023-05-10 23:29:52,706 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.conv_module2.whiten, num_groups=1, num_channels=192, metric=5.29 vs. limit=15.0 +2023-05-10 23:29:58,346 WARNING [train.py:1182] (1/2) Exclude cut with ID 2046-178027-0000-53705-0_sp0.9 from training. Duration: 20.3055625 +2023-05-10 23:30:07,634 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward3.hidden_balancer.prob, batch_count=633830.0, ans=0.125 +2023-05-10 23:30:21,001 INFO [train.py:1021] (1/2) Epoch 35, batch 3150, loss[loss=0.1873, simple_loss=0.2809, pruned_loss=0.04689, over 36717.00 frames. ], tot_loss[loss=0.1691, simple_loss=0.2565, pruned_loss=0.04085, over 7113823.33 frames. ], batch size: 122, lr: 3.11e-03, grad_scale: 16.0 +2023-05-10 23:30:21,100 WARNING [train.py:1182] (1/2) Exclude cut with ID 7205-50138-0008-5373-0_sp0.9 from training. Duration: 20.7 +2023-05-10 23:30:23,399 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=3.46 vs. limit=12.0 +2023-05-10 23:30:29,423 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.1.nonlin_attention.whiten1, num_groups=1, num_channels=192, metric=3.29 vs. limit=10.0 +2023-05-10 23:31:04,933 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0 from training. Duration: 22.48 +2023-05-10 23:31:06,219 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 3.008e+02 3.581e+02 4.099e+02 4.776e+02 7.310e+02, threshold=8.199e+02, percent-clipped=0.0 +2023-05-10 23:31:21,464 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0003-134302-0_sp0.9 from training. Duration: 29.816625 +2023-05-10 23:31:38,547 INFO [train.py:1021] (1/2) Epoch 35, batch 3200, loss[loss=0.1597, simple_loss=0.2441, pruned_loss=0.03765, over 37072.00 frames. ], tot_loss[loss=0.1692, simple_loss=0.2567, pruned_loss=0.04088, over 7100380.76 frames. ], batch size: 94, lr: 3.11e-03, grad_scale: 32.0 +2023-05-10 23:31:45,238 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0005-134304-0_sp1.1 from training. Duration: 22.7590625 +2023-05-10 23:31:51,388 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0 from training. Duration: 22.555 +2023-05-10 23:32:11,342 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.balancer1.prob, batch_count=634230.0, ans=0.125 +2023-05-10 23:32:12,518 WARNING [train.py:1182] (1/2) Exclude cut with ID 1250-135782-0005-25975-0_sp0.9 from training. Duration: 21.688875 +2023-05-10 23:32:40,093 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward2.out_whiten, num_groups=1, num_channels=256, metric=10.40 vs. limit=15.0 +2023-05-10 23:32:49,233 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.5.encoder.layers.1.whiten, num_groups=1, num_channels=256, metric=4.04 vs. limit=12.0 +2023-05-10 23:32:50,228 WARNING [train.py:1182] (1/2) Exclude cut with ID 3488-85273-0038-41224-0_sp0.9 from training. Duration: 22.6 +2023-05-10 23:32:56,117 INFO [train.py:1021] (1/2) Epoch 35, batch 3250, loss[loss=0.184, simple_loss=0.2771, pruned_loss=0.04543, over 35917.00 frames. ], tot_loss[loss=0.1691, simple_loss=0.2567, pruned_loss=0.04073, over 7119541.29 frames. ], batch size: 133, lr: 3.11e-03, grad_scale: 16.0 +2023-05-10 23:33:10,092 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.feed_forward1.out_proj.dropout_p, batch_count=634430.0, ans=0.1 +2023-05-10 23:33:28,745 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0009-15840-0 from training. Duration: 24.32 +2023-05-10 23:33:29,104 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.3.encoder.layers.1.self_attn_weights, attn_weights_entropy = tensor([2.0460, 4.0959, 3.7710, 4.0937, 3.5064, 3.1486, 3.5770, 3.1226], + device='cuda:1') +2023-05-10 23:33:43,086 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.884e+02 3.361e+02 3.651e+02 4.223e+02 5.327e+02, threshold=7.301e+02, percent-clipped=0.0 +2023-05-10 23:34:05,207 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.0.layers.1.self_attn1.whiten, num_groups=1, num_channels=192, metric=11.12 vs. limit=22.5 +2023-05-10 23:34:12,848 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.0.ff2_skip_rate, batch_count=634630.0, ans=0.0 +2023-05-10 23:34:13,887 INFO [train.py:1021] (1/2) Epoch 35, batch 3300, loss[loss=0.1788, simple_loss=0.2652, pruned_loss=0.04624, over 36842.00 frames. ], tot_loss[loss=0.1688, simple_loss=0.2565, pruned_loss=0.04059, over 7123113.34 frames. ], batch size: 96, lr: 3.11e-03, grad_scale: 16.0 +2023-05-10 23:34:29,763 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.nonlin_attention.balancer.prob, batch_count=634680.0, ans=0.125 +2023-05-10 23:34:31,167 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-276745-0093-13116-0_sp0.9 from training. Duration: 21.061125 +2023-05-10 23:34:33,573 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.1.encoder.layers.1.conv_module2.whiten, num_groups=1, num_channels=256, metric=8.40 vs. limit=15.0 +2023-05-10 23:34:36,011 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module2.balancer2.prob, batch_count=634680.0, ans=0.125 +2023-05-10 23:34:43,771 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.5.encoder.layers.1.self_attn_weights.pos_emb_skip_rate, batch_count=634730.0, ans=0.0 +2023-05-10 23:34:44,983 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0024-15855-0_sp0.9 from training. Duration: 20.32225 +2023-05-10 23:34:58,717 WARNING [train.py:1182] (1/2) Exclude cut with ID 3033-130750-0096-55598-0_sp1.1 from training. Duration: 0.7545625 +2023-05-10 23:34:58,910 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.const_attention_rate, batch_count=634780.0, ans=0.025 +2023-05-10 23:35:00,346 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_module1.balancer2.prob, batch_count=634780.0, ans=0.125 +2023-05-10 23:35:16,046 WARNING [train.py:1182] (1/2) Exclude cut with ID 4295-39940-0007-92567-0_sp0.9 from training. Duration: 23.9333125 +2023-05-10 23:35:27,360 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.balancer1.prob, batch_count=634830.0, ans=0.125 +2023-05-10 23:35:31,520 INFO [train.py:1021] (1/2) Epoch 35, batch 3350, loss[loss=0.1724, simple_loss=0.2606, pruned_loss=0.04216, over 37128.00 frames. ], tot_loss[loss=0.1691, simple_loss=0.2568, pruned_loss=0.0407, over 7117135.83 frames. ], batch size: 98, lr: 3.11e-03, grad_scale: 16.0 +2023-05-10 23:35:39,417 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.0.conv_module2.balancer1.prob, batch_count=634880.0, ans=0.125 +2023-05-10 23:35:49,718 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0008-134307-0_sp1.1 from training. Duration: 20.17275 +2023-05-10 23:35:58,372 WARNING [train.py:1182] (1/2) Exclude cut with ID 6978-92210-0019-146985-0_sp1.1 from training. Duration: 20.436375 +2023-05-10 23:36:20,331 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.842e+02 3.618e+02 4.042e+02 4.842e+02 8.339e+02, threshold=8.083e+02, percent-clipped=3.0 +2023-05-10 23:36:21,137 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.4.encoder.layers.0.feed_forward3.out_whiten, num_groups=1, num_channels=256, metric=11.07 vs. limit=15.0 +2023-05-10 23:36:50,581 INFO [train.py:1021] (1/2) Epoch 35, batch 3400, loss[loss=0.1395, simple_loss=0.2245, pruned_loss=0.02726, over 36805.00 frames. ], tot_loss[loss=0.1695, simple_loss=0.2572, pruned_loss=0.04091, over 7104943.91 frames. ], batch size: 84, lr: 3.11e-03, grad_scale: 16.0 +2023-05-10 23:37:31,891 WARNING [train.py:1182] (1/2) Exclude cut with ID 4234-40345-0022-142709-0_sp0.9 from training. Duration: 23.1055625 +2023-05-10 23:37:34,130 WARNING [train.py:1182] (1/2) Exclude cut with ID 8291-282929-0007-12994-0_sp1.1 from training. Duration: 23.5 +2023-05-10 23:37:43,661 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass.skip_rate, batch_count=635280.0, ans=0.07 +2023-05-10 23:37:44,838 WARNING [train.py:1182] (1/2) Exclude cut with ID 7255-291500-0009-134308-0_sp0.9 from training. Duration: 26.62775 +2023-05-10 23:37:59,087 WARNING [train.py:1182] (1/2) Exclude cut with ID 6951-79737-0018-132285-0 from training. Duration: 21.105 +2023-05-10 23:37:59,403 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.3.encoder.layers.0.conv_module2.balancer2.prob, batch_count=635280.0, ans=0.125 +2023-05-10 23:38:03,637 WARNING [train.py:1182] (1/2) Exclude cut with ID 4511-76322-0006-80011-0_sp0.9 from training. Duration: 24.411125 +2023-05-10 23:38:17,563 INFO [train.py:1021] (1/2) Epoch 35, batch 3450, loss[loss=0.1519, simple_loss=0.2365, pruned_loss=0.03368, over 36958.00 frames. ], tot_loss[loss=0.1694, simple_loss=0.2572, pruned_loss=0.04084, over 7100625.65 frames. ], batch size: 91, lr: 3.11e-03, grad_scale: 16.0 +2023-05-10 23:38:34,502 WARNING [train.py:1182] (1/2) Exclude cut with ID 6758-72288-0033-108368-0_sp1.1 from training. Duration: 21.263625 +2023-05-10 23:38:37,736 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.0.const_attention_rate, batch_count=635430.0, ans=0.025 +2023-05-10 23:38:39,237 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.1.conv_skip_rate, batch_count=635430.0, ans=0.0 +2023-05-10 23:38:47,421 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([2.8279, 4.0135, 4.4619, 4.6908], device='cuda:1') +2023-05-10 23:39:06,460 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.839e+02 3.529e+02 4.244e+02 5.161e+02 8.417e+02, threshold=8.487e+02, percent-clipped=1.0 +2023-05-10 23:39:11,226 WARNING [train.py:1182] (1/2) Exclude cut with ID 4234-40345-0022-142709-0 from training. Duration: 20.795 +2023-05-10 23:39:14,657 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=635530.0, ans=0.1 +2023-05-10 23:39:22,306 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0021-15852-0 from training. Duration: 24.76 +2023-05-10 23:39:24,705 WARNING [train.py:1182] (1/2) Exclude cut with ID 3867-173237-0077-144769-0_sp0.9 from training. Duration: 22.25 +2023-05-10 23:39:31,575 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.0.layers.0.feed_forward3.hidden_balancer.prob, batch_count=635580.0, ans=0.125 +2023-05-10 23:39:41,803 INFO [train.py:1021] (1/2) Epoch 35, batch 3500, loss[loss=0.1728, simple_loss=0.2636, pruned_loss=0.04101, over 37078.00 frames. ], tot_loss[loss=0.1697, simple_loss=0.2575, pruned_loss=0.04096, over 7079439.80 frames. ], batch size: 110, lr: 3.11e-03, grad_scale: 16.0 +2023-05-10 23:39:50,903 WARNING [train.py:1182] (1/2) Exclude cut with ID 7357-94126-0026-15857-0_sp1.1 from training. Duration: 20.5045625 +2023-05-10 23:39:51,593 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.self_attn_weights.whiten_keys, num_groups=8, num_channels=256, metric=3.82 vs. limit=6.0 +2023-05-10 23:41:04,326 INFO [train.py:1021] (1/2) Epoch 35, batch 3550, loss[loss=0.1837, simple_loss=0.2739, pruned_loss=0.04669, over 36411.00 frames. ], tot_loss[loss=0.1703, simple_loss=0.258, pruned_loss=0.04128, over 7039742.06 frames. ], batch size: 126, lr: 3.11e-03, grad_scale: 16.0 +2023-05-10 23:41:30,656 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.1.encoder.layers.1.conv_module2.balancer1.prob, batch_count=635930.0, ans=0.125 +2023-05-10 23:41:32,107 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.4.encoder.layers.0.bypass.scale_min, batch_count=635930.0, ans=0.2 +2023-05-10 23:41:49,397 INFO [optim.py:478] (1/2) Clipping_scale=2.0, grad-norm quartiles 2.730e+02 3.468e+02 3.899e+02 4.621e+02 6.941e+02, threshold=7.798e+02, percent-clipped=0.0 +2023-05-10 23:41:54,210 INFO [zipformer.py:1666] (1/2) name=encoder.encoders.5.encoder.layers.0.self_attn_weights, attn_weights_entropy = tensor([3.0126, 4.1310, 4.6464, 4.8791], device='cuda:1') +2023-05-10 23:42:18,605 INFO [train.py:1021] (1/2) Epoch 35, batch 3600, loss[loss=0.1824, simple_loss=0.2721, pruned_loss=0.04636, over 35958.00 frames. ], tot_loss[loss=0.1696, simple_loss=0.2573, pruned_loss=0.04096, over 7076007.90 frames. ], batch size: 133, lr: 3.11e-03, grad_scale: 32.0 +2023-05-10 23:42:29,345 INFO [scaling.py:969] (1/2) Whitening: name=encoder.encoders.3.encoder.layers.0.conv_module1.whiten, num_groups=1, num_channels=256, metric=4.75 vs. limit=15.0 +2023-05-10 23:42:41,440 INFO [scaling.py:178] (1/2) ScheduledFloat: name=encoder.encoders.2.encoder.layers.1.feed_forward1.out_proj.dropout_p, batch_count=636180.0, ans=0.1 +2023-05-10 23:43:13,881 INFO [train.py:1281] (1/2) Done!