diff --git "a/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.1.log" "b/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.1.log"
new file mode 100644--- /dev/null
+++ "b/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.1.log"
@@ -0,0 +1,3175 @@
+# Running on gpua005.delta.ncsa.illinois.edu
+# Started at Tue Dec 19 07:29:08 CST 2023
+# SLURMD_NODENAME=gpua005
+# SLURM_CLUSTER_NAME=delta
+# SLURM_CONF=/var/spool/slurmd/conf-cache/slurm.conf
+# SLURM_CPUS_ON_NODE=64
+# SLURM_CPUS_PER_TASK=64
+# SLURM_EXPORT_ENV=PATH
+# SLURM_GET_USER_ENV=1
+# SLURM_GPUS_ON_NODE=4
+# SLURM_GTIDS=0
+# SLURM_JOBID=2757381
+# SLURM_JOB_ACCOUNT=bbjs-delta-gpu
+# SLURM_JOB_CPUS_PER_NODE='64(x16)'
+# SLURM_JOB_END_TIME=1703165328
+# SLURM_JOB_GID=202
+# SLURM_JOB_GPUS=0,1,2,3
+# SLURM_JOB_ID=2757381
+# SLURM_JOB_NAME=exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.log
+# SLURM_JOB_NODELIST='gpua[005,007-008,016,026,030,032,036,039,041,082,086,092,097-099]'
+# SLURM_JOB_NUM_NODES=16
+# SLURM_JOB_PARTITION=gpuA100x4
+# SLURM_JOB_QOS=bbjs-delta-gpu
+# SLURM_JOB_START_TIME=1702992528
+# SLURM_JOB_UID=68077
+# SLURM_JOB_USER=peng6
+# SLURM_LOCALID=0
+# SLURM_MEM_PER_NODE=240000
+# SLURM_NNODES=16
+# SLURM_NODEID=0
+# SLURM_NODELIST='gpua[005,007-008,016,026,030,032,036,039,041,082,086,092,097-099]'
+# SLURM_NODE_ALIASES='(null)'
+# SLURM_OPEN_MODE=a
+# SLURM_PRIO_PROCESS=0
+# SLURM_PROCID=0
+# SLURM_SUBMIT_DIR=/scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1
+# SLURM_SUBMIT_HOST=dt-login01.delta.ncsa.illinois.edu
+# SLURM_TASKS_PER_NODE='1(x16)'
+# SLURM_TASK_PID=3000674
+# SLURM_TOPOLOGY_ADDR=ss00.ss05.gpua005
+# SLURM_TOPOLOGY_ADDR_PATTERN=switch.switch.node
+# SLURM_WORKING_CLUSTER=delta:dt-sched:6817:9984:109
+# srun --export=ALL python3 -m espnet2.bin.s2t_train --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_2213554f-2b2b-4da8-aa68-5be2ec04273a 
+/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape /scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape /scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape /scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape /scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape /scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape /scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape /scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape /scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape /scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape /scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape /scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape /scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape /scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape /scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape /scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_2213554f-2b2b-4da8-aa68-5be2ec04273a
+--fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multipr--fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_2213554f-2b2b-4da8-aa68-5be2ec04273a
+--fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multipr--fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_2213554f-2b2b-4da8-aa68-5be2ec04273a
+ocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_2213554f-2b2b-4da8-aa68-5be2ec04273a
+ocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_2213554f-2b2b-4da8-aa68-5be2ec04273a
+--fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_2213554f-2b2b-4da8-aa68-5be2ec04273a
+--fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multipr--fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multipr--fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_2213554f-2b2b-4da8-aa68-5be2ec04273a
+ocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_2213554f-2b2b-4da8-aa68-5be2ec04273a
+ocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_2213554f-2b2b-4da8-aa68-5be2ec04273a
+--fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_2213554f-2b2b-4da8-aa68-5be2ec04273a
+--fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multipr--fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multipr--fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multipr--fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_2213554f-2b2b-4da8-aa68-5be2ec04273a
+ocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_2213554f-2b2b-4da8-aa68-5be2ec04273a
+ocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_2213554f-2b2b-4da8-aa68-5be2ec04273a
+ocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_2213554f-2b2b-4da8-aa68-5be2ec04273a
+--fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multipr--fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_2213554f-2b2b-4da8-aa68-5be2ec04273a
+ocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_2213554f-2b2b-4da8-aa68-5be2ec04273a
+[gpua005:0/64] 2023-12-19 07:33:07,060 (distributed_c10d:319) INFO: Added key: store_based_barrier_key:1 to store for rank: 0
+[gpua005:0/64] 2023-12-19 07:33:08,654 (distributed_c10d:353) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 64 nodes.
+[gpua005:0/64] 2023-12-19 07:33:08,685 (s2t:464) INFO: Vocabulary size: 50002
+[gpua005:0/64] 2023-12-19 07:33:21,834 (abs_task:1231) INFO: pytorch.version=1.13.1, cuda.available=True, cudnn.version=8500, cudnn.benchmark=False, cudnn.deterministic=True
+[gpua005:0/64] 2023-12-19 07:33:21,845 (abs_task:1232) INFO: Model structure:
+ESPnetS2TModel(
+  (frontend): DefaultFrontend(
+    (stft): Stft(n_fft=512, win_length=400, hop_length=160, center=True, normalized=False, onesided=True)
+    (frontend): Frontend()
+    (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False)
+  )
+  (specaug): SpecAug(
+    (freq_mask): MaskAlongAxis(mask_width_range=[0, 27], num_mask=2, axis=freq)
+    (time_mask): MaskAlongAxisVariableMaxWidth(mask_width_ratio_range=[0.0, 0.05], num_mask=10, axis=time)
+  )
+  (normalize): GlobalMVN(stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz, norm_means=True, norm_vars=True)
+  (encoder): EBranchformerEncoder(
+    (embed): Conv2dSubsampling(
+      (conv): Sequential(
+        (0): Conv2d(1, 1024, kernel_size=(3, 3), stride=(2, 2))
+        (1): ReLU()
+        (2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2))
+        (3): ReLU()
+      )
+      (out): Sequential(
+        (0): Linear(in_features=19456, out_features=1024, bias=True)
+        (1): PositionalEncoding(
+          (dropout): Dropout(p=0.1, inplace=False)
+        )
+      )
+    )
+    (encoders): MultiSequential(
+      (0): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+      (1): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+      (2): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+      (3): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+      (4): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+      (5): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+      (6): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+      (7): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+      (8): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+      (9): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+      (10): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+      (11): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+      (12): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+      (13): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+      (14): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+      (15): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+      (16): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+      (17): EBranchformerEncoderLayer(
+        (attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (cgmlp): ConvolutionalGatingMLP(
+          (channel_proj1): Sequential(
+            (0): Linear(in_features=1024, out_features=4096, bias=True)
+            (1): GELU(approximate='none')
+          )
+          (csgu): ConvolutionalSpatialGatingUnit(
+            (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
+            (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+            (act): Identity()
+            (dropout): Dropout(p=0.1, inplace=False)
+          )
+          (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (feed_forward_macaron): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): Swish()
+        )
+        (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+        (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
+        (merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
+      )
+    )
+    (after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+  )
+  (decoder): TransformerDecoder(
+    (embed): Sequential(
+      (0): Embedding(50002, 1024)
+      (1): PositionalEncoding(
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+    )
+    (after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+    (output_layer): Linear(in_features=1024, out_features=50002, bias=True)
+    (decoders): MultiSequential(
+      (0): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (1): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (2): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (3): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (4): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (5): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (6): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (7): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (8): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (9): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (10): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (11): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (12): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (13): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (14): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (15): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (16): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+      (17): DecoderLayer(
+        (self_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (src_attn): MultiHeadedAttention(
+          (linear_q): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_k): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_v): Linear(in_features=1024, out_features=1024, bias=True)
+          (linear_out): Linear(in_features=1024, out_features=1024, bias=True)
+          (dropout): Identity()
+          (q_norm): Identity()
+          (k_norm): Identity()
+        )
+        (feed_forward): PositionwiseFeedForward(
+          (w_1): Linear(in_features=1024, out_features=4096, bias=True)
+          (w_2): Linear(in_features=4096, out_features=1024, bias=True)
+          (dropout): Dropout(p=0.1, inplace=False)
+          (activation): ReLU()
+        )
+        (norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
+        (dropout): Dropout(p=0.1, inplace=False)
+      )
+    )
+  )
+  (criterion_att): LabelSmoothingLoss(
+    (criterion): KLDivLoss()
+  )
+  (ctc): CTC(
+    (ctc_lo): Linear(in_features=1024, out_features=50002, bias=True)
+    (ctc_loss): CTCLoss()
+  )
+)
+
+Model summary:
+    Class Name: ESPnetS2TModel
+    Total Number of model parameters: 1.02 B
+    Number of trainable parameters: 1.02 B (100.0%)
+    Size: 4.07 GB
+    Type: torch.float32
+[gpua005:0/64] 2023-12-19 07:33:21,846 (abs_task:1235) INFO: Optimizer:
+AdamW (
+Parameter Group 0
+    amsgrad: False
+    betas: [0.9, 0.98]
+    capturable: False
+    eps: 1e-06
+    foreach: None
+    initial_lr: 0.0002
+    lr: 1.6666666666666667e-09
+    maximize: False
+    weight_decay: 0.0
+)
+[gpua005:0/64] 2023-12-19 07:33:21,846 (abs_task:1236) INFO: Scheduler: PiecewiseLinearWarmupLR(warmup_steps_list=[0, 30000, 60000], warmup_lr_list=[0.0, 5e-05, 0.0002])
+[gpua005:0/64] 2023-12-19 07:33:21,847 (abs_task:1245) INFO: Saving the configuration in exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/config.yaml
+[gpua005:0/64] 2023-12-19 07:33:27,223 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 07:33:28,118 (abs_task:1616) INFO: [valid] dataset:
+ESPnetDataset(
+  speech: {"path": "dump/raw/dev_v3/wav.scp", "type": "kaldi_ark"}
+  text_prev: {"path": "dump/raw/dev_v3/text.prev", "type": "text"}
+  text_ctc: {"path": "dump/raw/dev_v3/text.ctc", "type": "text"}
+  text: {"path": "dump/raw/dev_v3/text", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f155c4639a0>)
+[gpua005:0/64] 2023-12-19 07:33:28,118 (abs_task:1617) INFO: [valid] Batch sampler: UnsortedBatchSampler(N-batch=4671, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/valid/speech_shape, 
+[gpua005:0/64] 2023-12-19 07:33:28,119 (abs_task:1618) INFO: [valid] mini-batch sizes summary: N-batch=4671, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 07:33:55,294 (trainer:159) INFO: The training was resumed using exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/checkpoint.pth
+gpua005:3000819:3000819 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.5<0>
+gpua005:3000819:3000819 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua005:3000819:3000819 [0] NCCL INFO cudaDriverVersion 12020
+NCCL version 2.14.3+cuda11.7
+[gpua005:0/64] 2023-12-19 07:34:01,232 (trainer:284) INFO: 41/45epoch started
+[gpua005:0/64] 2023-12-19 07:34:01,300 (multiple_iter_factory:32) INFO: Building 0th iter-factory...
+[gpua005:0/64] 2023-12-19 07:34:18,504 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 07:34:21,864 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.6", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.6", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.6", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.6", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f1388e56b90>)
+[gpua005:0/64] 2023-12-19 07:34:21,864 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.6, 
+[gpua005:0/64] 2023-12-19 07:34:21,868 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+gpua097:3145433:3145433 [1] NCCL INFO cudaDriverVersion 12020
+gpua097:3145433:3145433 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.97<0>
+gpua097:3145433:3145433 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua097:3145433:3145500 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.97<0>
+gpua097:3145433:3145500 [1] NCCL INFO Using network IB
+gpua097:3145433:3145500 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpua097:3145433:3145500 [1] NCCL INFO Trees [0] 54/-1/-1->53->52 [1] 54/56/-1->53->52
+gpua097:3145433:3145500 [1] NCCL INFO Channel 00/0 : 53[46000] -> 54[85000] via P2P/IPC/read
+gpua097:3145433:3145500 [1] NCCL INFO Channel 01/0 : 53[46000] -> 54[85000] via P2P/IPC/read
+gpua097:3145433:3145500 [1] NCCL INFO Connected all rings
+gpua097:3145433:3145500 [1] NCCL INFO Channel 01/0 : 53[46000] -> 56[7000] [send] via NET/IB/0
+gpua097:3145433:3145500 [1] NCCL INFO Channel 01/0 : 56[7000] -> 53[46000] [receive] via NET/IB/0
+gpua097:3145433:3145500 [1] NCCL INFO Channel 00/0 : 53[46000] -> 52[7000] via P2P/IPC/read
+gpua097:3145433:3145500 [1] NCCL INFO Channel 01/0 : 53[46000] -> 52[7000] via P2P/IPC/read
+gpua097:3145433:3145500 [1] NCCL INFO Connected all trees
+gpua097:3145433:3145500 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua097:3145433:3145500 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua097:3145433:3145500 [1] NCCL INFO comm 0x13021700 rank 53 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpua032:1037051:1037051 [2] NCCL INFO cudaDriverVersion 12020
+gpua032:1037051:1037051 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.32<0>
+gpua032:1037051:1037051 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua032:1037051:1037112 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.32<0>
+gpua032:1037051:1037112 [2] NCCL INFO Using network IB
+gpua032:1037051:1037112 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpua032:1037051:1037112 [2] NCCL INFO Trees [0] 27/-1/-1->26->25 [1] 27/-1/-1->26->25
+gpua032:1037051:1037112 [2] NCCL INFO Channel 00/0 : 26[85000] -> 27[c7000] via P2P/IPC/read
+gpua032:1037051:1037112 [2] NCCL INFO Channel 01/0 : 26[85000] -> 27[c7000] via P2P/IPC/read
+gpua032:1037051:1037112 [2] NCCL INFO Connected all rings
+gpua032:1037051:1037112 [2] NCCL INFO Channel 00/0 : 26[85000] -> 25[46000] via P2P/IPC/read
+gpua032:1037051:1037112 [2] NCCL INFO Channel 01/0 : 26[85000] -> 25[46000] via P2P/IPC/read
+gpua032:1037051:1037112 [2] NCCL INFO Connected all trees
+gpua032:1037051:1037112 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua032:1037051:1037112 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua032:1037051:1037112 [2] NCCL INFO comm 0x7f48183fa530 rank 26 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpua032:1037049:1037049 [0] NCCL INFO cudaDriverVersion 12020
+gpua032:1037049:1037049 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.32<0>
+gpua032:1037049:1037049 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua032:1037049:1037113 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.32<0>
+gpua032:1037049:1037113 [0] NCCL INFO Using network IB
+gpua032:1037049:1037113 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpua032:1037049:1037113 [0] NCCL INFO Trees [0] 25/28/-1->24->16 [1] 25/-1/-1->24->21
+gpua032:1037049:1037113 [0] NCCL INFO Channel 00/0 : 23[c7000] -> 24[7000] [receive] via NET/IB/0
+gpua032:1037049:1037113 [0] NCCL INFO Channel 01/0 : 23[c7000] -> 24[7000] [receive] via NET/IB/0
+gpua032:1037049:1037113 [0] NCCL INFO Channel 00/0 : 24[7000] -> 25[46000] via P2P/IPC/read
+gpua032:1037049:1037113 [0] NCCL INFO Channel 01/0 : 24[7000] -> 25[46000] via P2P/IPC/read
+gpua032:1037049:1037113 [0] NCCL INFO Connected all rings
+gpua032:1037049:1037113 [0] NCCL INFO Channel 01/0 : 21[46000] -> 24[7000] [receive] via NET/IB/0
+gpua032:1037049:1037113 [0] NCCL INFO Channel 00/0 : 24[7000] -> 28[7000] [send] via NET/IB/0
+gpua032:1037049:1037113 [0] NCCL INFO Channel 00/0 : 16[7000] -> 24[7000] [receive] via NET/IB/0
+gpua032:1037049:1037113 [0] NCCL INFO Channel 00/0 : 24[7000] -> 16[7000] [send] via NET/IB/0
+gpua032:1037049:1037113 [0] NCCL INFO Channel 00/0 : 28[7000] -> 24[7000] [receive] via NET/IB/0
+gpua032:1037049:1037113 [0] NCCL INFO Channel 01/0 : 24[7000] -> 21[46000] [send] via NET/IB/0
+gpua032:1037049:1037113 [0] NCCL INFO Connected all trees
+gpua032:1037049:1037113 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua032:1037049:1037113 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua032:1037049:1037113 [0] NCCL INFO comm 0x1dc184a0 rank 24 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpua097:3145432:3145432 [0] NCCL INFO cudaDriverVersion 12020
+gpua097:3145432:3145432 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.97<0>
+gpua097:3145432:3145432 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua097:3145432:3145502 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.97<0>
+gpua097:3145432:3145502 [0] NCCL INFO Using network IB
+gpua097:3145432:3145502 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpua097:3145432:3145502 [0] NCCL INFO Trees [0] 53/-1/-1->52->57 [1] 53/48/-1->52->45
+gpua097:3145432:3145502 [0] NCCL INFO Channel 00/0 : 51[c7000] -> 52[7000] [receive] via NET/IB/0
+gpua097:3145432:3145502 [0] NCCL INFO Channel 01/0 : 51[c7000] -> 52[7000] [receive] via NET/IB/0
+gpua097:3145432:3145502 [0] NCCL INFO Channel 00/0 : 52[7000] -> 53[46000] via P2P/IPC/read
+gpua097:3145432:3145502 [0] NCCL INFO Channel 01/0 : 52[7000] -> 53[46000] via P2P/IPC/read
+gpua097:3145432:3145502 [0] NCCL INFO Connected all rings
+gpua097:3145432:3145502 [0] NCCL INFO Channel 01/0 : 48[7000] -> 52[7000] [receive] via NET/IB/0
+gpua097:3145432:3145502 [0] NCCL INFO Channel 00/0 : 52[7000] -> 57[46000] [send] via NET/IB/0
+gpua097:3145432:3145502 [0] NCCL INFO Channel 01/0 : 45[46000] -> 52[7000] [receive] via NET/IB/0
+gpua097:3145432:3145502 [0] NCCL INFO Channel 01/0 : 52[7000] -> 45[46000] [send] via NET/IB/0
+gpua097:3145432:3145502 [0] NCCL INFO Channel 00/0 : 57[46000] -> 52[7000] [receive] via NET/IB/0
+gpua097:3145432:3145502 [0] NCCL INFO Channel 01/0 : 52[7000] -> 48[7000] [send] via NET/IB/0
+gpua097:3145432:3145502 [0] NCCL INFO Connected all trees
+gpua097:3145432:3145502 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua097:3145432:3145502 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua097:3145432:3145502 [0] NCCL INFO comm 0xbdfeb50 rank 52 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpua005:3000819:3000888 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.5<0>
+gpua005:3000819:3000888 [0] NCCL INFO Using network IB
+gpua005:3000819:3000888 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpua005:3000819:3000888 [0] NCCL INFO Channel 00/02 :    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19
+gpua005:3000819:3000888 [0] NCCL INFO Channel 01/02 :    0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19
+gpua005:3000819:3000888 [0] NCCL INFO Trees [0] 1/32/-1->0->-1 [1] 1/-1/-1->0->4
+gpua005:3000819:3000888 [0] NCCL INFO Channel 00/0 : 63[c7000] -> 0[7000] [receive] via NET/IB/0
+gpua005:3000819:3000888 [0] NCCL INFO Channel 01/0 : 63[c7000] -> 0[7000] [receive] via NET/IB/0
+gpua005:3000819:3000888 [0] NCCL INFO Channel 00/0 : 0[7000] -> 1[46000] via P2P/IPC/read
+gpua005:3000819:3000888 [0] NCCL INFO Channel 01/0 : 0[7000] -> 1[46000] via P2P/IPC/read
+gpua005:3000819:3000888 [0] NCCL INFO Connected all rings
+gpua005:3000819:3000888 [0] NCCL INFO Channel 01/0 : 0[7000] -> 4[7000] [send] via NET/IB/0
+gpua005:3000819:3000888 [0] NCCL INFO Channel 00/0 : 32[7000] -> 0[7000] [receive] via NET/IB/0
+gpua005:3000819:3000888 [0] NCCL INFO Channel 00/0 : 0[7000] -> 32[7000] [send] via NET/IB/0
+gpua005:3000819:3000888 [0] NCCL INFO Channel 01/0 : 4[7000] -> 0[7000] [receive] via NET/IB/0
+gpua005:3000819:3000888 [0] NCCL INFO Connected all trees
+gpua005:3000819:3000888 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua005:3000819:3000888 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua005:3000819:3000888 [0] NCCL INFO comm 0xec8b50a0 rank 0 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpua005:3000822:3000822 [3] NCCL INFO cudaDriverVersion 12020
+gpua005:3000822:3000822 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.5<0>
+gpua005:3000822:3000822 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua005:3000822:3000891 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.5<0>
+gpua005:3000822:3000891 [3] NCCL INFO Using network IB
+gpua005:3000822:3000891 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpua005:3000822:3000891 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
+gpua005:3000822:3000891 [3] NCCL INFO Channel 00/0 : 3[c7000] -> 4[7000] [send] via NET/IB/0
+gpua005:3000822:3000891 [3] NCCL INFO Channel 01/0 : 3[c7000] -> 4[7000] [send] via NET/IB/0
+gpua005:3000822:3000891 [3] NCCL INFO Connected all rings
+gpua005:3000822:3000891 [3] NCCL INFO Channel 00/0 : 3[c7000] -> 2[85000] via P2P/IPC/read
+gpua005:3000822:3000891 [3] NCCL INFO Channel 01/0 : 3[c7000] -> 2[85000] via P2P/IPC/read
+gpua005:3000822:3000891 [3] NCCL INFO Connected all trees
+gpua005:3000822:3000891 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua005:3000822:3000891 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua005:3000822:3000891 [3] NCCL INFO comm 0xfe4bf10 rank 3 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpua036:1186942:1186942 [3] NCCL INFO cudaDriverVersion 12020
+gpua036:1186942:1186942 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.36<0>
+gpua036:1186942:1186942 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua036:1186942:1187014 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.36<0>
+gpua036:1186942:1187014 [3] NCCL INFO Using network IB
+gpua036:1186942:1187014 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpua036:1186942:1187014 [3] NCCL INFO Trees [0] -1/-1/-1->31->30 [1] -1/-1/-1->31->30
+gpua036:1186942:1187014 [3] NCCL INFO Channel 00/0 : 31[c7000] -> 32[7000] [send] via NET/IB/0
+gpua036:1186942:1187014 [3] NCCL INFO Channel 01/0 : 31[c7000] -> 32[7000] [send] via NET/IB/0
+gpua036:1186942:1187014 [3] NCCL INFO Connected all rings
+gpua036:1186942:1187014 [3] NCCL INFO Channel 00/0 : 31[c7000] -> 30[85000] via P2P/IPC/read
+gpua036:1186942:1187014 [3] NCCL INFO Channel 01/0 : 31[c7000] -> 30[85000] via P2P/IPC/read
+gpua036:1186942:1187014 [3] NCCL INFO Connected all trees
+gpua036:1186942:1187014 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua036:1186942:1187014 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua036:1186942:1187014 [3] NCCL INFO comm 0x1915ef60 rank 31 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpua008:2393224:2393224 [2] NCCL INFO cudaDriverVersion 12020
+gpua008:2393224:2393224 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.8<0>
+gpua008:2393224:2393224 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua008:2393224:2393284 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.8<0>
+gpua008:2393224:2393284 [2] NCCL INFO Using network IB
+gpua008:2393224:2393284 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpua008:2393224:2393284 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9
+gpua008:2393224:2393284 [2] NCCL INFO Channel 00/0 : 10[85000] -> 11[c7000] via P2P/IPC/read
+gpua008:2393224:2393284 [2] NCCL INFO Channel 01/0 : 10[85000] -> 11[c7000] via P2P/IPC/read
+gpua008:2393224:2393284 [2] NCCL INFO Connected all rings
+gpua008:2393224:2393284 [2] NCCL INFO Channel 00/0 : 10[85000] -> 9[46000] via P2P/IPC/read
+gpua008:2393224:2393284 [2] NCCL INFO Channel 01/0 : 10[85000] -> 9[46000] via P2P/IPC/read
+gpua008:2393224:2393284 [2] NCCL INFO Connected all trees
+gpua008:2393224:2393284 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua008:2393224:2393284 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua008:2393224:2393284 [2] NCCL INFO comm 0xad73630 rank 10 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpua041:912684:912684 [1] NCCL INFO cudaDriverVersion 12020
+gpua041:912684:912684 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.41<0>
+gpua041:912684:912684 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua041:912684:912753 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.41<0>
+gpua041:912684:912753 [1] NCCL INFO Using network IB
+gpua041:912684:912753 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpua041:912684:912753 [1] NCCL INFO Trees [0] 38/-1/-1->37->36 [1] 38/40/-1->37->36
+gpua041:912684:912753 [1] NCCL INFO Channel 00/0 : 37[46000] -> 38[85000] via P2P/IPC/read
+gpua041:912684:912753 [1] NCCL INFO Channel 01/0 : 37[46000] -> 38[85000] via P2P/IPC/read
+gpua041:912684:912753 [1] NCCL INFO Connected all rings
+gpua041:912684:912753 [1] NCCL INFO Channel 01/0 : 37[46000] -> 40[7000] [send] via NET/IB/0
+gpua041:912684:912753 [1] NCCL INFO Channel 01/0 : 40[7000] -> 37[46000] [receive] via NET/IB/0
+gpua041:912684:912753 [1] NCCL INFO Channel 00/0 : 37[46000] -> 36[7000] via P2P/IPC/read
+gpua041:912684:912753 [1] NCCL INFO Channel 01/0 : 37[46000] -> 36[7000] via P2P/IPC/read
+gpua041:912684:912753 [1] NCCL INFO Connected all trees
+gpua041:912684:912753 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua041:912684:912753 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua041:912684:912753 [1] NCCL INFO comm 0x101741b0 rank 37 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpua098:480539:480539 [2] NCCL INFO cudaDriverVersion 12020
+gpua098:480539:480539 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.98<0>
+gpua098:480539:480539 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua098:480539:480603 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.98<0>
+gpua098:480539:480603 [2] NCCL INFO Using network IB
+gpua098:480539:480603 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpua098:480539:480603 [2] NCCL INFO Trees [0] 59/-1/-1->58->57 [1] 59/-1/-1->58->57
+gpua098:480539:480603 [2] NCCL INFO Channel 00/0 : 58[85000] -> 59[c7000] via P2P/IPC/read
+gpua098:480539:480603 [2] NCCL INFO Channel 01/0 : 58[85000] -> 59[c7000] via P2P/IPC/read
+gpua098:480539:480603 [2] NCCL INFO Connected all rings
+gpua098:480539:480603 [2] NCCL INFO Channel 00/0 : 58[85000] -> 57[46000] via P2P/IPC/read
+gpua098:480539:480603 [2] NCCL INFO Channel 01/0 : 58[85000] -> 57[46000] via P2P/IPC/read
+gpua098:480539:480603 [2] NCCL INFO Connected all trees
+gpua098:480539:480603 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua098:480539:480603 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua098:480539:480603 [2] NCCL INFO comm 0x93235d10 rank 58 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpua092:1040047:1040047 [1] NCCL INFO cudaDriverVersion 12020
+gpua092:1040047:1040047 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.92<0>
+gpua092:1040047:1040047 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua092:1040047:1040119 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.92<0>
+gpua092:1040047:1040119 [1] NCCL INFO Using network IB
+gpua092:1040047:1040119 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpua092:1040047:1040119 [1] NCCL INFO Trees [0] 50/40/-1->49->48 [1] 50/-1/-1->49->48
+gpua092:1040047:1040119 [1] NCCL INFO Channel 00/0 : 49[46000] -> 50[85000] via P2P/IPC/read
+gpua092:1040047:1040119 [1] NCCL INFO Channel 01/0 : 49[46000] -> 50[85000] via P2P/IPC/read
+gpua092:1040047:1040119 [1] NCCL INFO Connected all rings
+gpua092:1040047:1040119 [1] NCCL INFO Channel 00/0 : 40[7000] -> 49[46000] [receive] via NET/IB/0
+gpua092:1040047:1040119 [1] NCCL INFO Channel 00/0 : 49[46000] -> 40[7000] [send] via NET/IB/0
+gpua092:1040047:1040119 [1] NCCL INFO Channel 00/0 : 49[46000] -> 48[7000] via P2P/IPC/read
+gpua092:1040047:1040119 [1] NCCL INFO Channel 01/0 : 49[46000] -> 48[7000] via P2P/IPC/read
+gpua092:1040047:1040119 [1] NCCL INFO Connected all trees
+gpua092:1040047:1040119 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua092:1040047:1040119 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua092:1040047:1040119 [1] NCCL INFO comm 0xf9379e0 rank 49 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpua016:1316130:1316130 [3] NCCL INFO cudaDriverVersion 12020
+gpua016:1316130:1316130 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.16<0>
+gpua016:1316130:1316130 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua016:1316130:1316194 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.16<0>
+gpua016:1316130:1316194 [3] NCCL INFO Using network IB
+gpua016:1316130:1316194 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpua016:1316130:1316194 [3] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] -1/-1/-1->15->14
+gpua016:1316130:1316194 [3] NCCL INFO Channel 00/0 : 15[c7000] -> 16[7000] [send] via NET/IB/0
+gpua016:1316130:1316194 [3] NCCL INFO Channel 01/0 : 15[c7000] -> 16[7000] [send] via NET/IB/0
+gpua016:1316130:1316194 [3] NCCL INFO Connected all rings
+gpua016:1316130:1316194 [3] NCCL INFO Channel 00/0 : 15[c7000] -> 14[85000] via P2P/IPC/read
+gpua016:1316130:1316194 [3] NCCL INFO Channel 01/0 : 15[c7000] -> 14[85000] via P2P/IPC/read
+gpua016:1316130:1316194 [3] NCCL INFO Connected all trees
+gpua016:1316130:1316194 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua016:1316130:1316194 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua016:1316130:1316194 [3] NCCL INFO comm 0x10199290 rank 15 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpua016:1316127:1316127 [0] NCCL INFO cudaDriverVersion 12020
+gpua016:1316127:1316127 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.16<0>
+gpua016:1316127:1316127 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua016:1316127:1316195 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.16<0>
+gpua016:1316127:1316195 [0] NCCL INFO Using network IB
+gpua016:1316127:1316195 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpua016:1316127:1316195 [0] NCCL INFO Trees [0] 13/-1/-1->12->8 [1] 13/4/-1->12->28
+gpua016:1316127:1316195 [0] NCCL INFO Channel 00/0 : 11[c7000] -> 12[7000] [receive] via NET/IB/0
+gpua016:1316127:1316195 [0] NCCL INFO Channel 01/0 : 11[c7000] -> 12[7000] [receive] via NET/IB/0
+gpua016:1316127:1316195 [0] NCCL INFO Channel 00/0 : 12[7000] -> 13[46000] via P2P/IPC/read
+gpua016:1316127:1316195 [0] NCCL INFO Channel 01/0 : 12[7000] -> 13[46000] via P2P/IPC/read
+gpua016:1316127:1316195 [0] NCCL INFO Connected all rings
+gpua016:1316127:1316195 [0] NCCL INFO Channel 00/0 : 8[7000] -> 12[7000] [receive] via NET/IB/0
+gpua016:1316127:1316195 [0] NCCL INFO Channel 01/0 : 4[7000] -> 12[7000] [receive] via NET/IB/0
+gpua016:1316127:1316195 [0] NCCL INFO Channel 01/0 : 12[7000] -> 28[7000] [send] via NET/IB/0
+gpua016:1316127:1316195 [0] NCCL INFO Channel 01/0 : 28[7000] -> 12[7000] [receive] via NET/IB/0
+gpua016:1316127:1316195 [0] NCCL INFO Channel 01/0 : 12[7000] -> 4[7000] [send] via NET/IB/0
+gpua016:1316127:1316195 [0] NCCL INFO Channel 00/0 : 12[7000] -> 8[7000] [send] via NET/IB/0
+gpua016:1316127:1316195 [0] NCCL INFO Connected all trees
+gpua016:1316127:1316195 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua016:1316127:1316195 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua016:1316127:1316195 [0] NCCL INFO comm 0xf1c430a0 rank 12 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpua008:2393225:2393225 [3] NCCL INFO cudaDriverVersion 12020
+gpua008:2393225:2393225 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.8<0>
+gpua008:2393225:2393225 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua008:2393225:2393290 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.8<0>
+gpua008:2393225:2393290 [3] NCCL INFO Using network IB
+gpua008:2393225:2393290 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpua008:2393225:2393290 [3] NCCL INFO Trees [0] -1/-1/-1->11->10 [1] -1/-1/-1->11->10
+gpua008:2393225:2393290 [3] NCCL INFO Channel 00/0 : 11[c7000] -> 12[7000] [send] via NET/IB/0
+gpua008:2393225:2393290 [3] NCCL INFO Channel 01/0 : 11[c7000] -> 12[7000] [send] via NET/IB/0
+gpua008:2393225:2393290 [3] NCCL INFO Connected all rings
+gpua008:2393225:2393290 [3] NCCL INFO Channel 00/0 : 11[c7000] -> 10[85000] via P2P/IPC/read
+gpua008:2393225:2393290 [3] NCCL INFO Channel 01/0 : 11[c7000] -> 10[85000] via P2P/IPC/read
+gpua008:2393225:2393290 [3] NCCL INFO Connected all trees
+gpua008:2393225:2393290 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua008:2393225:2393290 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua008:2393225:2393290 [3] NCCL INFO comm 0x1b692a90 rank 11 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpua092:1040046:1040046 [0] NCCL INFO cudaDriverVersion 12020
+gpua092:1040046:1040046 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.92<0>
+gpua092:1040046:1040046 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua092:1040046:1040116 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.92<0>
+gpua092:1040046:1040116 [0] NCCL INFO Using network IB
+gpua092:1040046:1040116 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpua092:1040046:1040116 [0] NCCL INFO Trees [0] 49/56/-1->48->32 [1] 49/-1/-1->48->52
+gpua092:1040046:1040116 [0] NCCL INFO Channel 00/0 : 47[c7000] -> 48[7000] [receive] via NET/IB/0
+gpua092:1040046:1040116 [0] NCCL INFO Channel 01/0 : 47[c7000] -> 48[7000] [receive] via NET/IB/0
+gpua092:1040046:1040116 [0] NCCL INFO Channel 00/0 : 48[7000] -> 49[46000] via P2P/IPC/read
+gpua092:1040046:1040116 [0] NCCL INFO Channel 01/0 : 48[7000] -> 49[46000] via P2P/IPC/read
+gpua092:1040046:1040116 [0] NCCL INFO Connected all rings
+gpua032:1037052:1037052 [3] NCCL INFO cudaDriverVersion 12020
+gpua032:1037052:1037052 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.32<0>
+gpua032:1037052:1037052 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua032:1037052:1037111 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.32<0>
+gpua032:1037052:1037111 [3] NCCL INFO Using network IB
+gpua032:1037052:1037111 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpua032:1037052:1037111 [3] NCCL INFO Trees [0] -1/-1/-1->27->26 [1] -1/-1/-1->27->26
+gpua032:1037052:1037111 [3] NCCL INFO Channel 00/0 : 27[c7000] -> 28[7000] [send] via NET/IB/0
+gpua032:1037052:1037111 [3] NCCL INFO Channel 01/0 : 27[c7000] -> 28[7000] [send] via NET/IB/0
+gpua032:1037052:1037111 [3] NCCL INFO Connected all rings
+gpua032:1037052:1037111 [3] NCCL INFO Channel 00/0 : 27[c7000] -> 26[85000] via P2P/IPC/read
+gpua032:1037052:1037111 [3] NCCL INFO Channel 01/0 : 27[c7000] -> 26[85000] via P2P/IPC/read
+gpua092:1040046:1040116 [0] NCCL INFO Channel 01/0 : 48[7000] -> 52[7000] [send] via NET/IB/0
+gpua092:1040046:1040116 [0] NCCL INFO Channel 00/0 : 48[7000] -> 56[7000] [send] via NET/IB/0
+gpua092:1040046:1040116 [0] NCCL INFO Channel 00/0 : 32[7000] -> 48[7000] [receive] via NET/IB/0
+gpua092:1040046:1040116 [0] NCCL INFO Channel 00/0 : 48[7000] -> 32[7000] [send] via NET/IB/0
+gpua092:1040046:1040116 [0] NCCL INFO Channel 00/0 : 56[7000] -> 48[7000] [receive] via NET/IB/0
+gpua092:1040046:1040116 [0] NCCL INFO Channel 01/0 : 52[7000] -> 48[7000] [receive] via NET/IB/0
+gpua092:1040046:1040116 [0] NCCL INFO Connected all trees
+gpua092:1040046:1040116 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua092:1040046:1040116 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua092:1040046:1040116 [0] NCCL INFO comm 0xcd2ba40 rank 48 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpua032:1037052:1037111 [3] NCCL INFO Connected all trees
+gpua032:1037052:1037111 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua032:1037052:1037111 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua032:1037052:1037111 [3] NCCL INFO comm 0x1e40f0d0 rank 27 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpua036:1186941:1186941 [2] NCCL INFO cudaDriverVersion 12020
+gpua036:1186941:1186941 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.36<0>
+gpua036:1186941:1186941 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua036:1186941:1187016 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.36<0>
+gpua036:1186941:1187016 [2] NCCL INFO Using network IB
+gpua036:1186941:1187016 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpua036:1186941:1187016 [2] NCCL INFO Trees [0] 31/-1/-1->30->29 [1] 31/-1/-1->30->29
+gpua036:1186941:1187016 [2] NCCL INFO Channel 00/0 : 30[85000] -> 31[c7000] via P2P/IPC/read
+gpua036:1186941:1187016 [2] NCCL INFO Channel 01/0 : 30[85000] -> 31[c7000] via P2P/IPC/read
+gpua036:1186941:1187016 [2] NCCL INFO Connected all rings
+gpua036:1186941:1187016 [2] NCCL INFO Channel 00/0 : 30[85000] -> 29[46000] via P2P/IPC/read
+gpua036:1186941:1187016 [2] NCCL INFO Channel 01/0 : 30[85000] -> 29[46000] via P2P/IPC/read
+gpua036:1186941:1187016 [2] NCCL INFO Connected all trees
+gpua036:1186941:1187016 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua036:1186941:1187016 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua036:1186941:1187016 [2] NCCL INFO comm 0x1ddfcf40 rank 30 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpua032:1037050:1037050 [1] NCCL INFO cudaDriverVersion 12020
+gpua032:1037050:1037050 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.32<0>
+gpua032:1037050:1037050 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua032:1037050:1037114 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.32<0>
+gpua032:1037050:1037114 [1] NCCL INFO Using network IB
+gpua032:1037050:1037114 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpua032:1037050:1037114 [1] NCCL INFO Trees [0] 26/20/-1->25->24 [1] 26/-1/-1->25->24
+gpua032:1037050:1037114 [1] NCCL INFO Channel 00/0 : 25[46000] -> 26[85000] via P2P/IPC/read
+gpua032:1037050:1037114 [1] NCCL INFO Channel 01/0 : 25[46000] -> 26[85000] via P2P/IPC/read
+gpua032:1037050:1037114 [1] NCCL INFO Connected all rings
+gpua032:1037050:1037114 [1] NCCL INFO Channel 00/0 : 20[7000] -> 25[46000] [receive] via NET/IB/0
+gpua032:1037050:1037114 [1] NCCL INFO Channel 00/0 : 25[46000] -> 20[7000] [send] via NET/IB/0
+gpua032:1037050:1037114 [1] NCCL INFO Channel 00/0 : 25[46000] -> 24[7000] via P2P/IPC/read
+gpua032:1037050:1037114 [1] NCCL INFO Channel 01/0 : 25[46000] -> 24[7000] via P2P/IPC/read
+gpua032:1037050:1037114 [1] NCCL INFO Connected all trees
+gpua032:1037050:1037114 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua032:1037050:1037114 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua032:1037050:1037114 [1] NCCL INFO comm 0x9c5e4c50 rank 25 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpua098:480537:480537 [0] NCCL INFO cudaDriverVersion 12020
+gpua098:480537:480537 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.98<0>
+gpua098:480537:480537 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua098:480537:480601 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.98<0>
+gpua098:480537:480601 [0] NCCL INFO Using network IB
+gpua098:480537:480601 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpua098:480537:480601 [0] NCCL INFO Trees [0] 57/60/-1->56->48 [1] 57/-1/-1->56->53
+gpua098:480537:480601 [0] NCCL INFO Channel 00/0 : 55[c7000] -> 56[7000] [receive] via NET/IB/0
+gpua098:480537:480601 [0] NCCL INFO Channel 01/0 : 55[c7000] -> 56[7000] [receive] via NET/IB/0
+gpua098:480537:480601 [0] NCCL INFO Channel 00/0 : 56[7000] -> 57[46000] via P2P/IPC/read
+gpua098:480537:480601 [0] NCCL INFO Channel 01/0 : 56[7000] -> 57[46000] via P2P/IPC/read
+gpua098:480537:480601 [0] NCCL INFO Connected all rings
+gpua099:959113:959113 [2] NCCL INFO cudaDriverVersion 12020
+gpua099:959113:959113 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.99<0>
+gpua099:959113:959113 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua099:959113:959175 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.99<0>
+gpua099:959113:959175 [2] NCCL INFO Using network IB
+gpua099:959113:959175 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpua099:959113:959175 [2] NCCL INFO Trees [0] 63/-1/-1->62->61 [1] 63/-1/-1->62->61
+gpua099:959113:959175 [2] NCCL INFO Channel 00/0 : 62[85000] -> 63[c7000] via P2P/IPC/read
+gpua099:959113:959175 [2] NCCL INFO Channel 01/0 : 62[85000] -> 63[c7000] via P2P/IPC/read
+gpua099:959113:959175 [2] NCCL INFO Connected all rings
+gpua099:959113:959175 [2] NCCL INFO Channel 00/0 : 62[85000] -> 61[46000] via P2P/IPC/read
+gpua099:959113:959175 [2] NCCL INFO Channel 01/0 : 62[85000] -> 61[46000] via P2P/IPC/read
+gpua036:1186939:1186939 [0] NCCL INFO cudaDriverVersion 12020
+gpua036:1186939:1186939 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.36<0>
+gpua036:1186939:1186939 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua036:1186939:1187013 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.36<0>
+gpua036:1186939:1187013 [0] NCCL INFO Using network IB
+gpua036:1186939:1187013 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpua036:1186939:1187013 [0] NCCL INFO Trees [0] 29/-1/-1->28->24 [1] 29/12/-1->28->60
+gpua036:1186939:1187013 [0] NCCL INFO Channel 00/0 : 27[c7000] -> 28[7000] [receive] via NET/IB/0
+gpua036:1186939:1187013 [0] NCCL INFO Channel 01/0 : 27[c7000] -> 28[7000] [receive] via NET/IB/0
+gpua036:1186939:1187013 [0] NCCL INFO Channel 00/0 : 28[7000] -> 29[46000] via P2P/IPC/read
+gpua036:1186939:1187013 [0] NCCL INFO Channel 01/0 : 28[7000] -> 29[46000] via P2P/IPC/read
+gpua036:1186939:1187013 [0] NCCL INFO Connected all rings
+gpua098:480537:480601 [0] NCCL INFO Channel 01/0 : 53[46000] -> 56[7000] [receive] via NET/IB/0
+gpua098:480537:480601 [0] NCCL INFO Channel 00/0 : 56[7000] -> 60[7000] [send] via NET/IB/0
+gpua098:480537:480601 [0] NCCL INFO Channel 00/0 : 48[7000] -> 56[7000] [receive] via NET/IB/0
+gpua098:480537:480601 [0] NCCL INFO Channel 00/0 : 56[7000] -> 48[7000] [send] via NET/IB/0
+gpua098:480537:480601 [0] NCCL INFO Channel 00/0 : 60[7000] -> 56[7000] [receive] via NET/IB/0
+gpua098:480537:480601 [0] NCCL INFO Channel 01/0 : 56[7000] -> 53[46000] [send] via NET/IB/0
+gpua098:480537:480601 [0] NCCL INFO Connected all trees
+gpua098:480537:480601 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua098:480537:480601 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua098:480537:480601 [0] NCCL INFO comm 0x9bf03fe0 rank 56 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpua099:959113:959175 [2] NCCL INFO Connected all trees
+gpua099:959113:959175 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua099:959113:959175 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua099:959113:959175 [2] NCCL INFO comm 0x11bee360 rank 62 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpua036:1186939:1187013 [0] NCCL INFO Channel 00/0 : 24[7000] -> 28[7000] [receive] via NET/IB/0
+gpua036:1186939:1187013 [0] NCCL INFO Channel 01/0 : 12[7000] -> 28[7000] [receive] via NET/IB/0
+gpua036:1186939:1187013 [0] NCCL INFO Channel 01/0 : 60[7000] -> 28[7000] [receive] via NET/IB/0
+gpua036:1186939:1187013 [0] NCCL INFO Channel 01/0 : 28[7000] -> 60[7000] [send] via NET/IB/0
+gpua036:1186939:1187013 [0] NCCL INFO Channel 01/0 : 28[7000] -> 12[7000] [send] via NET/IB/0
+gpua036:1186939:1187013 [0] NCCL INFO Channel 00/0 : 28[7000] -> 24[7000] [send] via NET/IB/0
+gpua036:1186939:1187013 [0] NCCL INFO Connected all trees
+gpua036:1186939:1187013 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua036:1186939:1187013 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua036:1186939:1187013 [0] NCCL INFO comm 0xb92bc00 rank 28 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpua099:959111:959111 [0] NCCL INFO cudaDriverVersion 12020
+gpua099:959111:959111 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.99<0>
+gpua099:959111:959111 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua099:959111:959181 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.99<0>
+gpua099:959111:959181 [0] NCCL INFO Using network IB
+gpua099:959111:959181 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpua099:959111:959181 [0] NCCL INFO Trees [0] 61/-1/-1->60->56 [1] 61/28/-1->60->-1
+gpua099:959111:959181 [0] NCCL INFO Channel 00/0 : 59[c7000] -> 60[7000] [receive] via NET/IB/0
+gpua099:959111:959181 [0] NCCL INFO Channel 01/0 : 59[c7000] -> 60[7000] [receive] via NET/IB/0
+gpua099:959111:959181 [0] NCCL INFO Channel 00/0 : 60[7000] -> 61[46000] via P2P/IPC/read
+gpua099:959111:959181 [0] NCCL INFO Channel 01/0 : 60[7000] -> 61[46000] via P2P/IPC/read
+gpua099:959111:959181 [0] NCCL INFO Connected all rings
+gpua099:959111:959181 [0] NCCL INFO Channel 00/0 : 56[7000] -> 60[7000] [receive] via NET/IB/0
+gpua099:959111:959181 [0] NCCL INFO Channel 01/0 : 28[7000] -> 60[7000] [receive] via NET/IB/0
+gpua099:959111:959181 [0] NCCL INFO Channel 01/0 : 60[7000] -> 28[7000] [send] via NET/IB/0
+gpua099:959111:959181 [0] NCCL INFO Channel 00/0 : 60[7000] -> 56[7000] [send] via NET/IB/0
+gpua099:959111:959181 [0] NCCL INFO Connected all trees
+gpua099:959111:959181 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua099:959111:959181 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua099:959111:959181 [0] NCCL INFO comm 0xcf27c40 rank 60 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpua036:1186940:1186940 [1] NCCL INFO cudaDriverVersion 12020
+gpua036:1186940:1186940 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.36<0>
+gpua036:1186940:1186940 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua036:1186940:1187015 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.36<0>
+gpua036:1186940:1187015 [1] NCCL INFO Using network IB
+gpua036:1186940:1187015 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpua036:1186940:1187015 [1] NCCL INFO Trees [0] 30/-1/-1->29->28 [1] 30/44/-1->29->28
+gpua036:1186940:1187015 [1] NCCL INFO Channel 00/0 : 29[46000] -> 30[85000] via P2P/IPC/read
+gpua036:1186940:1187015 [1] NCCL INFO Channel 01/0 : 29[46000] -> 30[85000] via P2P/IPC/read
+gpua036:1186940:1187015 [1] NCCL INFO Connected all rings
+gpua036:1186940:1187015 [1] NCCL INFO Channel 01/0 : 29[46000] -> 44[7000] [send] via NET/IB/0
+gpua036:1186940:1187015 [1] NCCL INFO Channel 01/0 : 44[7000] -> 29[46000] [receive] via NET/IB/0
+gpua036:1186940:1187015 [1] NCCL INFO Channel 00/0 : 29[46000] -> 28[7000] via P2P/IPC/read
+gpua036:1186940:1187015 [1] NCCL INFO Channel 01/0 : 29[46000] -> 28[7000] via P2P/IPC/read
+gpua036:1186940:1187015 [1] NCCL INFO Connected all trees
+gpua036:1186940:1187015 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua036:1186940:1187015 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua036:1186940:1187015 [1] NCCL INFO comm 0xe706d820 rank 29 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpua041:912686:912686 [3] NCCL INFO cudaDriverVersion 12020
+gpua041:912686:912686 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.41<0>
+gpua041:912686:912686 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua041:912686:912754 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.41<0>
+gpua041:912686:912754 [3] NCCL INFO Using network IB
+gpua041:912686:912754 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpua041:912686:912754 [3] NCCL INFO Trees [0] -1/-1/-1->39->38 [1] -1/-1/-1->39->38
+gpua041:912686:912754 [3] NCCL INFO Channel 00/0 : 39[c7000] -> 40[7000] [send] via NET/IB/0
+gpua041:912686:912754 [3] NCCL INFO Channel 01/0 : 39[c7000] -> 40[7000] [send] via NET/IB/0
+gpua041:912686:912754 [3] NCCL INFO Connected all rings
+gpua041:912686:912754 [3] NCCL INFO Channel 00/0 : 39[c7000] -> 38[85000] via P2P/IPC/read
+gpua041:912686:912754 [3] NCCL INFO Channel 01/0 : 39[c7000] -> 38[85000] via P2P/IPC/read
+gpua041:912686:912754 [3] NCCL INFO Connected all trees
+gpua041:912686:912754 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua041:912686:912754 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua041:912686:912754 [3] NCCL INFO comm 0x15030540 rank 39 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpua041:912685:912685 [2] NCCL INFO cudaDriverVersion 12020
+gpua041:912685:912685 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.41<0>
+gpua041:912685:912685 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua041:912685:912751 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.41<0>
+gpua041:912685:912751 [2] NCCL INFO Using network IB
+gpua041:912685:912751 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpua041:912685:912751 [2] NCCL INFO Trees [0] 39/-1/-1->38->37 [1] 39/-1/-1->38->37
+gpua041:912685:912751 [2] NCCL INFO Channel 00/0 : 38[85000] -> 39[c7000] via P2P/IPC/read
+gpua041:912685:912751 [2] NCCL INFO Channel 01/0 : 38[85000] -> 39[c7000] via P2P/IPC/read
+gpua041:912685:912751 [2] NCCL INFO Connected all rings
+gpua041:912685:912751 [2] NCCL INFO Channel 00/0 : 38[85000] -> 37[46000] via P2P/IPC/read
+gpua041:912685:912751 [2] NCCL INFO Channel 01/0 : 38[85000] -> 37[46000] via P2P/IPC/read
+gpua092:1040048:1040048 [2] NCCL INFO cudaDriverVersion 12020
+gpua092:1040048:1040048 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.92<0>
+gpua092:1040048:1040048 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua092:1040048:1040114 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.92<0>
+gpua092:1040048:1040114 [2] NCCL INFO Using network IB
+gpua092:1040048:1040114 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpua092:1040048:1040114 [2] NCCL INFO Trees [0] 51/-1/-1->50->49 [1] 51/-1/-1->50->49
+gpua092:1040048:1040114 [2] NCCL INFO Channel 00/0 : 50[85000] -> 51[c7000] via P2P/IPC/read
+gpua092:1040048:1040114 [2] NCCL INFO Channel 01/0 : 50[85000] -> 51[c7000] via P2P/IPC/read
+gpua092:1040048:1040114 [2] NCCL INFO Connected all rings
+gpua092:1040048:1040114 [2] NCCL INFO Channel 00/0 : 50[85000] -> 49[46000] via P2P/IPC/read
+gpua092:1040048:1040114 [2] NCCL INFO Channel 01/0 : 50[85000] -> 49[46000] via P2P/IPC/read
+gpua005:3000820:3000820 [1] NCCL INFO cudaDriverVersion 12020
+gpua005:3000820:3000820 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.5<0>
+gpua005:3000820:3000820 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua005:3000820:3000889 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.5<0>
+gpua005:3000820:3000889 [1] NCCL INFO Using network IB
+gpua005:3000820:3000889 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpua005:3000820:3000889 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0
+gpua005:3000820:3000889 [1] NCCL INFO Channel 00/0 : 1[46000] -> 2[85000] via P2P/IPC/read
+gpua005:3000820:3000889 [1] NCCL INFO Channel 01/0 : 1[46000] -> 2[85000] via P2P/IPC/read
+gpua005:3000820:3000889 [1] NCCL INFO Connected all rings
+gpua005:3000820:3000889 [1] NCCL INFO Channel 00/0 : 1[46000] -> 0[7000] via P2P/IPC/read
+gpua005:3000820:3000889 [1] NCCL INFO Channel 01/0 : 1[46000] -> 0[7000] via P2P/IPC/read
+gpua041:912685:912751 [2] NCCL INFO Connected all trees
+gpua041:912685:912751 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua041:912685:912751 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua041:912685:912751 [2] NCCL INFO comm 0xe323e00 rank 38 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpua092:1040048:1040114 [2] NCCL INFO Connected all trees
+gpua092:1040048:1040114 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua092:1040048:1040114 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua092:1040048:1040114 [2] NCCL INFO comm 0x7fcefa7d9560 rank 50 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpua005:3000820:3000889 [1] NCCL INFO Connected all trees
+gpua005:3000820:3000889 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua005:3000820:3000889 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua005:3000820:3000889 [1] NCCL INFO comm 0xea2683a0 rank 1 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpua041:912683:912683 [0] NCCL INFO cudaDriverVersion 12020
+gpua041:912683:912683 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.41<0>
+gpua041:912683:912683 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua041:912683:912752 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.41<0>
+gpua041:912683:912752 [0] NCCL INFO Using network IB
+gpua041:912683:912752 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpua041:912683:912752 [0] NCCL INFO Trees [0] 37/-1/-1->36->41 [1] 37/32/-1->36->44
+gpua041:912683:912752 [0] NCCL INFO Channel 00/0 : 35[c7000] -> 36[7000] [receive] via NET/IB/0
+gpua041:912683:912752 [0] NCCL INFO Channel 01/0 : 35[c7000] -> 36[7000] [receive] via NET/IB/0
+gpua041:912683:912752 [0] NCCL INFO Channel 00/0 : 36[7000] -> 37[46000] via P2P/IPC/read
+gpua041:912683:912752 [0] NCCL INFO Channel 01/0 : 36[7000] -> 37[46000] via P2P/IPC/read
+gpua041:912683:912752 [0] NCCL INFO Connected all rings
+gpua041:912683:912752 [0] NCCL INFO Channel 01/0 : 32[7000] -> 36[7000] [receive] via NET/IB/0
+gpua041:912683:912752 [0] NCCL INFO Channel 00/0 : 36[7000] -> 41[46000] [send] via NET/IB/0
+gpua041:912683:912752 [0] NCCL INFO Channel 01/0 : 36[7000] -> 44[7000] [send] via NET/IB/0
+gpua041:912683:912752 [0] NCCL INFO Channel 01/0 : 44[7000] -> 36[7000] [receive] via NET/IB/0
+gpua041:912683:912752 [0] NCCL INFO Channel 00/0 : 41[46000] -> 36[7000] [receive] via NET/IB/0
+gpua041:912683:912752 [0] NCCL INFO Channel 01/0 : 36[7000] -> 32[7000] [send] via NET/IB/0
+gpua041:912683:912752 [0] NCCL INFO Connected all trees
+gpua041:912683:912752 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua041:912683:912752 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua041:912683:912752 [0] NCCL INFO comm 0x18ba7e70 rank 36 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpua005:3000821:3000821 [2] NCCL INFO cudaDriverVersion 12020
+gpua005:3000821:3000821 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.5<0>
+gpua005:3000821:3000821 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua005:3000821:3000890 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.5<0>
+gpua005:3000821:3000890 [2] NCCL INFO Using network IB
+gpua005:3000821:3000890 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpua005:3000821:3000890 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1
+gpua005:3000821:3000890 [2] NCCL INFO Channel 00/0 : 2[85000] -> 3[c7000] via P2P/IPC/read
+gpua005:3000821:3000890 [2] NCCL INFO Channel 01/0 : 2[85000] -> 3[c7000] via P2P/IPC/read
+gpua005:3000821:3000890 [2] NCCL INFO Connected all rings
+gpua005:3000821:3000890 [2] NCCL INFO Channel 00/0 : 2[85000] -> 1[46000] via P2P/IPC/read
+gpua005:3000821:3000890 [2] NCCL INFO Channel 01/0 : 2[85000] -> 1[46000] via P2P/IPC/read
+gpua005:3000821:3000890 [2] NCCL INFO Connected all trees
+gpua005:3000821:3000890 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua005:3000821:3000890 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua005:3000821:3000890 [2] NCCL INFO comm 0xb3e7930 rank 2 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpua016:1316128:1316128 [1] NCCL INFO cudaDriverVersion 12020
+gpua016:1316128:1316128 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.16<0>
+gpua016:1316128:1316128 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua016:1316128:1316193 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.16<0>
+gpua016:1316128:1316193 [1] NCCL INFO Using network IB
+gpua016:1316128:1316193 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpua016:1316128:1316193 [1] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/20/-1->13->12
+gpua016:1316128:1316193 [1] NCCL INFO Channel 00/0 : 13[46000] -> 14[85000] via P2P/IPC/read
+gpua016:1316128:1316193 [1] NCCL INFO Channel 01/0 : 13[46000] -> 14[85000] via P2P/IPC/read
+gpua016:1316128:1316193 [1] NCCL INFO Connected all rings
+gpua016:1316128:1316193 [1] NCCL INFO Channel 01/0 : 13[46000] -> 20[7000] [send] via NET/IB/0
+gpua016:1316128:1316193 [1] NCCL INFO Channel 01/0 : 20[7000] -> 13[46000] [receive] via NET/IB/0
+gpua016:1316128:1316193 [1] NCCL INFO Channel 00/0 : 13[46000] -> 12[7000] via P2P/IPC/read
+gpua016:1316128:1316193 [1] NCCL INFO Channel 01/0 : 13[46000] -> 12[7000] via P2P/IPC/read
+gpua016:1316128:1316193 [1] NCCL INFO Connected all trees
+gpua016:1316128:1316193 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua016:1316128:1316193 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua016:1316128:1316193 [1] NCCL INFO comm 0x9a131ad0 rank 13 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpua099:959112:959112 [1] NCCL INFO cudaDriverVersion 12020
+gpua099:959112:959112 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.99<0>
+gpua099:959112:959112 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua099:959112:959178 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.99<0>
+gpua099:959112:959178 [1] NCCL INFO Using network IB
+gpua099:959112:959178 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpua099:959112:959178 [1] NCCL INFO Trees [0] 62/-1/-1->61->60 [1] 62/-1/-1->61->60
+gpua099:959112:959178 [1] NCCL INFO Channel 00/0 : 61[46000] -> 62[85000] via P2P/IPC/read
+gpua099:959112:959178 [1] NCCL INFO Channel 01/0 : 61[46000] -> 62[85000] via P2P/IPC/read
+gpua099:959112:959178 [1] NCCL INFO Connected all rings
+gpua099:959112:959178 [1] NCCL INFO Channel 00/0 : 61[46000] -> 60[7000] via P2P/IPC/read
+gpua099:959112:959178 [1] NCCL INFO Channel 01/0 : 61[46000] -> 60[7000] via P2P/IPC/read
+gpua099:959112:959178 [1] NCCL INFO Connected all trees
+gpua099:959112:959178 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua099:959112:959178 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua099:959112:959178 [1] NCCL INFO comm 0xdc8fe60 rank 61 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpua098:480540:480540 [3] NCCL INFO cudaDriverVersion 12020
+gpua098:480540:480540 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.98<0>
+gpua098:480540:480540 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua098:480540:480597 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.98<0>
+gpua098:480540:480597 [3] NCCL INFO Using network IB
+gpua098:480540:480597 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpua098:480540:480597 [3] NCCL INFO Trees [0] -1/-1/-1->59->58 [1] -1/-1/-1->59->58
+gpua098:480540:480597 [3] NCCL INFO Channel 00/0 : 59[c7000] -> 60[7000] [send] via NET/IB/0
+gpua098:480540:480597 [3] NCCL INFO Channel 01/0 : 59[c7000] -> 60[7000] [send] via NET/IB/0
+gpua098:480540:480597 [3] NCCL INFO Connected all rings
+gpua098:480540:480597 [3] NCCL INFO Channel 00/0 : 59[c7000] -> 58[85000] via P2P/IPC/read
+gpua098:480540:480597 [3] NCCL INFO Channel 01/0 : 59[c7000] -> 58[85000] via P2P/IPC/read
+gpua098:480540:480597 [3] NCCL INFO Connected all trees
+gpua098:480540:480597 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua098:480540:480597 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua098:480540:480597 [3] NCCL INFO comm 0x7f6ec38fa560 rank 59 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpua008:2393223:2393223 [1] NCCL INFO cudaDriverVersion 12020
+gpua008:2393223:2393223 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.8<0>
+gpua008:2393223:2393223 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua008:2393223:2393286 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.8<0>
+gpua008:2393223:2393286 [1] NCCL INFO Using network IB
+gpua008:2393223:2393286 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpua008:2393223:2393286 [1] NCCL INFO Trees [0] 10/4/-1->9->8 [1] 10/-1/-1->9->8
+gpua008:2393223:2393286 [1] NCCL INFO Channel 00/0 : 9[46000] -> 10[85000] via P2P/IPC/read
+gpua008:2393223:2393286 [1] NCCL INFO Channel 01/0 : 9[46000] -> 10[85000] via P2P/IPC/read
+gpua008:2393223:2393286 [1] NCCL INFO Connected all rings
+gpua008:2393223:2393286 [1] NCCL INFO Channel 00/0 : 4[7000] -> 9[46000] [receive] via NET/IB/0
+gpua008:2393223:2393286 [1] NCCL INFO Channel 00/0 : 9[46000] -> 4[7000] [send] via NET/IB/0
+gpua008:2393223:2393286 [1] NCCL INFO Channel 00/0 : 9[46000] -> 8[7000] via P2P/IPC/read
+gpua008:2393223:2393286 [1] NCCL INFO Channel 01/0 : 9[46000] -> 8[7000] via P2P/IPC/read
+gpua008:2393223:2393286 [1] NCCL INFO Connected all trees
+gpua008:2393223:2393286 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua008:2393223:2393286 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua008:2393223:2393286 [1] NCCL INFO comm 0xe9ce07a0 rank 9 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpua098:480538:480538 [1] NCCL INFO cudaDriverVersion 12020
+gpua098:480538:480538 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.98<0>
+gpua098:480538:480538 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua098:480538:480599 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.98<0>
+gpua098:480538:480599 [1] NCCL INFO Using network IB
+gpua098:480538:480599 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpua098:480538:480599 [1] NCCL INFO Trees [0] 58/52/-1->57->56 [1] 58/-1/-1->57->56
+gpua098:480538:480599 [1] NCCL INFO Channel 00/0 : 57[46000] -> 58[85000] via P2P/IPC/read
+gpua098:480538:480599 [1] NCCL INFO Channel 01/0 : 57[46000] -> 58[85000] via P2P/IPC/read
+gpua098:480538:480599 [1] NCCL INFO Connected all rings
+gpua098:480538:480599 [1] NCCL INFO Channel 00/0 : 52[7000] -> 57[46000] [receive] via NET/IB/0
+gpua098:480538:480599 [1] NCCL INFO Channel 00/0 : 57[46000] -> 52[7000] [send] via NET/IB/0
+gpua098:480538:480599 [1] NCCL INFO Channel 00/0 : 57[46000] -> 56[7000] via P2P/IPC/read
+gpua098:480538:480599 [1] NCCL INFO Channel 01/0 : 57[46000] -> 56[7000] via P2P/IPC/read
+gpua098:480538:480599 [1] NCCL INFO Connected all trees
+gpua098:480538:480599 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua098:480538:480599 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua098:480538:480599 [1] NCCL INFO comm 0xd690810 rank 57 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpua099:959114:959114 [3] NCCL INFO cudaDriverVersion 12020
+gpua099:959114:959114 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.99<0>
+gpua099:959114:959114 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua099:959114:959177 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.99<0>
+gpua099:959114:959177 [3] NCCL INFO Using network IB
+gpua099:959114:959177 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpua099:959114:959177 [3] NCCL INFO Trees [0] -1/-1/-1->63->62 [1] -1/-1/-1->63->62
+gpua099:959114:959177 [3] NCCL INFO Channel 00/0 : 63[c7000] -> 0[7000] [send] via NET/IB/0
+gpua099:959114:959177 [3] NCCL INFO Channel 01/0 : 63[c7000] -> 0[7000] [send] via NET/IB/0
+gpua099:959114:959177 [3] NCCL INFO Connected all rings
+gpua099:959114:959177 [3] NCCL INFO Channel 00/0 : 63[c7000] -> 62[85000] via P2P/IPC/read
+gpua099:959114:959177 [3] NCCL INFO Channel 01/0 : 63[c7000] -> 62[85000] via P2P/IPC/read
+gpua099:959114:959177 [3] NCCL INFO Connected all trees
+gpua099:959114:959177 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua099:959114:959177 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua099:959114:959177 [3] NCCL INFO comm 0xc6ab920 rank 63 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpua008:2393222:2393222 [0] NCCL INFO cudaDriverVersion 12020
+gpua008:2393222:2393222 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.8<0>
+gpua008:2393222:2393222 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua008:2393222:2393287 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.8<0>
+gpua008:2393222:2393287 [0] NCCL INFO Using network IB
+gpua008:2393222:2393287 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpua008:2393222:2393287 [0] NCCL INFO Trees [0] 9/12/-1->8->17 [1] 9/-1/-1->8->5
+gpua008:2393222:2393287 [0] NCCL INFO Channel 00/0 : 7[c7000] -> 8[7000] [receive] via NET/IB/0
+gpua008:2393222:2393287 [0] NCCL INFO Channel 01/0 : 7[c7000] -> 8[7000] [receive] via NET/IB/0
+gpua008:2393222:2393287 [0] NCCL INFO Channel 00/0 : 8[7000] -> 9[46000] via P2P/IPC/read
+gpua008:2393222:2393287 [0] NCCL INFO Channel 01/0 : 8[7000] -> 9[46000] via P2P/IPC/read
+gpua008:2393222:2393287 [0] NCCL INFO Connected all rings
+gpua008:2393222:2393287 [0] NCCL INFO Channel 01/0 : 5[46000] -> 8[7000] [receive] via NET/IB/0
+gpua008:2393222:2393287 [0] NCCL INFO Channel 00/0 : 8[7000] -> 12[7000] [send] via NET/IB/0
+gpua008:2393222:2393287 [0] NCCL INFO Channel 00/0 : 8[7000] -> 17[46000] [send] via NET/IB/0
+gpua008:2393222:2393287 [0] NCCL INFO Channel 00/0 : 17[46000] -> 8[7000] [receive] via NET/IB/0
+gpua008:2393222:2393287 [0] NCCL INFO Channel 00/0 : 12[7000] -> 8[7000] [receive] via NET/IB/0
+gpua008:2393222:2393287 [0] NCCL INFO Channel 01/0 : 8[7000] -> 5[46000] [send] via NET/IB/0
+gpua008:2393222:2393287 [0] NCCL INFO Connected all trees
+gpua008:2393222:2393287 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua008:2393222:2393287 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua008:2393222:2393287 [0] NCCL INFO comm 0xe742950 rank 8 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpua097:3145435:3145435 [3] NCCL INFO cudaDriverVersion 12020
+gpua097:3145435:3145435 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.97<0>
+gpua097:3145435:3145435 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua097:3145435:3145501 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.97<0>
+gpua097:3145435:3145501 [3] NCCL INFO Using network IB
+gpua097:3145435:3145501 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpua097:3145435:3145501 [3] NCCL INFO Trees [0] -1/-1/-1->55->54 [1] -1/-1/-1->55->54
+gpua097:3145435:3145501 [3] NCCL INFO Channel 00/0 : 55[c7000] -> 56[7000] [send] via NET/IB/0
+gpua097:3145435:3145501 [3] NCCL INFO Channel 01/0 : 55[c7000] -> 56[7000] [send] via NET/IB/0
+gpua097:3145435:3145501 [3] NCCL INFO Connected all rings
+gpua097:3145435:3145501 [3] NCCL INFO Channel 00/0 : 55[c7000] -> 54[85000] via P2P/IPC/read
+gpua097:3145435:3145501 [3] NCCL INFO Channel 01/0 : 55[c7000] -> 54[85000] via P2P/IPC/read
+gpua097:3145435:3145501 [3] NCCL INFO Connected all trees
+gpua097:3145435:3145501 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua097:3145435:3145501 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua097:3145435:3145501 [3] NCCL INFO comm 0x39f5a890 rank 55 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpua097:3145434:3145434 [2] NCCL INFO cudaDriverVersion 12020
+gpua097:3145434:3145434 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.97<0>
+gpua097:3145434:3145434 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua097:3145434:3145499 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.97<0>
+gpua097:3145434:3145499 [2] NCCL INFO Using network IB
+gpua097:3145434:3145499 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpua097:3145434:3145499 [2] NCCL INFO Trees [0] 55/-1/-1->54->53 [1] 55/-1/-1->54->53
+gpua097:3145434:3145499 [2] NCCL INFO Channel 00/0 : 54[85000] -> 55[c7000] via P2P/IPC/read
+gpua097:3145434:3145499 [2] NCCL INFO Channel 01/0 : 54[85000] -> 55[c7000] via P2P/IPC/read
+gpua097:3145434:3145499 [2] NCCL INFO Connected all rings
+gpua097:3145434:3145499 [2] NCCL INFO Channel 00/0 : 54[85000] -> 53[46000] via P2P/IPC/read
+gpua097:3145434:3145499 [2] NCCL INFO Channel 01/0 : 54[85000] -> 53[46000] via P2P/IPC/read
+gpua097:3145434:3145499 [2] NCCL INFO Connected all trees
+gpua097:3145434:3145499 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua097:3145434:3145499 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua097:3145434:3145499 [2] NCCL INFO comm 0xd238f60 rank 54 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpua086:1391751:1391751 [1] NCCL INFO cudaDriverVersion 12020
+gpua086:1391751:1391751 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.86<0>
+gpua086:1391751:1391751 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua086:1391751:1391806 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.86<0>
+gpua086:1391751:1391806 [1] NCCL INFO Using network IB
+gpua086:1391751:1391806 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpua086:1391751:1391806 [1] NCCL INFO Trees [0] 46/-1/-1->45->44 [1] 46/52/-1->45->44
+gpua086:1391751:1391806 [1] NCCL INFO Channel 00/0 : 45[46000] -> 46[85000] via P2P/IPC/read
+gpua086:1391751:1391806 [1] NCCL INFO Channel 01/0 : 45[46000] -> 46[85000] via P2P/IPC/read
+gpua086:1391751:1391806 [1] NCCL INFO Connected all rings
+gpua086:1391751:1391806 [1] NCCL INFO Channel 01/0 : 45[46000] -> 52[7000] [send] via NET/IB/0
+gpua086:1391751:1391806 [1] NCCL INFO Channel 01/0 : 52[7000] -> 45[46000] [receive] via NET/IB/0
+gpua086:1391751:1391806 [1] NCCL INFO Channel 00/0 : 45[46000] -> 44[7000] via P2P/IPC/read
+gpua086:1391751:1391806 [1] NCCL INFO Channel 01/0 : 45[46000] -> 44[7000] via P2P/IPC/read
+gpua086:1391751:1391806 [1] NCCL INFO Connected all trees
+gpua086:1391751:1391806 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua086:1391751:1391806 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua086:1391751:1391806 [1] NCCL INFO comm 0xc157520 rank 45 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpua007:1077993:1077993 [1] NCCL INFO cudaDriverVersion 12020
+gpua007:1077993:1077993 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.7<0>
+gpua007:1077993:1077993 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua007:1077993:1078062 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.7<0>
+gpua007:1077993:1078062 [1] NCCL INFO Using network IB
+gpua007:1077993:1078062 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpua007:1077993:1078062 [1] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/8/-1->5->4
+gpua007:1077993:1078062 [1] NCCL INFO Channel 00/0 : 5[46000] -> 6[85000] via P2P/IPC/read
+gpua007:1077993:1078062 [1] NCCL INFO Channel 01/0 : 5[46000] -> 6[85000] via P2P/IPC/read
+gpua007:1077993:1078062 [1] NCCL INFO Connected all rings
+gpua007:1077993:1078062 [1] NCCL INFO Channel 01/0 : 5[46000] -> 8[7000] [send] via NET/IB/0
+gpua007:1077993:1078062 [1] NCCL INFO Channel 01/0 : 8[7000] -> 5[46000] [receive] via NET/IB/0
+gpua007:1077993:1078062 [1] NCCL INFO Channel 00/0 : 5[46000] -> 4[7000] via P2P/IPC/read
+gpua007:1077993:1078062 [1] NCCL INFO Channel 01/0 : 5[46000] -> 4[7000] via P2P/IPC/read
+gpua007:1077993:1078062 [1] NCCL INFO Connected all trees
+gpua007:1077993:1078062 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua007:1077993:1078062 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua007:1077993:1078062 [1] NCCL INFO comm 0x9d75a550 rank 5 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpua086:1391753:1391753 [3] NCCL INFO cudaDriverVersion 12020
+gpua086:1391753:1391753 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.86<0>
+gpua086:1391753:1391753 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua086:1391753:1391808 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.86<0>
+gpua086:1391753:1391808 [3] NCCL INFO Using network IB
+gpua086:1391753:1391808 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpua086:1391753:1391808 [3] NCCL INFO Trees [0] -1/-1/-1->47->46 [1] -1/-1/-1->47->46
+gpua086:1391753:1391808 [3] NCCL INFO Channel 00/0 : 47[c7000] -> 48[7000] [send] via NET/IB/0
+gpua086:1391753:1391808 [3] NCCL INFO Channel 01/0 : 47[c7000] -> 48[7000] [send] via NET/IB/0
+gpua086:1391753:1391808 [3] NCCL INFO Connected all rings
+gpua086:1391753:1391808 [3] NCCL INFO Channel 00/0 : 47[c7000] -> 46[85000] via P2P/IPC/read
+gpua086:1391753:1391808 [3] NCCL INFO Channel 01/0 : 47[c7000] -> 46[85000] via P2P/IPC/read
+gpua086:1391753:1391808 [3] NCCL INFO Connected all trees
+gpua086:1391753:1391808 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua086:1391753:1391808 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua086:1391753:1391808 [3] NCCL INFO comm 0xea4d0560 rank 47 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpua030:1461129:1461129 [3] NCCL INFO cudaDriverVersion 12020
+gpua030:1461129:1461129 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.30<0>
+gpua030:1461129:1461129 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua030:1461129:1461200 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.30<0>
+gpua030:1461129:1461200 [3] NCCL INFO Using network IB
+gpua030:1461129:1461200 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpua030:1461129:1461200 [3] NCCL INFO Trees [0] -1/-1/-1->23->22 [1] -1/-1/-1->23->22
+gpua030:1461129:1461200 [3] NCCL INFO Channel 00/0 : 23[c7000] -> 24[7000] [send] via NET/IB/0
+gpua030:1461129:1461200 [3] NCCL INFO Channel 01/0 : 23[c7000] -> 24[7000] [send] via NET/IB/0
+gpua030:1461129:1461200 [3] NCCL INFO Connected all rings
+gpua030:1461129:1461200 [3] NCCL INFO Channel 00/0 : 23[c7000] -> 22[85000] via P2P/IPC/read
+gpua030:1461129:1461200 [3] NCCL INFO Channel 01/0 : 23[c7000] -> 22[85000] via P2P/IPC/read
+gpua030:1461129:1461200 [3] NCCL INFO Connected all trees
+gpua030:1461129:1461200 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua030:1461129:1461200 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua030:1461129:1461200 [3] NCCL INFO comm 0xe91241e0 rank 23 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpua030:1461126:1461126 [0] NCCL INFO cudaDriverVersion 12020
+gpua030:1461126:1461126 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.30<0>
+gpua030:1461126:1461126 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua030:1461126:1461205 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.30<0>
+gpua030:1461126:1461205 [0] NCCL INFO Using network IB
+gpua030:1461126:1461205 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpua030:1461126:1461205 [0] NCCL INFO Trees [0] 21/-1/-1->20->25 [1] 21/16/-1->20->13
+gpua030:1461126:1461205 [0] NCCL INFO Channel 00/0 : 19[c7000] -> 20[7000] [receive] via NET/IB/0
+gpua030:1461126:1461205 [0] NCCL INFO Channel 01/0 : 19[c7000] -> 20[7000] [receive] via NET/IB/0
+gpua030:1461126:1461205 [0] NCCL INFO Channel 00/0 : 20[7000] -> 21[46000] via P2P/IPC/read
+gpua030:1461126:1461205 [0] NCCL INFO Channel 01/0 : 20[7000] -> 21[46000] via P2P/IPC/read
+gpua030:1461126:1461205 [0] NCCL INFO Connected all rings
+gpua030:1461126:1461205 [0] NCCL INFO Channel 01/0 : 16[7000] -> 20[7000] [receive] via NET/IB/0
+gpua030:1461126:1461205 [0] NCCL INFO Channel 00/0 : 20[7000] -> 25[46000] [send] via NET/IB/0
+gpua030:1461126:1461205 [0] NCCL INFO Channel 01/0 : 13[46000] -> 20[7000] [receive] via NET/IB/0
+gpua030:1461126:1461205 [0] NCCL INFO Channel 01/0 : 20[7000] -> 13[46000] [send] via NET/IB/0
+gpua030:1461126:1461205 [0] NCCL INFO Channel 00/0 : 25[46000] -> 20[7000] [receive] via NET/IB/0
+gpua030:1461126:1461205 [0] NCCL INFO Channel 01/0 : 20[7000] -> 16[7000] [send] via NET/IB/0
+gpua030:1461126:1461205 [0] NCCL INFO Connected all trees
+gpua030:1461126:1461205 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua030:1461126:1461205 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua030:1461126:1461205 [0] NCCL INFO comm 0xe01c96d0 rank 20 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpua092:1040049:1040049 [3] NCCL INFO cudaDriverVersion 12020
+gpua092:1040049:1040049 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.92<0>
+gpua092:1040049:1040049 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua092:1040049:1040113 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.92<0>
+gpua092:1040049:1040113 [3] NCCL INFO Using network IB
+gpua092:1040049:1040113 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpua092:1040049:1040113 [3] NCCL INFO Trees [0] -1/-1/-1->51->50 [1] -1/-1/-1->51->50
+gpua092:1040049:1040113 [3] NCCL INFO Channel 00/0 : 51[c7000] -> 52[7000] [send] via NET/IB/0
+gpua092:1040049:1040113 [3] NCCL INFO Channel 01/0 : 51[c7000] -> 52[7000] [send] via NET/IB/0
+gpua092:1040049:1040113 [3] NCCL INFO Connected all rings
+gpua092:1040049:1040113 [3] NCCL INFO Channel 00/0 : 51[c7000] -> 50[85000] via P2P/IPC/read
+gpua092:1040049:1040113 [3] NCCL INFO Channel 01/0 : 51[c7000] -> 50[85000] via P2P/IPC/read
+gpua092:1040049:1040113 [3] NCCL INFO Connected all trees
+gpua092:1040049:1040113 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua092:1040049:1040113 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua092:1040049:1040113 [3] NCCL INFO comm 0xc08e250 rank 51 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpua030:1461127:1461127 [1] NCCL INFO cudaDriverVersion 12020
+gpua030:1461127:1461127 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.30<0>
+gpua030:1461127:1461127 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua030:1461127:1461203 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.30<0>
+gpua030:1461127:1461203 [1] NCCL INFO Using network IB
+gpua030:1461127:1461203 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpua030:1461127:1461203 [1] NCCL INFO Trees [0] 22/-1/-1->21->20 [1] 22/24/-1->21->20
+gpua030:1461127:1461203 [1] NCCL INFO Channel 00/0 : 21[46000] -> 22[85000] via P2P/IPC/read
+gpua030:1461127:1461203 [1] NCCL INFO Channel 01/0 : 21[46000] -> 22[85000] via P2P/IPC/read
+gpua030:1461127:1461203 [1] NCCL INFO Connected all rings
+gpua030:1461127:1461203 [1] NCCL INFO Channel 01/0 : 21[46000] -> 24[7000] [send] via NET/IB/0
+gpua030:1461127:1461203 [1] NCCL INFO Channel 01/0 : 24[7000] -> 21[46000] [receive] via NET/IB/0
+gpua030:1461127:1461203 [1] NCCL INFO Channel 00/0 : 21[46000] -> 20[7000] via P2P/IPC/read
+gpua030:1461127:1461203 [1] NCCL INFO Channel 01/0 : 21[46000] -> 20[7000] via P2P/IPC/read
+gpua030:1461127:1461203 [1] NCCL INFO Connected all trees
+gpua030:1461127:1461203 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua030:1461127:1461203 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua030:1461127:1461203 [1] NCCL INFO comm 0x1ada7820 rank 21 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpua026:1592969:1592969 [0] NCCL INFO cudaDriverVersion 12020
+gpua026:1592969:1592969 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.26<0>
+gpua026:1592969:1592969 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua026:1592969:1593033 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.26<0>
+gpua026:1592969:1593033 [0] NCCL INFO Using network IB
+gpua026:1592969:1593033 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpua026:1592969:1593033 [0] NCCL INFO Trees [0] 17/24/-1->16->33 [1] 17/-1/-1->16->20
+gpua026:1592969:1593033 [0] NCCL INFO Channel 00/0 : 15[c7000] -> 16[7000] [receive] via NET/IB/0
+gpua026:1592969:1593033 [0] NCCL INFO Channel 01/0 : 15[c7000] -> 16[7000] [receive] via NET/IB/0
+gpua026:1592969:1593033 [0] NCCL INFO Channel 00/0 : 16[7000] -> 17[46000] via P2P/IPC/read
+gpua026:1592969:1593033 [0] NCCL INFO Channel 01/0 : 16[7000] -> 17[46000] via P2P/IPC/read
+gpua026:1592969:1593033 [0] NCCL INFO Connected all rings
+gpua026:1592969:1593033 [0] NCCL INFO Channel 01/0 : 16[7000] -> 20[7000] [send] via NET/IB/0
+gpua026:1592969:1593033 [0] NCCL INFO Channel 00/0 : 16[7000] -> 24[7000] [send] via NET/IB/0
+gpua026:1592969:1593033 [0] NCCL INFO Channel 00/0 : 16[7000] -> 33[46000] [send] via NET/IB/0
+gpua026:1592969:1593033 [0] NCCL INFO Channel 00/0 : 33[46000] -> 16[7000] [receive] via NET/IB/0
+gpua026:1592969:1593033 [0] NCCL INFO Channel 00/0 : 24[7000] -> 16[7000] [receive] via NET/IB/0
+gpua026:1592969:1593033 [0] NCCL INFO Channel 01/0 : 20[7000] -> 16[7000] [receive] via NET/IB/0
+gpua026:1592969:1593033 [0] NCCL INFO Connected all trees
+gpua026:1592969:1593033 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua026:1592969:1593033 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua026:1592969:1593033 [0] NCCL INFO comm 0xd86df90 rank 16 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpua016:1316129:1316129 [2] NCCL INFO cudaDriverVersion 12020
+gpua016:1316129:1316129 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.16<0>
+gpua016:1316129:1316129 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua016:1316129:1316192 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.16<0>
+gpua016:1316129:1316192 [2] NCCL INFO Using network IB
+gpua016:1316129:1316192 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpua016:1316129:1316192 [2] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13
+gpua016:1316129:1316192 [2] NCCL INFO Channel 00/0 : 14[85000] -> 15[c7000] via P2P/IPC/read
+gpua016:1316129:1316192 [2] NCCL INFO Channel 01/0 : 14[85000] -> 15[c7000] via P2P/IPC/read
+gpua016:1316129:1316192 [2] NCCL INFO Connected all rings
+gpua016:1316129:1316192 [2] NCCL INFO Channel 00/0 : 14[85000] -> 13[46000] via P2P/IPC/read
+gpua016:1316129:1316192 [2] NCCL INFO Channel 01/0 : 14[85000] -> 13[46000] via P2P/IPC/read
+gpua016:1316129:1316192 [2] NCCL INFO Connected all trees
+gpua016:1316129:1316192 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua016:1316129:1316192 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua016:1316129:1316192 [2] NCCL INFO comm 0xabaed60 rank 14 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpua030:1461128:1461128 [2] NCCL INFO cudaDriverVersion 12020
+gpua030:1461128:1461128 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.30<0>
+gpua030:1461128:1461128 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua030:1461128:1461199 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.30<0>
+gpua030:1461128:1461199 [2] NCCL INFO Using network IB
+gpua030:1461128:1461199 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpua030:1461128:1461199 [2] NCCL INFO Trees [0] 23/-1/-1->22->21 [1] 23/-1/-1->22->21
+gpua030:1461128:1461199 [2] NCCL INFO Channel 00/0 : 22[85000] -> 23[c7000] via P2P/IPC/read
+gpua030:1461128:1461199 [2] NCCL INFO Channel 01/0 : 22[85000] -> 23[c7000] via P2P/IPC/read
+gpua030:1461128:1461199 [2] NCCL INFO Connected all rings
+gpua030:1461128:1461199 [2] NCCL INFO Channel 00/0 : 22[85000] -> 21[46000] via P2P/IPC/read
+gpua030:1461128:1461199 [2] NCCL INFO Channel 01/0 : 22[85000] -> 21[46000] via P2P/IPC/read
+gpua030:1461128:1461199 [2] NCCL INFO Connected all trees
+gpua030:1461128:1461199 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua030:1461128:1461199 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua030:1461128:1461199 [2] NCCL INFO comm 0xf629850 rank 22 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpua082:1180102:1180102 [1] NCCL INFO cudaDriverVersion 12020
+gpua082:1180102:1180102 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.82<0>
+gpua082:1180102:1180102 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua082:1180102:1180163 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.82<0>
+gpua082:1180102:1180163 [1] NCCL INFO Using network IB
+gpua082:1180102:1180163 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpua082:1180102:1180163 [1] NCCL INFO Trees [0] 42/36/-1->41->40 [1] 42/-1/-1->41->40
+gpua082:1180102:1180163 [1] NCCL INFO Channel 00/0 : 41[46000] -> 42[85000] via P2P/IPC/read
+gpua082:1180102:1180163 [1] NCCL INFO Channel 01/0 : 41[46000] -> 42[85000] via P2P/IPC/read
+gpua082:1180102:1180163 [1] NCCL INFO Connected all rings
+gpua082:1180102:1180163 [1] NCCL INFO Channel 00/0 : 36[7000] -> 41[46000] [receive] via NET/IB/0
+gpua082:1180102:1180163 [1] NCCL INFO Channel 00/0 : 41[46000] -> 36[7000] [send] via NET/IB/0
+gpua082:1180102:1180163 [1] NCCL INFO Channel 00/0 : 41[46000] -> 40[7000] via P2P/IPC/read
+gpua082:1180102:1180163 [1] NCCL INFO Channel 01/0 : 41[46000] -> 40[7000] via P2P/IPC/read
+gpua082:1180102:1180163 [1] NCCL INFO Connected all trees
+gpua082:1180102:1180163 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua082:1180102:1180163 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua082:1180102:1180163 [1] NCCL INFO comm 0xd6383a0 rank 41 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpua026:1592970:1592970 [1] NCCL INFO cudaDriverVersion 12020
+gpua026:1592970:1592970 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.26<0>
+gpua026:1592970:1592970 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua026:1592970:1593036 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.26<0>
+gpua026:1592970:1593036 [1] NCCL INFO Using network IB
+gpua026:1592970:1593036 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpua026:1592970:1593036 [1] NCCL INFO Trees [0] 18/8/-1->17->16 [1] 18/-1/-1->17->16
+gpua026:1592970:1593036 [1] NCCL INFO Channel 00/0 : 17[46000] -> 18[85000] via P2P/IPC/read
+gpua026:1592970:1593036 [1] NCCL INFO Channel 01/0 : 17[46000] -> 18[85000] via P2P/IPC/read
+gpua026:1592970:1593036 [1] NCCL INFO Connected all rings
+gpua026:1592970:1593036 [1] NCCL INFO Channel 00/0 : 8[7000] -> 17[46000] [receive] via NET/IB/0
+gpua026:1592970:1593036 [1] NCCL INFO Channel 00/0 : 17[46000] -> 8[7000] [send] via NET/IB/0
+gpua026:1592970:1593036 [1] NCCL INFO Channel 00/0 : 17[46000] -> 16[7000] via P2P/IPC/read
+gpua026:1592970:1593036 [1] NCCL INFO Channel 01/0 : 17[46000] -> 16[7000] via P2P/IPC/read
+gpua026:1592970:1593036 [1] NCCL INFO Connected all trees
+gpua026:1592970:1593036 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua026:1592970:1593036 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua026:1592970:1593036 [1] NCCL INFO comm 0xd1bd9d0 rank 17 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpua007:1077995:1077995 [3] NCCL INFO cudaDriverVersion 12020
+gpua007:1077995:1077995 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.7<0>
+gpua007:1077995:1077995 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua007:1077995:1078060 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.7<0>
+gpua007:1077995:1078060 [3] NCCL INFO Using network IB
+gpua007:1077995:1078060 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpua007:1077995:1078060 [3] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6
+gpua007:1077995:1078060 [3] NCCL INFO Channel 00/0 : 7[c7000] -> 8[7000] [send] via NET/IB/0
+gpua007:1077995:1078060 [3] NCCL INFO Channel 01/0 : 7[c7000] -> 8[7000] [send] via NET/IB/0
+gpua007:1077995:1078060 [3] NCCL INFO Connected all rings
+gpua007:1077995:1078060 [3] NCCL INFO Channel 00/0 : 7[c7000] -> 6[85000] via P2P/IPC/read
+gpua007:1077995:1078060 [3] NCCL INFO Channel 01/0 : 7[c7000] -> 6[85000] via P2P/IPC/read
+gpua007:1077995:1078060 [3] NCCL INFO Connected all trees
+gpua007:1077995:1078060 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua007:1077995:1078060 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua007:1077995:1078060 [3] NCCL INFO comm 0xe5cc7a60 rank 7 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpua086:1391750:1391750 [0] NCCL INFO cudaDriverVersion 12020
+gpua086:1391750:1391750 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.86<0>
+gpua086:1391750:1391750 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua086:1391750:1391805 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.86<0>
+gpua086:1391750:1391805 [0] NCCL INFO Using network IB
+gpua086:1391750:1391805 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpua086:1391750:1391805 [0] NCCL INFO Trees [0] 45/-1/-1->44->40 [1] 45/36/-1->44->29
+gpua086:1391750:1391805 [0] NCCL INFO Channel 00/0 : 43[c7000] -> 44[7000] [receive] via NET/IB/0
+gpua086:1391750:1391805 [0] NCCL INFO Channel 01/0 : 43[c7000] -> 44[7000] [receive] via NET/IB/0
+gpua086:1391750:1391805 [0] NCCL INFO Channel 00/0 : 44[7000] -> 45[46000] via P2P/IPC/read
+gpua086:1391750:1391805 [0] NCCL INFO Channel 01/0 : 44[7000] -> 45[46000] via P2P/IPC/read
+gpua086:1391750:1391805 [0] NCCL INFO Connected all rings
+gpua086:1391750:1391805 [0] NCCL INFO Channel 00/0 : 40[7000] -> 44[7000] [receive] via NET/IB/0
+gpua086:1391750:1391805 [0] NCCL INFO Channel 01/0 : 36[7000] -> 44[7000] [receive] via NET/IB/0
+gpua086:1391750:1391805 [0] NCCL INFO Channel 01/0 : 29[46000] -> 44[7000] [receive] via NET/IB/0
+gpua086:1391750:1391805 [0] NCCL INFO Channel 01/0 : 44[7000] -> 29[46000] [send] via NET/IB/0
+gpua086:1391750:1391805 [0] NCCL INFO Channel 01/0 : 44[7000] -> 36[7000] [send] via NET/IB/0
+gpua086:1391750:1391805 [0] NCCL INFO Channel 00/0 : 44[7000] -> 40[7000] [send] via NET/IB/0
+gpua086:1391750:1391805 [0] NCCL INFO Connected all trees
+gpua086:1391750:1391805 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua086:1391750:1391805 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua086:1391750:1391805 [0] NCCL INFO comm 0x99615e60 rank 44 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpua039:1017707:1017707 [2] NCCL INFO cudaDriverVersion 12020
+gpua039:1017707:1017707 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.39<0>
+gpua039:1017707:1017707 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua039:1017707:1017762 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.39<0>
+gpua039:1017707:1017762 [2] NCCL INFO Using network IB
+gpua039:1017707:1017762 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpua039:1017707:1017762 [2] NCCL INFO Trees [0] 35/-1/-1->34->33 [1] 35/-1/-1->34->33
+gpua039:1017707:1017762 [2] NCCL INFO Channel 00/0 : 34[85000] -> 35[c7000] via P2P/IPC/read
+gpua039:1017707:1017762 [2] NCCL INFO Channel 01/0 : 34[85000] -> 35[c7000] via P2P/IPC/read
+gpua039:1017707:1017762 [2] NCCL INFO Connected all rings
+gpua039:1017707:1017762 [2] NCCL INFO Channel 00/0 : 34[85000] -> 33[46000] via P2P/IPC/read
+gpua039:1017707:1017762 [2] NCCL INFO Channel 01/0 : 34[85000] -> 33[46000] via P2P/IPC/read
+gpua039:1017707:1017762 [2] NCCL INFO Connected all trees
+gpua039:1017707:1017762 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua039:1017707:1017762 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua039:1017707:1017762 [2] NCCL INFO comm 0x1430f280 rank 34 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpua039:1017708:1017708 [3] NCCL INFO cudaDriverVersion 12020
+gpua039:1017708:1017708 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.39<0>
+gpua039:1017708:1017708 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua039:1017708:1017764 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.39<0>
+gpua039:1017708:1017764 [3] NCCL INFO Using network IB
+gpua039:1017708:1017764 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpua039:1017708:1017764 [3] NCCL INFO Trees [0] -1/-1/-1->35->34 [1] -1/-1/-1->35->34
+gpua039:1017708:1017764 [3] NCCL INFO Channel 00/0 : 35[c7000] -> 36[7000] [send] via NET/IB/0
+gpua039:1017708:1017764 [3] NCCL INFO Channel 01/0 : 35[c7000] -> 36[7000] [send] via NET/IB/0
+gpua039:1017708:1017764 [3] NCCL INFO Connected all rings
+gpua039:1017708:1017764 [3] NCCL INFO Channel 00/0 : 35[c7000] -> 34[85000] via P2P/IPC/read
+gpua039:1017708:1017764 [3] NCCL INFO Channel 01/0 : 35[c7000] -> 34[85000] via P2P/IPC/read
+gpua039:1017708:1017764 [3] NCCL INFO Connected all trees
+gpua039:1017708:1017764 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua039:1017708:1017764 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua039:1017708:1017764 [3] NCCL INFO comm 0xe85acd60 rank 35 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpua026:1592971:1592971 [2] NCCL INFO cudaDriverVersion 12020
+gpua026:1592971:1592971 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.26<0>
+gpua026:1592971:1592971 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua026:1592971:1593032 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.26<0>
+gpua026:1592971:1593032 [2] NCCL INFO Using network IB
+gpua026:1592971:1593032 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpua026:1592971:1593032 [2] NCCL INFO Trees [0] 19/-1/-1->18->17 [1] 19/-1/-1->18->17
+gpua026:1592971:1593032 [2] NCCL INFO Channel 00/0 : 18[85000] -> 19[c7000] via P2P/IPC/read
+gpua026:1592971:1593032 [2] NCCL INFO Channel 01/0 : 18[85000] -> 19[c7000] via P2P/IPC/read
+gpua026:1592971:1593032 [2] NCCL INFO Connected all rings
+gpua026:1592971:1593032 [2] NCCL INFO Channel 00/0 : 18[85000] -> 17[46000] via P2P/IPC/read
+gpua026:1592971:1593032 [2] NCCL INFO Channel 01/0 : 18[85000] -> 17[46000] via P2P/IPC/read
+gpua007:1077992:1077992 [0] NCCL INFO cudaDriverVersion 12020
+gpua007:1077992:1077992 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.7<0>
+gpua007:1077992:1077992 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua007:1077992:1078056 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.7<0>
+gpua007:1077992:1078056 [0] NCCL INFO Using network IB
+gpua007:1077992:1078056 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpua007:1077992:1078056 [0] NCCL INFO Trees [0] 5/-1/-1->4->9 [1] 5/0/-1->4->12
+gpua007:1077992:1078056 [0] NCCL INFO Channel 00/0 : 3[c7000] -> 4[7000] [receive] via NET/IB/0
+gpua007:1077992:1078056 [0] NCCL INFO Channel 01/0 : 3[c7000] -> 4[7000] [receive] via NET/IB/0
+gpua007:1077992:1078056 [0] NCCL INFO Channel 00/0 : 4[7000] -> 5[46000] via P2P/IPC/read
+gpua007:1077992:1078056 [0] NCCL INFO Channel 01/0 : 4[7000] -> 5[46000] via P2P/IPC/read
+gpua007:1077992:1078056 [0] NCCL INFO Connected all rings
+gpua026:1592971:1593032 [2] NCCL INFO Connected all trees
+gpua026:1592971:1593032 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua026:1592971:1593032 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua026:1592971:1593032 [2] NCCL INFO comm 0x4afd8f60 rank 18 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpua007:1077992:1078056 [0] NCCL INFO Channel 01/0 : 0[7000] -> 4[7000] [receive] via NET/IB/0
+gpua007:1077992:1078056 [0] NCCL INFO Channel 00/0 : 4[7000] -> 9[46000] [send] via NET/IB/0
+gpua007:1077992:1078056 [0] NCCL INFO Channel 01/0 : 4[7000] -> 12[7000] [send] via NET/IB/0
+gpua007:1077992:1078056 [0] NCCL INFO Channel 01/0 : 12[7000] -> 4[7000] [receive] via NET/IB/0
+gpua007:1077992:1078056 [0] NCCL INFO Channel 00/0 : 9[46000] -> 4[7000] [receive] via NET/IB/0
+gpua007:1077992:1078056 [0] NCCL INFO Channel 01/0 : 4[7000] -> 0[7000] [send] via NET/IB/0
+gpua007:1077992:1078056 [0] NCCL INFO Connected all trees
+gpua007:1077992:1078056 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua007:1077992:1078056 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua007:1077992:1078056 [0] NCCL INFO comm 0x27d85fb0 rank 4 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpua026:1592972:1592972 [3] NCCL INFO cudaDriverVersion 12020
+gpua026:1592972:1592972 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.26<0>
+gpua026:1592972:1592972 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua026:1592972:1593030 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.26<0>
+gpua026:1592972:1593030 [3] NCCL INFO Using network IB
+gpua026:1592972:1593030 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpua026:1592972:1593030 [3] NCCL INFO Trees [0] -1/-1/-1->19->18 [1] -1/-1/-1->19->18
+gpua026:1592972:1593030 [3] NCCL INFO Channel 00/0 : 19[c7000] -> 20[7000] [send] via NET/IB/0
+gpua026:1592972:1593030 [3] NCCL INFO Channel 01/0 : 19[c7000] -> 20[7000] [send] via NET/IB/0
+gpua026:1592972:1593030 [3] NCCL INFO Connected all rings
+gpua026:1592972:1593030 [3] NCCL INFO Channel 00/0 : 19[c7000] -> 18[85000] via P2P/IPC/read
+gpua026:1592972:1593030 [3] NCCL INFO Channel 01/0 : 19[c7000] -> 18[85000] via P2P/IPC/read
+gpua026:1592972:1593030 [3] NCCL INFO Connected all trees
+gpua026:1592972:1593030 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua026:1592972:1593030 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua026:1592972:1593030 [3] NCCL INFO comm 0xd4d7540 rank 19 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpua086:1391752:1391752 [2] NCCL INFO cudaDriverVersion 12020
+gpua086:1391752:1391752 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.86<0>
+gpua086:1391752:1391752 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua086:1391752:1391807 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.86<0>
+gpua086:1391752:1391807 [2] NCCL INFO Using network IB
+gpua086:1391752:1391807 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpua086:1391752:1391807 [2] NCCL INFO Trees [0] 47/-1/-1->46->45 [1] 47/-1/-1->46->45
+gpua086:1391752:1391807 [2] NCCL INFO Channel 00/0 : 46[85000] -> 47[c7000] via P2P/IPC/read
+gpua086:1391752:1391807 [2] NCCL INFO Channel 01/0 : 46[85000] -> 47[c7000] via P2P/IPC/read
+gpua086:1391752:1391807 [2] NCCL INFO Connected all rings
+gpua086:1391752:1391807 [2] NCCL INFO Channel 00/0 : 46[85000] -> 45[46000] via P2P/IPC/read
+gpua086:1391752:1391807 [2] NCCL INFO Channel 01/0 : 46[85000] -> 45[46000] via P2P/IPC/read
+gpua086:1391752:1391807 [2] NCCL INFO Connected all trees
+gpua086:1391752:1391807 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua086:1391752:1391807 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua086:1391752:1391807 [2] NCCL INFO comm 0x191a7a30 rank 46 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpua039:1017706:1017706 [1] NCCL INFO cudaDriverVersion 12020
+gpua039:1017706:1017706 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.39<0>
+gpua039:1017706:1017706 [1] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua039:1017706:1017765 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.39<0>
+gpua039:1017706:1017765 [1] NCCL INFO Using network IB
+gpua039:1017706:1017765 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000
+gpua039:1017706:1017765 [1] NCCL INFO Trees [0] 34/16/-1->33->32 [1] 34/-1/-1->33->32
+gpua039:1017706:1017765 [1] NCCL INFO Channel 00/0 : 33[46000] -> 34[85000] via P2P/IPC/read
+gpua039:1017706:1017765 [1] NCCL INFO Channel 01/0 : 33[46000] -> 34[85000] via P2P/IPC/read
+gpua039:1017706:1017765 [1] NCCL INFO Connected all rings
+gpua039:1017706:1017765 [1] NCCL INFO Channel 00/0 : 16[7000] -> 33[46000] [receive] via NET/IB/0
+gpua039:1017706:1017765 [1] NCCL INFO Channel 00/0 : 33[46000] -> 16[7000] [send] via NET/IB/0
+gpua039:1017706:1017765 [1] NCCL INFO Channel 00/0 : 33[46000] -> 32[7000] via P2P/IPC/read
+gpua039:1017706:1017765 [1] NCCL INFO Channel 01/0 : 33[46000] -> 32[7000] via P2P/IPC/read
+gpua039:1017706:1017765 [1] NCCL INFO Connected all trees
+gpua039:1017706:1017765 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua039:1017706:1017765 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua039:1017706:1017765 [1] NCCL INFO comm 0xac11f30 rank 33 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE
+gpua082:1180104:1180104 [3] NCCL INFO cudaDriverVersion 12020
+gpua082:1180104:1180104 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.82<0>
+gpua082:1180104:1180104 [3] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua082:1180104:1180164 [3] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.82<0>
+gpua082:1180104:1180164 [3] NCCL INFO Using network IB
+gpua082:1180104:1180164 [3] NCCL INFO Setting affinity for GPU 3 to ffff
+gpua082:1180104:1180164 [3] NCCL INFO Trees [0] -1/-1/-1->43->42 [1] -1/-1/-1->43->42
+gpua082:1180104:1180164 [3] NCCL INFO Channel 00/0 : 43[c7000] -> 44[7000] [send] via NET/IB/0
+gpua082:1180104:1180164 [3] NCCL INFO Channel 01/0 : 43[c7000] -> 44[7000] [send] via NET/IB/0
+gpua082:1180104:1180164 [3] NCCL INFO Connected all rings
+gpua082:1180104:1180164 [3] NCCL INFO Channel 00/0 : 43[c7000] -> 42[85000] via P2P/IPC/read
+gpua082:1180104:1180164 [3] NCCL INFO Channel 01/0 : 43[c7000] -> 42[85000] via P2P/IPC/read
+gpua082:1180104:1180164 [3] NCCL INFO Connected all trees
+gpua082:1180104:1180164 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua082:1180104:1180164 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua082:1180104:1180164 [3] NCCL INFO comm 0x1806bd00 rank 43 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE
+gpua082:1180101:1180101 [0] NCCL INFO cudaDriverVersion 12020
+gpua082:1180101:1180101 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.82<0>
+gpua082:1180101:1180101 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua082:1180101:1180165 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.82<0>
+gpua082:1180101:1180165 [0] NCCL INFO Using network IB
+gpua082:1180101:1180165 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpua082:1180101:1180165 [0] NCCL INFO Trees [0] 41/44/-1->40->49 [1] 41/-1/-1->40->37
+gpua082:1180101:1180165 [0] NCCL INFO Channel 00/0 : 39[c7000] -> 40[7000] [receive] via NET/IB/0
+gpua082:1180101:1180165 [0] NCCL INFO Channel 01/0 : 39[c7000] -> 40[7000] [receive] via NET/IB/0
+gpua082:1180101:1180165 [0] NCCL INFO Channel 00/0 : 40[7000] -> 41[46000] via P2P/IPC/read
+gpua082:1180101:1180165 [0] NCCL INFO Channel 01/0 : 40[7000] -> 41[46000] via P2P/IPC/read
+gpua082:1180101:1180165 [0] NCCL INFO Connected all rings
+gpua082:1180101:1180165 [0] NCCL INFO Channel 01/0 : 37[46000] -> 40[7000] [receive] via NET/IB/0
+gpua082:1180101:1180165 [0] NCCL INFO Channel 00/0 : 40[7000] -> 44[7000] [send] via NET/IB/0
+gpua082:1180101:1180165 [0] NCCL INFO Channel 00/0 : 40[7000] -> 49[46000] [send] via NET/IB/0
+gpua082:1180101:1180165 [0] NCCL INFO Channel 00/0 : 49[46000] -> 40[7000] [receive] via NET/IB/0
+gpua082:1180101:1180165 [0] NCCL INFO Channel 00/0 : 44[7000] -> 40[7000] [receive] via NET/IB/0
+gpua082:1180101:1180165 [0] NCCL INFO Channel 01/0 : 40[7000] -> 37[46000] [send] via NET/IB/0
+gpua082:1180101:1180165 [0] NCCL INFO Connected all trees
+gpua082:1180101:1180165 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua082:1180101:1180165 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua082:1180101:1180165 [0] NCCL INFO comm 0x9a1c2260 rank 40 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpua039:1017705:1017705 [0] NCCL INFO cudaDriverVersion 12020
+gpua039:1017705:1017705 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.39<0>
+gpua039:1017705:1017705 [0] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua039:1017705:1017763 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.39<0>
+gpua039:1017705:1017763 [0] NCCL INFO Using network IB
+gpua039:1017705:1017763 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000
+gpua039:1017705:1017763 [0] NCCL INFO Trees [0] 33/48/-1->32->0 [1] 33/-1/-1->32->36
+gpua039:1017705:1017763 [0] NCCL INFO Channel 00/0 : 31[c7000] -> 32[7000] [receive] via NET/IB/0
+gpua039:1017705:1017763 [0] NCCL INFO Channel 01/0 : 31[c7000] -> 32[7000] [receive] via NET/IB/0
+gpua039:1017705:1017763 [0] NCCL INFO Channel 00/0 : 32[7000] -> 33[46000] via P2P/IPC/read
+gpua039:1017705:1017763 [0] NCCL INFO Channel 01/0 : 32[7000] -> 33[46000] via P2P/IPC/read
+gpua039:1017705:1017763 [0] NCCL INFO Connected all rings
+gpua039:1017705:1017763 [0] NCCL INFO Channel 01/0 : 32[7000] -> 36[7000] [send] via NET/IB/0
+gpua039:1017705:1017763 [0] NCCL INFO Channel 00/0 : 32[7000] -> 48[7000] [send] via NET/IB/0
+gpua039:1017705:1017763 [0] NCCL INFO Channel 00/0 : 0[7000] -> 32[7000] [receive] via NET/IB/0
+gpua039:1017705:1017763 [0] NCCL INFO Channel 00/0 : 32[7000] -> 0[7000] [send] via NET/IB/0
+gpua039:1017705:1017763 [0] NCCL INFO Channel 00/0 : 48[7000] -> 32[7000] [receive] via NET/IB/0
+gpua039:1017705:1017763 [0] NCCL INFO Channel 01/0 : 36[7000] -> 32[7000] [receive] via NET/IB/0
+gpua039:1017705:1017763 [0] NCCL INFO Connected all trees
+gpua039:1017705:1017763 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua039:1017705:1017763 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua039:1017705:1017763 [0] NCCL INFO comm 0xeba6590 rank 32 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE
+gpua007:1077994:1077994 [2] NCCL INFO cudaDriverVersion 12020
+gpua007:1077994:1077994 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.7<0>
+gpua007:1077994:1077994 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua007:1077994:1078057 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.7<0>
+gpua007:1077994:1078057 [2] NCCL INFO Using network IB
+gpua007:1077994:1078057 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpua007:1077994:1078057 [2] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5
+gpua007:1077994:1078057 [2] NCCL INFO Channel 00/0 : 6[85000] -> 7[c7000] via P2P/IPC/read
+gpua007:1077994:1078057 [2] NCCL INFO Channel 01/0 : 6[85000] -> 7[c7000] via P2P/IPC/read
+gpua007:1077994:1078057 [2] NCCL INFO Connected all rings
+gpua007:1077994:1078057 [2] NCCL INFO Channel 00/0 : 6[85000] -> 5[46000] via P2P/IPC/read
+gpua007:1077994:1078057 [2] NCCL INFO Channel 01/0 : 6[85000] -> 5[46000] via P2P/IPC/read
+gpua007:1077994:1078057 [2] NCCL INFO Connected all trees
+gpua007:1077994:1078057 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua007:1077994:1078057 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua007:1077994:1078057 [2] NCCL INFO comm 0xb4d5ee0 rank 6 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+gpua082:1180103:1180103 [2] NCCL INFO cudaDriverVersion 12020
+gpua082:1180103:1180103 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.82<0>
+gpua082:1180103:1180103 [2] NCCL INFO NET/Plugin : No plugin found (libnccl-net.so), using internal implementation
+gpua082:1180103:1180162 [2] NCCL INFO NET/IB : Using [0]mlx5_0:1/RoCE [RO]; OOB eth1:172.28.23.82<0>
+gpua082:1180103:1180162 [2] NCCL INFO Using network IB
+gpua082:1180103:1180162 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000
+gpua082:1180103:1180162 [2] NCCL INFO Trees [0] 43/-1/-1->42->41 [1] 43/-1/-1->42->41
+gpua082:1180103:1180162 [2] NCCL INFO Channel 00/0 : 42[85000] -> 43[c7000] via P2P/IPC/read
+gpua082:1180103:1180162 [2] NCCL INFO Channel 01/0 : 42[85000] -> 43[c7000] via P2P/IPC/read
+gpua082:1180103:1180162 [2] NCCL INFO Connected all rings
+gpua082:1180103:1180162 [2] NCCL INFO Channel 00/0 : 42[85000] -> 41[46000] via P2P/IPC/read
+gpua082:1180103:1180162 [2] NCCL INFO Channel 01/0 : 42[85000] -> 41[46000] via P2P/IPC/read
+gpua082:1180103:1180162 [2] NCCL INFO Connected all trees
+gpua082:1180103:1180162 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512
+gpua082:1180103:1180162 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer
+gpua082:1180103:1180162 [2] NCCL INFO comm 0xdd044a40 rank 42 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE
+[gpua005:0/64] 2023-12-19 07:41:23,409 (distributed:1027) INFO: Reducer buckets have been rebuilt in this iteration.
+[gpua005:0/64] 2023-12-19 07:43:59,500 (trainer:737) INFO: 41epoch:train:1-100batch: iter_time=1.204, forward_time=0.221, loss_ctc=55.100, loss_att=42.845, acc=0.739, loss=46.522, backward_time=0.348, grad_norm=94.327, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.133, optim0_lr0=6.325e-05, train_time=5.981
+[gpua005:0/64] 2023-12-19 07:47:35,053 (trainer:737) INFO: 41epoch:train:101-200batch: iter_time=9.309e-05, forward_time=0.146, loss_ctc=65.088, loss_att=57.248, acc=0.723, loss=59.600, backward_time=0.479, grad_norm=98.315, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.325e-05, train_time=2.155
+[gpua005:0/64] 2023-12-19 07:50:44,189 (trainer:737) INFO: 41epoch:train:201-300batch: iter_time=8.978e-05, forward_time=0.194, loss_ctc=73.501, loss_att=58.783, acc=0.739, loss=63.198, backward_time=0.356, grad_norm=74.569, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.133, optim0_lr0=6.324e-05, train_time=1.892
+[gpua005:0/64] 2023-12-19 07:53:54,300 (trainer:737) INFO: 41epoch:train:301-400batch: iter_time=9.069e-05, forward_time=0.143, loss_ctc=75.424, loss_att=55.108, acc=0.746, loss=61.203, backward_time=0.375, grad_norm=72.005, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.324e-05, train_time=1.901
+[gpua005:0/64] 2023-12-19 07:56:40,893 (trainer:737) INFO: 41epoch:train:401-500batch: iter_time=8.743e-05, forward_time=0.142, loss_ctc=63.892, loss_att=53.441, acc=0.730, loss=56.576, backward_time=0.346, grad_norm=77.405, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.323e-05, train_time=1.666
+[gpua005:0/64] 2023-12-19 07:59:22,626 (trainer:737) INFO: 41epoch:train:501-600batch: iter_time=8.408e-05, forward_time=0.143, loss_ctc=58.122, loss_att=48.200, acc=0.724, loss=51.176, backward_time=0.340, grad_norm=66.126, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.322e-05, train_time=1.617
+[gpua005:0/64] 2023-12-19 08:02:42,944 (trainer:737) INFO: 41epoch:train:601-700batch: iter_time=9.171e-05, forward_time=0.143, loss_ctc=60.198, loss_att=50.087, acc=0.732, loss=53.120, backward_time=0.354, grad_norm=57.926, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.322e-05, train_time=2.003
+[gpua005:0/64] 2023-12-19 08:05:55,056 (trainer:737) INFO: 41epoch:train:701-800batch: iter_time=9.396e-05, forward_time=0.143, loss_ctc=70.155, loss_att=52.466, acc=0.733, loss=57.773, backward_time=0.378, grad_norm=77.194, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.321e-05, train_time=1.921
+[gpua005:0/64] 2023-12-19 08:08:56,852 (trainer:737) INFO: 41epoch:train:801-900batch: iter_time=9.545e-05, forward_time=0.144, loss_ctc=71.162, loss_att=54.327, acc=0.727, loss=59.378, backward_time=0.308, grad_norm=68.312, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.133, optim0_lr0=6.321e-05, train_time=1.818
+[gpua005:0/64] 2023-12-19 08:12:01,194 (trainer:737) INFO: 41epoch:train:901-1000batch: iter_time=1.004e-04, forward_time=0.148, loss_ctc=63.893, loss_att=49.375, acc=0.747, loss=53.731, backward_time=0.324, grad_norm=68.012, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.133, optim0_lr0=6.320e-05, train_time=1.843
+[gpua005:0/64] 2023-12-19 08:14:47,587 (trainer:737) INFO: 41epoch:train:1001-1100batch: iter_time=9.204e-05, forward_time=0.148, loss_ctc=56.876, loss_att=42.663, acc=0.761, loss=46.927, backward_time=0.327, grad_norm=57.369, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.320e-05, train_time=1.664
+[gpua005:0/64] 2023-12-19 08:15:45,541 (trainer:668) WARNING: The grad norm is nan. Skipping updating the model.
+[gpua005:0/64] 2023-12-19 08:17:42,751 (trainer:737) INFO: 41epoch:train:1101-1200batch: iter_time=8.498e-05, forward_time=0.144, loss_ctc=69.304, loss_att=52.364, acc=0.726, loss=57.446, backward_time=0.326, grad_norm=78.668, clip=100.000, loss_scale=2.725e+31, optim_step_time=0.132, optim0_lr0=6.319e-05, train_time=1.751
+[gpua005:0/64] 2023-12-19 08:19:31,941 (multiple_iter_factory:32) INFO: Building 1th iter-factory...
+[gpua005:0/64] 2023-12-19 08:19:50,124 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 08:19:53,507 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.5", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.5", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.5", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.5", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f1388fdf640>)
+[gpua005:0/64] 2023-12-19 08:19:53,507 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.5, 
+[gpua005:0/64] 2023-12-19 08:19:53,566 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 08:27:40,914 (trainer:737) INFO: 41epoch:train:1201-1300batch: iter_time=2.693, forward_time=0.223, loss_ctc=53.641, loss_att=42.368, acc=0.744, loss=45.750, backward_time=0.341, grad_norm=62.726, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.135, optim0_lr0=6.319e-05, train_time=5.981
+[gpua005:0/64] 2023-12-19 08:30:21,989 (trainer:737) INFO: 41epoch:train:1301-1400batch: iter_time=8.381e-05, forward_time=0.145, loss_ctc=57.597, loss_att=49.414, acc=0.730, loss=51.869, backward_time=0.302, grad_norm=67.107, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.318e-05, train_time=1.611
+[gpua005:0/64] 2023-12-19 08:32:59,887 (trainer:737) INFO: 41epoch:train:1401-1500batch: iter_time=8.186e-05, forward_time=0.146, loss_ctc=67.510, loss_att=56.394, acc=0.724, loss=59.729, backward_time=0.317, grad_norm=71.678, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.318e-05, train_time=1.579
+[gpua005:0/64] 2023-12-19 08:35:34,591 (trainer:737) INFO: 41epoch:train:1501-1600batch: iter_time=8.572e-05, forward_time=0.145, loss_ctc=72.170, loss_att=51.113, acc=0.751, loss=57.430, backward_time=0.340, grad_norm=68.267, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.317e-05, train_time=1.547
+[gpua005:0/64] 2023-12-19 08:38:14,819 (trainer:737) INFO: 41epoch:train:1601-1700batch: iter_time=8.660e-05, forward_time=0.145, loss_ctc=73.828, loss_att=60.163, acc=0.721, loss=64.262, backward_time=0.360, grad_norm=71.617, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.317e-05, train_time=1.602
+[gpua005:0/64] 2023-12-19 08:40:41,561 (trainer:737) INFO: 41epoch:train:1701-1800batch: iter_time=8.562e-05, forward_time=0.145, loss_ctc=64.245, loss_att=51.511, acc=0.725, loss=55.331, backward_time=0.317, grad_norm=67.463, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.316e-05, train_time=1.467
+[gpua005:0/64] 2023-12-19 08:43:35,151 (trainer:737) INFO: 41epoch:train:1801-1900batch: iter_time=8.584e-05, forward_time=0.147, loss_ctc=57.025, loss_att=49.961, acc=0.729, loss=52.080, backward_time=0.368, grad_norm=64.262, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.316e-05, train_time=1.736
+[gpua005:0/64] 2023-12-19 08:46:18,187 (trainer:737) INFO: 41epoch:train:1901-2000batch: iter_time=7.982e-05, forward_time=0.147, loss_ctc=62.316, loss_att=45.337, acc=0.731, loss=50.431, backward_time=0.312, grad_norm=70.445, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.315e-05, train_time=1.630
+[gpua005:0/64] 2023-12-19 08:48:50,786 (trainer:737) INFO: 41epoch:train:2001-2100batch: iter_time=8.173e-05, forward_time=0.238, loss_ctc=66.783, loss_att=50.277, acc=0.732, loss=55.229, backward_time=0.298, grad_norm=75.956, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.315e-05, train_time=1.526
+[gpua005:0/64] 2023-12-19 08:51:18,434 (trainer:737) INFO: 41epoch:train:2101-2200batch: iter_time=8.662e-05, forward_time=0.189, loss_ctc=72.082, loss_att=54.772, acc=0.727, loss=59.965, backward_time=0.295, grad_norm=71.436, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.133, optim0_lr0=6.314e-05, train_time=1.476
+[gpua005:0/64] 2023-12-19 08:53:46,204 (trainer:737) INFO: 41epoch:train:2201-2300batch: iter_time=8.823e-05, forward_time=0.146, loss_ctc=56.295, loss_att=47.373, acc=0.747, loss=50.049, backward_time=0.295, grad_norm=67.410, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.314e-05, train_time=1.478
+[gpua005:0/64] 2023-12-19 08:56:07,296 (trainer:737) INFO: 41epoch:train:2301-2400batch: iter_time=8.550e-05, forward_time=0.146, loss_ctc=66.330, loss_att=48.793, acc=0.727, loss=54.054, backward_time=0.282, grad_norm=81.286, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.313e-05, train_time=1.411
+[gpua005:0/64] 2023-12-19 08:58:17,945 (trainer:737) INFO: 41epoch:train:2401-2500batch: iter_time=8.492e-05, forward_time=0.146, loss_ctc=53.221, loss_att=41.264, acc=0.747, loss=44.851, backward_time=0.280, grad_norm=63.762, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.312e-05, train_time=1.306
+[gpua005:0/64] 2023-12-19 08:58:37,973 (multiple_iter_factory:32) INFO: Building 2th iter-factory...
+[gpua005:0/64] 2023-12-19 08:58:56,550 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 08:59:00,107 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.11", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.11", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.11", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.11", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f12cc343b20>)
+[gpua005:0/64] 2023-12-19 08:59:00,107 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.11, 
+[gpua005:0/64] 2023-12-19 08:59:00,110 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 09:14:14,870 (trainer:737) INFO: 41epoch:train:2501-2600batch: iter_time=2.718, forward_time=0.181, loss_ctc=53.607, loss_att=42.170, acc=0.738, loss=45.601, backward_time=0.292, grad_norm=77.053, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.133, optim0_lr0=6.312e-05, train_time=9.569
+[gpua005:0/64] 2023-12-19 09:17:19,560 (trainer:737) INFO: 41epoch:train:2601-2700batch: iter_time=8.235e-05, forward_time=0.145, loss_ctc=64.664, loss_att=56.559, acc=0.719, loss=58.990, backward_time=0.346, grad_norm=72.153, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.311e-05, train_time=1.847
+[gpua005:0/64] 2023-12-19 09:20:15,034 (trainer:737) INFO: 41epoch:train:2701-2800batch: iter_time=8.415e-05, forward_time=0.146, loss_ctc=72.968, loss_att=56.246, acc=0.741, loss=61.263, backward_time=0.384, grad_norm=68.266, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.311e-05, train_time=1.755
+[gpua005:0/64] 2023-12-19 09:23:14,422 (trainer:737) INFO: 41epoch:train:2801-2900batch: iter_time=7.883e-05, forward_time=0.145, loss_ctc=74.263, loss_att=54.230, acc=0.741, loss=60.240, backward_time=0.314, grad_norm=72.915, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.310e-05, train_time=1.794
+[gpua005:0/64] 2023-12-19 09:26:15,287 (trainer:737) INFO: 41epoch:train:2901-3000batch: iter_time=8.588e-05, forward_time=0.146, loss_ctc=63.412, loss_att=51.536, acc=0.732, loss=55.099, backward_time=0.343, grad_norm=70.457, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.310e-05, train_time=1.808
+[gpua005:0/64] 2023-12-19 09:28:54,096 (trainer:737) INFO: 41epoch:train:3001-3100batch: iter_time=8.478e-05, forward_time=0.156, loss_ctc=57.227, loss_att=46.843, acc=0.721, loss=49.958, backward_time=0.307, grad_norm=71.884, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.133, optim0_lr0=6.309e-05, train_time=1.588
+[gpua005:0/64] 2023-12-19 09:31:32,292 (trainer:737) INFO: 41epoch:train:3101-3200batch: iter_time=8.015e-05, forward_time=0.181, loss_ctc=59.774, loss_att=49.081, acc=0.732, loss=52.289, backward_time=0.306, grad_norm=65.986, clip=100.000, loss_scale=3.347e+31, optim_step_time=0.132, optim0_lr0=6.309e-05, train_time=1.582
+[gpua005:0/64] 2023-12-19 09:34:14,253 (trainer:737) INFO: 41epoch:train:3201-3300batch: iter_time=8.396e-05, forward_time=0.182, loss_ctc=68.134, loss_att=51.476, acc=0.725, loss=56.474, backward_time=0.329, grad_norm=103.821, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.135, optim0_lr0=6.308e-05, train_time=1.619
+[gpua005:0/64] 2023-12-19 09:36:42,943 (trainer:737) INFO: 41epoch:train:3301-3400batch: iter_time=7.865e-05, forward_time=0.146, loss_ctc=69.935, loss_att=53.243, acc=0.724, loss=58.251, backward_time=0.302, grad_norm=70.259, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.308e-05, train_time=1.487
+[gpua005:0/64] 2023-12-19 09:39:10,682 (trainer:737) INFO: 41epoch:train:3401-3500batch: iter_time=7.748e-05, forward_time=0.146, loss_ctc=63.566, loss_att=49.274, acc=0.739, loss=53.561, backward_time=0.300, grad_norm=73.946, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.307e-05, train_time=1.477
+[gpua005:0/64] 2023-12-19 09:41:52,569 (trainer:737) INFO: 41epoch:train:3501-3600batch: iter_time=8.281e-05, forward_time=0.146, loss_ctc=56.550, loss_att=42.518, acc=0.756, loss=46.727, backward_time=0.301, grad_norm=58.228, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.307e-05, train_time=1.619
+[gpua005:0/64] 2023-12-19 09:44:20,433 (trainer:737) INFO: 41epoch:train:3601-3700batch: iter_time=8.177e-05, forward_time=0.145, loss_ctc=65.648, loss_att=51.761, acc=0.719, loss=55.927, backward_time=0.293, grad_norm=92.792, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.306e-05, train_time=1.478
+[gpua005:0/64] 2023-12-19 09:45:54,105 (multiple_iter_factory:32) INFO: Building 3th iter-factory...
+[gpua005:0/64] 2023-12-19 09:46:12,276 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 09:46:15,691 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.1", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.1", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.1", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.1", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f12ccff8cd0>)
+[gpua005:0/64] 2023-12-19 09:46:15,692 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.1, 
+[gpua005:0/64] 2023-12-19 09:46:15,766 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 09:51:56,426 (trainer:737) INFO: 41epoch:train:3701-3800batch: iter_time=2.254, forward_time=0.201, loss_ctc=52.426, loss_att=42.095, acc=0.745, loss=45.195, backward_time=0.291, grad_norm=68.568, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.134, optim0_lr0=6.306e-05, train_time=4.560
+[gpua005:0/64] 2023-12-19 09:53:56,730 (trainer:737) INFO: 41epoch:train:3801-3900batch: iter_time=7.674e-05, forward_time=0.145, loss_ctc=57.324, loss_att=47.854, acc=0.736, loss=50.695, backward_time=0.277, grad_norm=69.337, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.305e-05, train_time=1.203
+[gpua005:0/64] 2023-12-19 09:56:03,264 (trainer:737) INFO: 41epoch:train:3901-4000batch: iter_time=8.383e-05, forward_time=0.145, loss_ctc=67.113, loss_att=55.495, acc=0.727, loss=58.981, backward_time=0.280, grad_norm=75.479, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.305e-05, train_time=1.265
+[gpua005:0/64] 2023-12-19 09:58:48,214 (trainer:737) INFO: 41epoch:train:4001-4100batch: iter_time=8.483e-05, forward_time=0.145, loss_ctc=71.100, loss_att=49.833, acc=0.756, loss=56.213, backward_time=0.286, grad_norm=66.334, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.304e-05, train_time=1.649
+[gpua005:0/64] 2023-12-19 10:01:34,261 (trainer:737) INFO: 41epoch:train:4101-4200batch: iter_time=8.412e-05, forward_time=0.145, loss_ctc=73.392, loss_att=59.372, acc=0.725, loss=63.578, backward_time=0.293, grad_norm=78.160, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.304e-05, train_time=1.660
+[gpua005:0/64] 2023-12-19 10:03:43,370 (trainer:737) INFO: 41epoch:train:4201-4300batch: iter_time=8.539e-05, forward_time=0.189, loss_ctc=64.164, loss_att=50.874, acc=0.728, loss=54.861, backward_time=0.309, grad_norm=81.778, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.138, optim0_lr0=6.303e-05, train_time=1.291
+[gpua005:0/64] 2023-12-19 10:06:08,902 (trainer:737) INFO: 41epoch:train:4301-4400batch: iter_time=8.289e-05, forward_time=0.182, loss_ctc=56.845, loss_att=49.396, acc=0.732, loss=51.630, backward_time=0.309, grad_norm=58.320, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.135, optim0_lr0=6.303e-05, train_time=1.455
+[gpua005:0/64] 2023-12-19 10:08:12,580 (trainer:737) INFO: 41epoch:train:4401-4500batch: iter_time=8.583e-05, forward_time=0.145, loss_ctc=61.408, loss_att=44.612, acc=0.733, loss=49.651, backward_time=0.283, grad_norm=71.209, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.302e-05, train_time=1.237
+[gpua005:0/64] 2023-12-19 10:11:20,866 (trainer:737) INFO: 41epoch:train:4501-4600batch: iter_time=8.357e-05, forward_time=0.144, loss_ctc=66.772, loss_att=49.896, acc=0.735, loss=54.959, backward_time=0.355, grad_norm=71.320, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.131, optim0_lr0=6.302e-05, train_time=1.883
+[gpua005:0/64] 2023-12-19 10:14:16,285 (trainer:737) INFO: 41epoch:train:4601-4700batch: iter_time=8.832e-05, forward_time=0.145, loss_ctc=70.842, loss_att=53.694, acc=0.731, loss=58.839, backward_time=0.307, grad_norm=75.111, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.301e-05, train_time=1.754
+[gpua005:0/64] 2023-12-19 10:17:06,376 (trainer:737) INFO: 41epoch:train:4701-4800batch: iter_time=8.662e-05, forward_time=0.145, loss_ctc=56.266, loss_att=46.901, acc=0.749, loss=49.710, backward_time=0.287, grad_norm=71.465, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.300e-05, train_time=1.701
+[gpua005:0/64] 2023-12-19 10:19:16,313 (trainer:737) INFO: 41epoch:train:4801-4900batch: iter_time=1.011e-04, forward_time=0.145, loss_ctc=64.957, loss_att=48.404, acc=0.730, loss=53.370, backward_time=0.279, grad_norm=78.988, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.300e-05, train_time=1.299
+[gpua005:0/64] 2023-12-19 10:21:50,429 (trainer:737) INFO: 41epoch:train:4901-5000batch: iter_time=9.809e-05, forward_time=0.164, loss_ctc=53.077, loss_att=41.009, acc=0.751, loss=44.630, backward_time=0.303, grad_norm=67.147, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.299e-05, train_time=1.541
+[gpua005:0/64] 2023-12-19 10:22:10,458 (multiple_iter_factory:32) INFO: Building 4th iter-factory...
+[gpua005:0/64] 2023-12-19 10:22:28,649 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 10:22:32,022 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.0", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.0", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.0", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.0", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f0ac09de380>)
+[gpua005:0/64] 2023-12-19 10:22:32,022 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.0, 
+[gpua005:0/64] 2023-12-19 10:22:32,025 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 10:31:47,805 (trainer:737) INFO: 41epoch:train:5001-5100batch: iter_time=2.748, forward_time=0.188, loss_ctc=52.917, loss_att=42.306, acc=0.748, loss=45.489, backward_time=0.311, grad_norm=61.879, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.134, optim0_lr0=6.299e-05, train_time=5.974
+[gpua005:0/64] 2023-12-19 10:34:05,524 (trainer:737) INFO: 41epoch:train:5101-5200batch: iter_time=8.652e-05, forward_time=0.196, loss_ctc=64.123, loss_att=57.449, acc=0.733, loss=59.451, backward_time=0.289, grad_norm=67.630, clip=100.000, loss_scale=6.693e+31, optim_step_time=0.134, optim0_lr0=6.298e-05, train_time=1.376
+[gpua005:0/64] 2023-12-19 10:37:17,511 (trainer:737) INFO: 41epoch:train:5201-5300batch: iter_time=8.939e-05, forward_time=0.147, loss_ctc=72.253, loss_att=58.251, acc=0.747, loss=62.452, backward_time=0.383, grad_norm=67.092, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.133, optim0_lr0=6.298e-05, train_time=1.921
+[gpua005:0/64] 2023-12-19 10:39:44,009 (trainer:737) INFO: 41epoch:train:5301-5400batch: iter_time=8.708e-05, forward_time=0.147, loss_ctc=73.609, loss_att=53.958, acc=0.755, loss=59.853, backward_time=0.296, grad_norm=67.130, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.132, optim0_lr0=6.297e-05, train_time=1.465
+[gpua005:0/64] 2023-12-19 10:41:33,622 (trainer:668) WARNING: The grad norm is nan. Skipping updating the model.
+[gpua005:0/64] 2023-12-19 10:42:01,958 (trainer:737) INFO: 41epoch:train:5401-5500batch: iter_time=8.416e-05, forward_time=0.147, loss_ctc=63.118, loss_att=52.780, acc=0.741, loss=55.881, backward_time=0.289, grad_norm=66.031, clip=100.000, loss_scale=7.171e+31, optim_step_time=0.132, optim0_lr0=6.297e-05, train_time=1.379
+[gpua005:0/64] 2023-12-19 10:44:40,614 (trainer:737) INFO: 41epoch:train:5501-5600batch: iter_time=8.185e-05, forward_time=0.146, loss_ctc=56.811, loss_att=47.285, acc=0.735, loss=50.143, backward_time=0.317, grad_norm=64.172, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.296e-05, train_time=1.586
+[gpua005:0/64] 2023-12-19 10:47:34,932 (trainer:737) INFO: 41epoch:train:5601-5700batch: iter_time=8.558e-05, forward_time=0.146, loss_ctc=59.268, loss_att=49.541, acc=0.737, loss=52.459, backward_time=0.317, grad_norm=62.037, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.296e-05, train_time=1.743
+[gpua005:0/64] 2023-12-19 10:49:56,810 (trainer:737) INFO: 41epoch:train:5701-5800batch: iter_time=9.038e-05, forward_time=0.147, loss_ctc=67.576, loss_att=52.309, acc=0.735, loss=56.889, backward_time=0.295, grad_norm=83.113, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.133, optim0_lr0=6.295e-05, train_time=1.419
+[gpua005:0/64] 2023-12-19 10:52:43,462 (trainer:737) INFO: 41epoch:train:5801-5900batch: iter_time=8.339e-05, forward_time=0.151, loss_ctc=68.976, loss_att=53.033, acc=0.735, loss=57.816, backward_time=0.315, grad_norm=78.956, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.134, optim0_lr0=6.295e-05, train_time=1.666
+[gpua005:0/64] 2023-12-19 10:54:59,153 (trainer:737) INFO: 41epoch:train:5901-6000batch: iter_time=8.272e-05, forward_time=0.193, loss_ctc=62.900, loss_att=49.096, acc=0.752, loss=53.237, backward_time=0.299, grad_norm=67.969, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.294e-05, train_time=1.357
+[gpua005:0/64] 2023-12-19 10:57:42,365 (trainer:737) INFO: 41epoch:train:6001-6100batch: iter_time=8.292e-05, forward_time=0.182, loss_ctc=56.415, loss_att=42.800, acc=0.763, loss=46.884, backward_time=0.326, grad_norm=64.545, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.133, optim0_lr0=6.294e-05, train_time=1.632
+[gpua005:0/64] 2023-12-19 10:59:15,950 (trainer:668) WARNING: The grad norm is nan. Skipping updating the model.
+[gpua005:0/64] 2023-12-19 10:59:32,876 (trainer:668) WARNING: The grad norm is nan. Skipping updating the model.
+[gpua005:0/64] 2023-12-19 11:00:30,284 (trainer:737) INFO: 41epoch:train:6101-6200batch: iter_time=8.289e-05, forward_time=0.147, loss_ctc=65.249, loss_att=52.052, acc=0.727, loss=56.011, backward_time=0.314, grad_norm=87.897, clip=100.000, loss_scale=2.784e+31, optim_step_time=0.132, optim0_lr0=6.293e-05, train_time=1.679
+[gpua005:0/64] 2023-12-19 11:02:04,394 (multiple_iter_factory:32) INFO: Building 5th iter-factory...
+[gpua005:0/64] 2023-12-19 11:02:22,868 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 11:02:26,286 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.9", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.9", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.9", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.9", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f0aeae052a0>)
+[gpua005:0/64] 2023-12-19 11:02:26,286 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.9, 
+[gpua005:0/64] 2023-12-19 11:02:26,344 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 11:15:13,610 (trainer:737) INFO: 41epoch:train:6201-6300batch: iter_time=2.480, forward_time=0.146, loss_ctc=52.483, loss_att=41.846, acc=0.749, loss=45.037, backward_time=0.341, grad_norm=69.181, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.293e-05, train_time=8.833
+[gpua005:0/64] 2023-12-19 11:20:05,067 (trainer:737) INFO: 41epoch:train:6301-6400batch: iter_time=9.134e-05, forward_time=0.146, loss_ctc=57.093, loss_att=48.307, acc=0.736, loss=50.943, backward_time=0.544, grad_norm=71.810, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.292e-05, train_time=2.914
+[gpua005:0/64] 2023-12-19 11:25:00,629 (trainer:737) INFO: 41epoch:train:6401-6500batch: iter_time=9.966e-05, forward_time=0.146, loss_ctc=66.913, loss_att=55.680, acc=0.729, loss=59.050, backward_time=0.687, grad_norm=74.404, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.135, optim0_lr0=6.292e-05, train_time=2.955
+[gpua005:0/64] 2023-12-19 11:28:27,728 (trainer:737) INFO: 41epoch:train:6501-6600batch: iter_time=9.209e-05, forward_time=0.146, loss_ctc=71.311, loss_att=50.054, acc=0.757, loss=56.431, backward_time=0.344, grad_norm=60.758, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.291e-05, train_time=2.071
+[gpua005:0/64] 2023-12-19 11:32:05,304 (trainer:737) INFO: 41epoch:train:6601-6700batch: iter_time=9.563e-05, forward_time=0.147, loss_ctc=72.796, loss_att=59.602, acc=0.726, loss=63.560, backward_time=0.467, grad_norm=69.586, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.291e-05, train_time=2.176
+[gpua005:0/64] 2023-12-19 11:35:25,991 (trainer:737) INFO: 41epoch:train:6701-6800batch: iter_time=8.971e-05, forward_time=0.145, loss_ctc=63.054, loss_att=50.639, acc=0.729, loss=54.364, backward_time=0.358, grad_norm=64.236, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.290e-05, train_time=2.007
+[gpua005:0/64] 2023-12-19 11:38:29,485 (trainer:737) INFO: 41epoch:train:6801-6900batch: iter_time=8.960e-05, forward_time=0.146, loss_ctc=56.266, loss_att=49.457, acc=0.730, loss=51.500, backward_time=0.301, grad_norm=59.508, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.290e-05, train_time=1.835
+[gpua005:0/64] 2023-12-19 11:41:57,405 (trainer:737) INFO: 41epoch:train:6901-7000batch: iter_time=9.499e-05, forward_time=0.156, loss_ctc=60.887, loss_att=44.467, acc=0.733, loss=49.393, backward_time=0.422, grad_norm=69.931, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.289e-05, train_time=2.079
+[gpua005:0/64] 2023-12-19 11:45:47,026 (trainer:737) INFO: 41epoch:train:7001-7100batch: iter_time=9.404e-05, forward_time=0.168, loss_ctc=66.575, loss_att=49.556, acc=0.736, loss=54.662, backward_time=0.421, grad_norm=80.413, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.133, optim0_lr0=6.289e-05, train_time=2.296
+[gpua005:0/64] 2023-12-19 11:49:19,703 (trainer:737) INFO: 41epoch:train:7101-7200batch: iter_time=9.488e-05, forward_time=0.169, loss_ctc=70.716, loss_att=53.837, acc=0.732, loss=58.900, backward_time=0.378, grad_norm=71.141, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.133, optim0_lr0=6.288e-05, train_time=2.127
+[gpua005:0/64] 2023-12-19 11:51:50,670 (trainer:737) INFO: 41epoch:train:7201-7300batch: iter_time=8.454e-05, forward_time=0.153, loss_ctc=55.871, loss_att=46.718, acc=0.751, loss=49.464, backward_time=0.290, grad_norm=64.604, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.287e-05, train_time=1.509
+[gpua005:0/64] 2023-12-19 11:55:15,139 (trainer:737) INFO: 41epoch:train:7301-7400batch: iter_time=8.774e-05, forward_time=0.156, loss_ctc=64.068, loss_att=47.812, acc=0.733, loss=52.689, backward_time=0.366, grad_norm=71.126, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.287e-05, train_time=2.045
+[gpua005:0/64] 2023-12-19 11:57:53,905 (trainer:737) INFO: 41epoch:train:7401-7500batch: iter_time=8.932e-05, forward_time=0.146, loss_ctc=52.741, loss_att=40.887, acc=0.752, loss=44.444, backward_time=0.294, grad_norm=64.680, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.286e-05, train_time=1.587
+[gpua005:0/64] 2023-12-19 11:58:13,934 (multiple_iter_factory:32) INFO: Building 6th iter-factory...
+[gpua005:0/64] 2023-12-19 11:58:32,357 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 11:58:35,778 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.8", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.8", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.8", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.8", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f0b10008130>)
+[gpua005:0/64] 2023-12-19 11:58:35,778 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.8, 
+[gpua005:0/64] 2023-12-19 11:58:35,816 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 12:06:01,621 (trainer:737) INFO: 41epoch:train:7501-7600batch: iter_time=2.668, forward_time=0.184, loss_ctc=53.206, loss_att=42.322, acc=0.748, loss=45.587, backward_time=0.308, grad_norm=63.712, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.133, optim0_lr0=6.286e-05, train_time=4.877
+[gpua005:0/64] 2023-12-19 12:11:09,033 (trainer:737) INFO: 41epoch:train:7601-7700batch: iter_time=8.500e-05, forward_time=0.148, loss_ctc=63.220, loss_att=56.331, acc=0.736, loss=58.398, backward_time=0.488, grad_norm=73.049, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.285e-05, train_time=3.074
+[gpua005:0/64] 2023-12-19 12:16:24,566 (trainer:737) INFO: 41epoch:train:7701-7800batch: iter_time=8.697e-05, forward_time=0.175, loss_ctc=72.305, loss_att=57.599, acc=0.748, loss=62.011, backward_time=0.570, grad_norm=79.769, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.134, optim0_lr0=6.285e-05, train_time=3.155
+[gpua005:0/64] 2023-12-19 12:20:44,204 (trainer:737) INFO: 41epoch:train:7801-7900batch: iter_time=9.556e-05, forward_time=0.165, loss_ctc=73.603, loss_att=53.780, acc=0.756, loss=59.727, backward_time=0.369, grad_norm=71.299, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.131, optim0_lr0=6.284e-05, train_time=2.596
+[gpua005:0/64] 2023-12-19 12:25:05,875 (trainer:737) INFO: 41epoch:train:7901-8000batch: iter_time=9.460e-05, forward_time=0.163, loss_ctc=62.505, loss_att=52.370, acc=0.740, loss=55.410, backward_time=0.466, grad_norm=61.811, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.133, optim0_lr0=6.284e-05, train_time=2.616
+[gpua005:0/64] 2023-12-19 12:29:24,819 (trainer:737) INFO: 41epoch:train:8001-8100batch: iter_time=9.397e-05, forward_time=0.154, loss_ctc=56.341, loss_att=46.770, acc=0.734, loss=49.641, backward_time=0.451, grad_norm=89.959, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.283e-05, train_time=2.589
+[gpua005:0/64] 2023-12-19 12:33:36,728 (trainer:737) INFO: 41epoch:train:8101-8200batch: iter_time=8.728e-05, forward_time=0.146, loss_ctc=59.251, loss_att=49.022, acc=0.739, loss=52.091, backward_time=0.429, grad_norm=120.549, clip=100.000, loss_scale=1.349e+31, optim_step_time=0.132, optim0_lr0=6.283e-05, train_time=2.519
+[gpua005:0/64] 2023-12-19 12:37:17,346 (trainer:737) INFO: 41epoch:train:8201-8300batch: iter_time=8.322e-05, forward_time=0.145, loss_ctc=67.171, loss_att=52.329, acc=0.736, loss=56.781, backward_time=0.400, grad_norm=110.482, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.282e-05, train_time=2.206
+[gpua005:0/64] 2023-12-19 12:41:09,961 (trainer:737) INFO: 41epoch:train:8301-8400batch: iter_time=8.205e-05, forward_time=0.146, loss_ctc=68.913, loss_att=52.985, acc=0.735, loss=57.763, backward_time=0.404, grad_norm=71.355, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.282e-05, train_time=2.326
+[gpua005:0/64] 2023-12-19 12:44:41,153 (trainer:737) INFO: 41epoch:train:8401-8500batch: iter_time=8.439e-05, forward_time=0.172, loss_ctc=62.530, loss_att=48.526, acc=0.755, loss=52.727, backward_time=0.398, grad_norm=65.204, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.281e-05, train_time=2.112
+[gpua005:0/64] 2023-12-19 12:48:18,971 (trainer:737) INFO: 41epoch:train:8501-8600batch: iter_time=8.081e-05, forward_time=0.153, loss_ctc=56.245, loss_att=42.434, acc=0.766, loss=46.577, backward_time=0.406, grad_norm=63.418, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.281e-05, train_time=2.178
+[gpua005:0/64] 2023-12-19 12:51:42,847 (trainer:737) INFO: 41epoch:train:8601-8700batch: iter_time=8.812e-05, forward_time=0.150, loss_ctc=64.053, loss_att=51.832, acc=0.730, loss=55.498, backward_time=0.419, grad_norm=89.729, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.280e-05, train_time=2.039
+[gpua005:0/64] 2023-12-19 12:53:38,540 (multiple_iter_factory:32) INFO: Building 7th iter-factory...
+[gpua005:0/64] 2023-12-19 12:53:56,821 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 12:54:00,239 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.10", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.10", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.10", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.10", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f0b1000bf70>)
+[gpua005:0/64] 2023-12-19 12:54:00,239 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.10, 
+[gpua005:0/64] 2023-12-19 12:54:00,322 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 13:04:13,559 (trainer:737) INFO: 41epoch:train:8701-8800batch: iter_time=2.916, forward_time=0.185, loss_ctc=52.103, loss_att=41.533, acc=0.753, loss=44.704, backward_time=0.506, grad_norm=76.501, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.133, optim0_lr0=6.280e-05, train_time=7.507
+[gpua005:0/64] 2023-12-19 13:07:48,314 (trainer:737) INFO: 41epoch:train:8801-8900batch: iter_time=8.697e-05, forward_time=0.146, loss_ctc=56.457, loss_att=46.930, acc=0.750, loss=49.788, backward_time=0.380, grad_norm=81.342, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.279e-05, train_time=2.147
+[gpua005:0/64] 2023-12-19 13:12:39,185 (trainer:737) INFO: 41epoch:train:8901-9000batch: iter_time=8.767e-05, forward_time=0.146, loss_ctc=66.504, loss_att=56.397, acc=0.735, loss=59.429, backward_time=0.478, grad_norm=81.915, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.279e-05, train_time=2.908
+[gpua005:0/64] 2023-12-19 13:19:32,451 (trainer:737) INFO: 41epoch:train:9001-9100batch: iter_time=8.986e-05, forward_time=0.152, loss_ctc=71.143, loss_att=50.189, acc=0.763, loss=56.475, backward_time=0.637, grad_norm=99.611, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.133, optim0_lr0=6.278e-05, train_time=4.132
+[gpua005:0/64] 2023-12-19 13:26:13,732 (trainer:737) INFO: 41epoch:train:9101-9200batch: iter_time=9.156e-05, forward_time=0.176, loss_ctc=71.925, loss_att=58.524, acc=0.743, loss=62.544, backward_time=0.704, grad_norm=145.260, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.278e-05, train_time=4.013
+[gpua005:0/64] 2023-12-19 13:34:12,081 (trainer:737) INFO: 41epoch:train:9201-9300batch: iter_time=9.225e-05, forward_time=0.147, loss_ctc=63.800, loss_att=52.406, acc=0.734, loss=55.824, backward_time=0.823, grad_norm=82.950, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.277e-05, train_time=4.783
+[gpua005:0/64] 2023-12-19 13:40:48,158 (trainer:737) INFO: 41epoch:train:9301-9400batch: iter_time=9.179e-05, forward_time=0.155, loss_ctc=55.651, loss_att=50.473, acc=0.733, loss=52.026, backward_time=0.780, grad_norm=96.646, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.277e-05, train_time=3.961
+[gpua005:0/64] 2023-12-19 13:47:05,808 (trainer:737) INFO: 41epoch:train:9401-9500batch: iter_time=9.153e-05, forward_time=0.147, loss_ctc=60.657, loss_att=45.096, acc=0.739, loss=49.764, backward_time=0.637, grad_norm=100.972, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.133, optim0_lr0=6.276e-05, train_time=3.776
+[gpua005:0/64] 2023-12-19 13:54:33,226 (trainer:737) INFO: 41epoch:train:9501-9600batch: iter_time=9.531e-05, forward_time=0.189, loss_ctc=65.879, loss_att=49.888, acc=0.743, loss=54.685, backward_time=0.613, grad_norm=95.523, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.276e-05, train_time=4.474
+[gpua005:0/64] 2023-12-19 14:05:13,903 (trainer:737) INFO: 41epoch:train:9601-9700batch: iter_time=9.904e-05, forward_time=0.147, loss_ctc=70.688, loss_att=54.165, acc=0.743, loss=59.122, backward_time=0.866, grad_norm=169.026, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.275e-05, train_time=6.407
+[gpua005:0/64] 2023-12-19 14:12:09,272 (trainer:668) WARNING: The grad norm is nan. Skipping updating the model.
+[gpua005:0/64] 2023-12-19 14:14:53,178 (trainer:737) INFO: 41epoch:train:9701-9800batch: iter_time=9.846e-05, forward_time=0.147, loss_ctc=55.332, loss_att=46.032, acc=0.762, loss=48.822, backward_time=0.876, grad_norm=122.659, clip=100.000, loss_scale=1.793e+31, optim_step_time=0.132, optim0_lr0=6.275e-05, train_time=5.793
+[gpua005:0/64] 2023-12-19 14:27:12,798 (trainer:737) INFO: 41epoch:train:9801-9900batch: iter_time=1.045e-04, forward_time=0.150, loss_ctc=64.431, loss_att=47.755, acc=0.743, loss=52.758, backward_time=1.042, grad_norm=86.668, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.133, optim0_lr0=6.274e-05, train_time=7.396
+[gpua005:0/64] 2023-12-19 14:41:47,048 (trainer:737) INFO: 41epoch:train:9901-10000batch: iter_time=9.987e-05, forward_time=0.154, loss_ctc=52.098, loss_att=40.246, acc=0.760, loss=43.802, backward_time=1.008, grad_norm=125.306, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.133, optim0_lr0=6.274e-05, train_time=8.742
+[gpua005:0/64] 2023-12-19 14:42:06,027 (multiple_iter_factory:32) INFO: Building 8th iter-factory...
+[gpua005:0/64] 2023-12-19 14:42:24,325 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 14:42:27,711 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.3", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.3", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.3", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.3", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f06f6221b40>)
+[gpua005:0/64] 2023-12-19 14:42:27,711 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.3, 
+[gpua005:0/64] 2023-12-19 14:42:27,729 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 14:51:17,288 (trainer:737) INFO: 41epoch:train:10001-10100batch: iter_time=2.561, forward_time=0.158, loss_ctc=52.860, loss_att=41.843, acc=0.744, loss=45.148, backward_time=0.286, grad_norm=68.321, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.273e-05, train_time=5.702
+[gpua005:0/64] 2023-12-19 14:53:24,633 (trainer:737) INFO: 41epoch:train:10101-10200batch: iter_time=8.075e-05, forward_time=0.160, loss_ctc=63.599, loss_att=56.679, acc=0.723, loss=58.755, backward_time=0.282, grad_norm=87.850, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.273e-05, train_time=1.273
+[gpua005:0/64] 2023-12-19 14:56:01,986 (trainer:737) INFO: 41epoch:train:10201-10300batch: iter_time=8.254e-05, forward_time=0.163, loss_ctc=72.803, loss_att=56.478, acc=0.743, loss=61.376, backward_time=0.359, grad_norm=174.458, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.272e-05, train_time=1.573
+[gpua005:0/64] 2023-12-19 14:58:44,660 (trainer:737) INFO: 41epoch:train:10301-10400batch: iter_time=8.838e-05, forward_time=0.146, loss_ctc=73.465, loss_att=53.799, acc=0.746, loss=59.699, backward_time=0.305, grad_norm=120.300, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.272e-05, train_time=1.627
+[gpua005:0/64] 2023-12-19 15:01:24,071 (trainer:737) INFO: 41epoch:train:10401-10500batch: iter_time=9.496e-05, forward_time=0.171, loss_ctc=62.761, loss_att=51.274, acc=0.735, loss=54.720, backward_time=0.360, grad_norm=82.229, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.134, optim0_lr0=6.271e-05, train_time=1.594
+[gpua005:0/64] 2023-12-19 15:03:32,098 (trainer:737) INFO: 41epoch:train:10501-10600batch: iter_time=1.003e-04, forward_time=0.146, loss_ctc=56.358, loss_att=46.409, acc=0.729, loss=49.394, backward_time=0.280, grad_norm=80.137, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.270e-05, train_time=1.279
+[gpua005:0/64] 2023-12-19 15:06:38,921 (trainer:737) INFO: 41epoch:train:10601-10700batch: iter_time=9.492e-05, forward_time=0.146, loss_ctc=59.321, loss_att=48.671, acc=0.738, loss=51.866, backward_time=0.318, grad_norm=111.851, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.270e-05, train_time=1.869
+[gpua005:0/64] 2023-12-19 15:09:09,824 (trainer:737) INFO: 41epoch:train:10701-10800batch: iter_time=9.276e-05, forward_time=0.146, loss_ctc=67.372, loss_att=51.232, acc=0.729, loss=56.074, backward_time=0.281, grad_norm=89.845, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.269e-05, train_time=1.509
+[gpua005:0/64] 2023-12-19 15:11:26,088 (trainer:737) INFO: 41epoch:train:10801-10900batch: iter_time=8.757e-05, forward_time=0.145, loss_ctc=68.755, loss_att=52.650, acc=0.728, loss=57.482, backward_time=0.278, grad_norm=89.221, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.269e-05, train_time=1.362
+[gpua005:0/64] 2023-12-19 15:14:09,427 (trainer:737) INFO: 41epoch:train:10901-11000batch: iter_time=9.314e-05, forward_time=0.160, loss_ctc=62.587, loss_att=48.428, acc=0.745, loss=52.676, backward_time=0.344, grad_norm=81.616, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.268e-05, train_time=1.633
+[gpua005:0/64] 2023-12-19 15:16:32,587 (trainer:737) INFO: 41epoch:train:11001-11100batch: iter_time=8.666e-05, forward_time=0.164, loss_ctc=56.637, loss_att=42.393, acc=0.760, loss=46.667, backward_time=0.303, grad_norm=58.387, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.133, optim0_lr0=6.268e-05, train_time=1.431
+[gpua005:0/64] 2023-12-19 15:18:58,200 (trainer:737) INFO: 41epoch:train:11101-11200batch: iter_time=8.570e-05, forward_time=0.148, loss_ctc=63.621, loss_att=50.992, acc=0.724, loss=54.781, backward_time=0.303, grad_norm=76.746, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.267e-05, train_time=1.456
+[gpua005:0/64] 2023-12-19 15:20:45,989 (multiple_iter_factory:32) INFO: Building 9th iter-factory...
+[gpua005:0/64] 2023-12-19 15:21:04,233 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 15:21:07,626 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.2", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.2", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.2", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.2", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f06f0b236a0>)
+[gpua005:0/64] 2023-12-19 15:21:07,626 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.2, 
+[gpua005:0/64] 2023-12-19 15:21:07,630 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 15:26:28,704 (trainer:737) INFO: 41epoch:train:11201-11300batch: iter_time=2.899, forward_time=0.184, loss_ctc=52.186, loss_att=42.273, acc=0.749, loss=45.247, backward_time=0.308, grad_norm=174.603, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.267e-05, train_time=4.505
+[gpua005:0/64] 2023-12-19 15:28:30,862 (trainer:737) INFO: 41epoch:train:11301-11400batch: iter_time=8.404e-05, forward_time=0.147, loss_ctc=56.191, loss_att=47.940, acc=0.750, loss=50.415, backward_time=0.279, grad_norm=74.842, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.266e-05, train_time=1.221
+[gpua005:0/64] 2023-12-19 15:30:35,431 (trainer:737) INFO: 41epoch:train:11401-11500batch: iter_time=9.016e-05, forward_time=0.149, loss_ctc=66.444, loss_att=56.689, acc=0.737, loss=59.615, backward_time=0.279, grad_norm=104.593, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.266e-05, train_time=1.245
+[gpua005:0/64] 2023-12-19 15:33:07,481 (trainer:737) INFO: 41epoch:train:11501-11600batch: iter_time=9.184e-05, forward_time=0.147, loss_ctc=71.234, loss_att=50.524, acc=0.762, loss=56.737, backward_time=0.331, grad_norm=170.038, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.265e-05, train_time=1.520
+[gpua005:0/64] 2023-12-19 15:35:47,970 (trainer:737) INFO: 41epoch:train:11601-11700batch: iter_time=8.978e-05, forward_time=0.147, loss_ctc=71.740, loss_att=58.979, acc=0.741, loss=62.807, backward_time=0.309, grad_norm=81.528, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.132, optim0_lr0=6.265e-05, train_time=1.605
+[gpua005:0/64] 2023-12-19 15:38:26,627 (trainer:737) INFO: 41epoch:train:11701-11800batch: iter_time=8.802e-05, forward_time=0.147, loss_ctc=63.114, loss_att=51.983, acc=0.737, loss=55.322, backward_time=0.332, grad_norm=170.451, clip=100.000, loss_scale=1.247e+31, optim_step_time=0.132, optim0_lr0=6.264e-05, train_time=1.586
+[gpua005:0/64] 2023-12-19 15:41:51,565 (trainer:737) INFO: 41epoch:train:11801-11900batch: iter_time=8.722e-05, forward_time=0.146, loss_ctc=55.950, loss_att=50.292, acc=0.732, loss=51.989, backward_time=0.329, grad_norm=93.367, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.131, optim0_lr0=6.264e-05, train_time=2.049
+[gpua005:0/64] 2023-12-19 15:44:16,706 (trainer:737) INFO: 41epoch:train:11901-12000batch: iter_time=8.668e-05, forward_time=0.146, loss_ctc=61.107, loss_att=45.244, acc=0.740, loss=50.003, backward_time=0.308, grad_norm=87.283, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.131, optim0_lr0=6.263e-05, train_time=1.451
+[gpua005:0/64] 2023-12-19 15:47:14,296 (trainer:737) INFO: 41epoch:train:12001-12100batch: iter_time=8.858e-05, forward_time=0.216, loss_ctc=66.106, loss_att=49.752, acc=0.744, loss=54.658, backward_time=0.389, grad_norm=88.888, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.146, optim0_lr0=6.263e-05, train_time=1.776
+[gpua005:0/64] 2023-12-19 15:49:31,038 (trainer:737) INFO: 41epoch:train:12101-12200batch: iter_time=8.566e-05, forward_time=0.171, loss_ctc=70.335, loss_att=54.184, acc=0.741, loss=59.029, backward_time=0.304, grad_norm=122.299, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.262e-05, train_time=1.367
+[gpua005:0/64] 2023-12-19 15:52:34,332 (trainer:737) INFO: 41epoch:train:12201-12300batch: iter_time=8.739e-05, forward_time=0.146, loss_ctc=55.710, loss_att=46.537, acc=0.759, loss=49.289, backward_time=0.348, grad_norm=71.699, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.262e-05, train_time=1.833
+[gpua005:0/64] 2023-12-19 15:54:56,602 (trainer:737) INFO: 41epoch:train:12301-12400batch: iter_time=8.111e-05, forward_time=0.145, loss_ctc=63.443, loss_att=47.283, acc=0.743, loss=52.131, backward_time=0.295, grad_norm=113.071, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.261e-05, train_time=1.422
+[gpua005:0/64] 2023-12-19 15:57:16,344 (trainer:737) INFO: 41epoch:train:12401-12500batch: iter_time=8.099e-05, forward_time=0.145, loss_ctc=52.475, loss_att=41.048, acc=0.757, loss=44.476, backward_time=0.296, grad_norm=64.037, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.261e-05, train_time=1.397
+[gpua005:0/64] 2023-12-19 15:57:36,372 (multiple_iter_factory:32) INFO: Building 10th iter-factory...
+[gpua005:0/64] 2023-12-19 15:57:54,547 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 15:57:57,954 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.4", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.4", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.4", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.4", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f06f561f2e0>)
+[gpua005:0/64] 2023-12-19 15:57:57,954 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.4, 
+[gpua005:0/64] 2023-12-19 15:57:57,989 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 16:08:38,881 (trainer:737) INFO: 41epoch:train:12501-12600batch: iter_time=2.950, forward_time=0.245, loss_ctc=52.475, loss_att=40.909, acc=0.753, loss=44.379, backward_time=0.317, grad_norm=68.417, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.260e-05, train_time=6.825
+[gpua005:0/64] 2023-12-19 16:11:09,085 (trainer:737) INFO: 41epoch:train:12601-12700batch: iter_time=8.525e-05, forward_time=0.146, loss_ctc=63.373, loss_att=55.118, acc=0.737, loss=57.594, backward_time=0.303, grad_norm=97.219, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.260e-05, train_time=1.502
+[gpua005:0/64] 2023-12-19 16:14:12,342 (trainer:737) INFO: 41epoch:train:12701-12800batch: iter_time=8.987e-05, forward_time=0.147, loss_ctc=72.127, loss_att=57.259, acc=0.749, loss=61.720, backward_time=0.351, grad_norm=109.964, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.133, optim0_lr0=6.259e-05, train_time=1.832
+[gpua005:0/64] 2023-12-19 16:16:40,625 (trainer:737) INFO: 41epoch:train:12801-12900batch: iter_time=8.767e-05, forward_time=0.147, loss_ctc=72.629, loss_att=52.894, acc=0.757, loss=58.815, backward_time=0.297, grad_norm=97.658, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.259e-05, train_time=1.483
+[gpua005:0/64] 2023-12-19 16:19:15,298 (trainer:737) INFO: 41epoch:train:12901-13000batch: iter_time=8.601e-05, forward_time=0.147, loss_ctc=62.529, loss_att=51.673, acc=0.743, loss=54.930, backward_time=0.329, grad_norm=142.835, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.258e-05, train_time=1.546
+[gpua005:0/64] 2023-12-19 16:21:55,558 (trainer:737) INFO: 41epoch:train:13001-13100batch: iter_time=8.721e-05, forward_time=0.146, loss_ctc=56.050, loss_att=46.409, acc=0.740, loss=49.301, backward_time=0.300, grad_norm=94.045, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.258e-05, train_time=1.602
+[gpua005:0/64] 2023-12-19 16:25:30,357 (trainer:737) INFO: 41epoch:train:13101-13200batch: iter_time=9.171e-05, forward_time=0.147, loss_ctc=58.975, loss_att=48.987, acc=0.740, loss=51.983, backward_time=0.385, grad_norm=69.738, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.257e-05, train_time=2.148
+[gpua005:0/64] 2023-12-19 16:27:47,454 (trainer:737) INFO: 41epoch:train:13201-13300batch: iter_time=8.424e-05, forward_time=0.197, loss_ctc=67.511, loss_att=52.164, acc=0.737, loss=56.768, backward_time=0.318, grad_norm=83.406, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.138, optim0_lr0=6.257e-05, train_time=1.371
+[gpua005:0/64] 2023-12-19 16:30:28,569 (trainer:737) INFO: 41epoch:train:13301-13400batch: iter_time=8.162e-05, forward_time=0.170, loss_ctc=68.700, loss_att=52.540, acc=0.736, loss=57.388, backward_time=0.341, grad_norm=81.774, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.135, optim0_lr0=6.256e-05, train_time=1.611
+[gpua005:0/64] 2023-12-19 16:33:19,481 (trainer:737) INFO: 41epoch:train:13401-13500batch: iter_time=8.012e-05, forward_time=0.146, loss_ctc=62.366, loss_att=48.496, acc=0.756, loss=52.657, backward_time=0.355, grad_norm=82.491, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.132, optim0_lr0=6.256e-05, train_time=1.709
+[gpua005:0/64] 2023-12-19 16:35:41,709 (trainer:737) INFO: 41epoch:train:13501-13600batch: iter_time=8.780e-05, forward_time=0.146, loss_ctc=55.935, loss_att=42.097, acc=0.768, loss=46.248, backward_time=0.291, grad_norm=121.516, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.131, optim0_lr0=6.255e-05, train_time=1.422
+[gpua005:0/64] 2023-12-19 16:38:22,101 (trainer:737) INFO: 41epoch:train:13601-13700batch: iter_time=8.456e-05, forward_time=0.146, loss_ctc=63.345, loss_att=51.156, acc=0.732, loss=54.813, backward_time=0.334, grad_norm=99.275, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.131, optim0_lr0=6.255e-05, train_time=1.604
+[gpua005:0/64] 2023-12-19 16:40:09,435 (multiple_iter_factory:32) INFO: Building 11th iter-factory...
+[gpua005:0/64] 2023-12-19 16:40:28,206 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 16:40:31,639 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.7", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.7", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.7", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.7", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f06b27256c0>)
+[gpua005:0/64] 2023-12-19 16:40:31,639 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.7, 
+[gpua005:0/64] 2023-12-19 16:40:31,642 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 16:50:18,947 (trainer:737) INFO: 41epoch:train:13701-13800batch: iter_time=3.169, forward_time=0.178, loss_ctc=52.128, loss_att=42.309, acc=0.748, loss=45.255, backward_time=0.307, grad_norm=108.263, clip=100.000, loss_scale=2.495e+31, optim_step_time=0.132, optim0_lr0=6.254e-05, train_time=7.168
+[gpua005:0/64] 2023-12-19 16:52:24,985 (trainer:737) INFO: 41epoch:train:13801-13900batch: iter_time=8.066e-05, forward_time=0.147, loss_ctc=55.820, loss_att=48.449, acc=0.740, loss=50.660, backward_time=0.276, grad_norm=79.608, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.254e-05, train_time=1.260
+[gpua005:0/64] 2023-12-19 16:54:51,053 (trainer:737) INFO: 41epoch:train:13901-14000batch: iter_time=8.309e-05, forward_time=0.149, loss_ctc=66.247, loss_att=55.268, acc=0.731, loss=58.562, backward_time=0.289, grad_norm=75.467, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.253e-05, train_time=1.460
+[gpua005:0/64] 2023-12-19 16:57:29,919 (trainer:737) INFO: 41epoch:train:14001-14100batch: iter_time=8.769e-05, forward_time=0.146, loss_ctc=71.246, loss_att=50.404, acc=0.758, loss=56.657, backward_time=0.316, grad_norm=62.748, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.253e-05, train_time=1.588
+[gpua005:0/64] 2023-12-19 16:59:49,915 (trainer:737) INFO: 41epoch:train:14101-14200batch: iter_time=8.792e-05, forward_time=0.146, loss_ctc=72.375, loss_att=59.655, acc=0.728, loss=63.471, backward_time=0.304, grad_norm=105.985, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.252e-05, train_time=1.400
+[gpua005:0/64] 2023-12-19 17:02:03,764 (trainer:737) INFO: 41epoch:train:14201-14300batch: iter_time=9.338e-05, forward_time=0.146, loss_ctc=62.269, loss_att=49.912, acc=0.733, loss=53.620, backward_time=0.291, grad_norm=101.522, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.252e-05, train_time=1.338
+[gpua005:0/64] 2023-12-19 17:05:03,523 (trainer:737) INFO: 41epoch:train:14301-14400batch: iter_time=8.659e-05, forward_time=0.146, loss_ctc=55.707, loss_att=49.159, acc=0.735, loss=51.123, backward_time=0.314, grad_norm=89.208, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.251e-05, train_time=1.797
+[gpua005:0/64] 2023-12-19 17:07:33,931 (trainer:737) INFO: 41epoch:train:14401-14500batch: iter_time=8.521e-05, forward_time=0.146, loss_ctc=60.699, loss_att=44.733, acc=0.733, loss=49.523, backward_time=0.292, grad_norm=85.414, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.251e-05, train_time=1.504
+[gpua005:0/64] 2023-12-19 17:09:59,372 (trainer:737) INFO: 41epoch:train:14501-14600batch: iter_time=9.319e-05, forward_time=0.146, loss_ctc=65.842, loss_att=48.929, acc=0.742, loss=54.003, backward_time=0.311, grad_norm=93.752, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.250e-05, train_time=1.454
+[gpua005:0/64] 2023-12-19 17:12:54,586 (trainer:737) INFO: 41epoch:train:14601-14700batch: iter_time=8.231e-05, forward_time=0.199, loss_ctc=70.236, loss_att=53.394, acc=0.735, loss=58.446, backward_time=0.307, grad_norm=98.429, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.136, optim0_lr0=6.250e-05, train_time=1.752
+[gpua005:0/64] 2023-12-19 17:15:18,559 (trainer:737) INFO: 41epoch:train:14701-14800batch: iter_time=8.513e-05, forward_time=0.162, loss_ctc=55.399, loss_att=46.110, acc=0.754, loss=48.896, backward_time=0.306, grad_norm=66.470, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.249e-05, train_time=1.439
+[gpua005:0/64] 2023-12-19 17:17:43,678 (trainer:737) INFO: 41epoch:train:14801-14900batch: iter_time=7.742e-04, forward_time=0.169, loss_ctc=64.260, loss_att=47.953, acc=0.733, loss=52.845, backward_time=0.321, grad_norm=86.581, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.138, optim0_lr0=6.249e-05, train_time=1.451
+[gpua005:0/64] 2023-12-19 17:20:11,089 (trainer:737) INFO: 41epoch:train:14901-15000batch: iter_time=7.910e-05, forward_time=0.146, loss_ctc=52.467, loss_att=40.838, acc=0.753, loss=44.327, backward_time=0.299, grad_norm=74.914, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.132, optim0_lr0=6.248e-05, train_time=1.474
+[gpua005:0/64] 2023-12-19 17:45:11,441 (trainer:343) INFO: 41epoch results: [train] iter_time=0.208, forward_time=0.157, loss_ctc=63.054, loss_att=49.907, acc=0.740, loss=53.851, backward_time=0.368, grad_norm=83.288, clip=100.000, loss_scale=2.577e+31, optim_step_time=0.133, optim0_lr0=6.286e-05, train_time=2.344, time=9 hours, 46 minutes and 33.58 seconds, total_count=615000, gpu_max_cached_mem_GB=36.082, [valid] loss_ctc=32.143, cer_ctc=0.167, loss_att=32.882, acc=0.727, cer=0.344, wer=0.991, loss=32.660, time=24 minutes and 36.38 seconds, total_count=191511, gpu_max_cached_mem_GB=36.082
+[gpua005:0/64] 2023-12-19 17:45:37,934 (trainer:391) INFO: The best model has been updated: valid.total_count
+[gpua005:0/64] 2023-12-19 17:45:38,434 (trainer:445) INFO: The model files were removed: exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/36epoch.pth
+[gpua005:0/64] 2023-12-19 17:45:38,680 (trainer:272) INFO: 42/45epoch started. Estimated time to finish: 1 day, 16 hours and 46 minutes
+[gpua005:0/64] 2023-12-19 17:45:39,731 (multiple_iter_factory:32) INFO: Building 0th iter-factory...
+[gpua005:0/64] 2023-12-19 17:45:57,745 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 17:46:01,145 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.0", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.0", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.0", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.0", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f06b0135e40>)
+[gpua005:0/64] 2023-12-19 17:46:01,145 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.0, 
+[gpua005:0/64] 2023-12-19 17:46:01,182 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 17:53:16,219 (trainer:737) INFO: 42epoch:train:1-100batch: iter_time=2.101, forward_time=0.179, loss_ctc=65.607, loss_att=48.091, acc=0.731, loss=53.346, backward_time=0.315, grad_norm=113.662, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.136, optim0_lr0=6.247e-05, train_time=4.568
+[gpua005:0/64] 2023-12-19 17:59:49,470 (trainer:737) INFO: 42epoch:train:101-200batch: iter_time=1.158e-04, forward_time=0.147, loss_ctc=60.904, loss_att=49.580, acc=0.744, loss=52.977, backward_time=0.753, grad_norm=70.731, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.135, optim0_lr0=6.247e-05, train_time=3.933
+[gpua005:0/64] 2023-12-19 18:05:51,156 (trainer:737) INFO: 42epoch:train:201-300batch: iter_time=1.161e-04, forward_time=0.148, loss_ctc=72.000, loss_att=62.821, acc=0.734, loss=65.575, backward_time=0.715, grad_norm=88.731, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.135, optim0_lr0=6.246e-05, train_time=3.617
+[gpua005:0/64] 2023-12-19 18:07:49,673 (trainer:668) WARNING: The grad norm is nan. Skipping updating the model.
+[gpua005:0/64] 2023-12-19 18:08:25,216 (trainer:737) INFO: 42epoch:train:301-400batch: iter_time=1.009e-04, forward_time=0.166, loss_ctc=65.791, loss_att=47.766, acc=0.720, loss=53.174, backward_time=0.318, grad_norm=71.767, clip=100.000, loss_scale=3.565e+31, optim_step_time=0.135, optim0_lr0=6.246e-05, train_time=1.540
+[gpua005:0/64] 2023-12-19 18:11:12,272 (trainer:737) INFO: 42epoch:train:401-500batch: iter_time=8.431e-05, forward_time=0.186, loss_ctc=75.501, loss_att=59.828, acc=0.717, loss=64.530, backward_time=0.390, grad_norm=80.345, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.135, optim0_lr0=6.245e-05, train_time=1.670
+[gpua005:0/64] 2023-12-19 18:14:13,558 (trainer:737) INFO: 42epoch:train:501-600batch: iter_time=9.118e-05, forward_time=0.162, loss_ctc=71.319, loss_att=55.809, acc=0.738, loss=60.462, backward_time=0.347, grad_norm=80.563, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.245e-05, train_time=1.813
+[gpua005:0/64] 2023-12-19 18:16:41,015 (trainer:737) INFO: 42epoch:train:601-700batch: iter_time=8.036e-05, forward_time=0.148, loss_ctc=69.003, loss_att=54.700, acc=0.739, loss=58.991, backward_time=0.300, grad_norm=75.383, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.244e-05, train_time=1.474
+[gpua005:0/64] 2023-12-19 18:19:32,394 (trainer:737) INFO: 42epoch:train:701-800batch: iter_time=8.132e-05, forward_time=0.147, loss_ctc=61.198, loss_att=47.840, acc=0.751, loss=51.847, backward_time=0.300, grad_norm=64.936, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.244e-05, train_time=1.714
+[gpua005:0/64] 2023-12-19 18:21:49,109 (trainer:737) INFO: 42epoch:train:801-900batch: iter_time=9.172e-05, forward_time=0.150, loss_ctc=81.452, loss_att=61.325, acc=0.717, loss=67.363, backward_time=0.298, grad_norm=98.913, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.243e-05, train_time=1.367
+[gpua005:0/64] 2023-12-19 18:24:39,044 (trainer:737) INFO: 42epoch:train:901-1000batch: iter_time=7.256e-05, forward_time=0.147, loss_ctc=66.039, loss_att=51.626, acc=0.741, loss=55.950, backward_time=0.307, grad_norm=65.141, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.133, optim0_lr0=6.243e-05, train_time=1.699
+[gpua005:0/64] 2023-12-19 18:27:23,753 (trainer:737) INFO: 42epoch:train:1001-1100batch: iter_time=7.446e-05, forward_time=0.161, loss_ctc=71.242, loss_att=48.204, acc=0.752, loss=55.116, backward_time=0.338, grad_norm=77.291, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.242e-05, train_time=1.647
+[gpua005:0/64] 2023-12-19 18:30:17,138 (trainer:737) INFO: 42epoch:train:1101-1200batch: iter_time=7.416e-05, forward_time=0.171, loss_ctc=66.797, loss_att=53.040, acc=0.737, loss=57.167, backward_time=0.374, grad_norm=64.912, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.242e-05, train_time=1.733
+[gpua005:0/64] 2023-12-19 18:31:56,032 (multiple_iter_factory:32) INFO: Building 1th iter-factory...
+[gpua005:0/64] 2023-12-19 18:32:14,473 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 18:32:17,977 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.7", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.7", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.7", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.7", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f0b0f2cd540>)
+[gpua005:0/64] 2023-12-19 18:32:17,977 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.7, 
+[gpua005:0/64] 2023-12-19 18:32:17,981 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 18:39:36,913 (trainer:737) INFO: 42epoch:train:1201-1300batch: iter_time=2.765, forward_time=0.186, loss_ctc=57.900, loss_att=44.822, acc=0.743, loss=48.746, backward_time=0.292, grad_norm=61.977, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.136, optim0_lr0=6.241e-05, train_time=5.598
+[gpua005:0/64] 2023-12-19 18:41:50,174 (trainer:737) INFO: 42epoch:train:1301-1400batch: iter_time=8.052e-05, forward_time=0.150, loss_ctc=63.901, loss_att=49.773, acc=0.733, loss=54.011, backward_time=0.322, grad_norm=76.138, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.241e-05, train_time=1.332
+[gpua005:0/64] 2023-12-19 18:43:57,292 (trainer:737) INFO: 42epoch:train:1401-1500batch: iter_time=7.742e-05, forward_time=0.148, loss_ctc=69.403, loss_att=60.430, acc=0.741, loss=63.122, backward_time=0.281, grad_norm=79.119, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.240e-05, train_time=1.271
+[gpua005:0/64] 2023-12-19 18:45:58,813 (trainer:737) INFO: 42epoch:train:1501-1600batch: iter_time=8.128e-05, forward_time=0.146, loss_ctc=67.867, loss_att=54.594, acc=0.711, loss=58.576, backward_time=0.279, grad_norm=79.244, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.240e-05, train_time=1.215
+[gpua005:0/64] 2023-12-19 18:48:10,404 (trainer:737) INFO: 42epoch:train:1601-1700batch: iter_time=8.739e-05, forward_time=0.149, loss_ctc=69.155, loss_att=53.634, acc=0.716, loss=58.290, backward_time=0.289, grad_norm=113.044, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.239e-05, train_time=1.316
+[gpua005:0/64] 2023-12-19 18:48:52,789 (trainer:668) WARNING: The grad norm is nan. Skipping updating the model.
+[gpua005:0/64] 2023-12-19 18:50:38,476 (trainer:737) INFO: 42epoch:train:1701-1800batch: iter_time=8.527e-05, forward_time=0.163, loss_ctc=70.047, loss_att=56.794, acc=0.718, loss=60.770, backward_time=0.317, grad_norm=101.174, clip=100.000, loss_scale=1.280e+31, optim_step_time=0.135, optim0_lr0=6.239e-05, train_time=1.480
+[gpua005:0/64] 2023-12-19 18:53:42,584 (trainer:737) INFO: 42epoch:train:1801-1900batch: iter_time=8.042e-05, forward_time=0.155, loss_ctc=70.290, loss_att=52.555, acc=0.740, loss=57.875, backward_time=0.305, grad_norm=98.131, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.134, optim0_lr0=6.238e-05, train_time=1.841
+[gpua005:0/64] 2023-12-19 18:56:20,014 (trainer:737) INFO: 42epoch:train:1901-2000batch: iter_time=7.940e-05, forward_time=0.180, loss_ctc=58.153, loss_att=44.650, acc=0.753, loss=48.701, backward_time=0.316, grad_norm=73.375, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.135, optim0_lr0=6.238e-05, train_time=1.574
+[gpua005:0/64] 2023-12-19 18:58:53,177 (trainer:737) INFO: 42epoch:train:2001-2100batch: iter_time=8.015e-05, forward_time=0.166, loss_ctc=68.745, loss_att=56.185, acc=0.730, loss=59.953, backward_time=0.301, grad_norm=73.573, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.137, optim0_lr0=6.237e-05, train_time=1.532
+[gpua005:0/64] 2023-12-19 19:01:22,436 (trainer:737) INFO: 42epoch:train:2101-2200batch: iter_time=8.578e-05, forward_time=0.147, loss_ctc=77.031, loss_att=55.361, acc=0.714, loss=61.862, backward_time=0.289, grad_norm=95.869, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.134, optim0_lr0=6.237e-05, train_time=1.492
+[gpua005:0/64] 2023-12-19 19:03:31,423 (trainer:737) INFO: 42epoch:train:2201-2300batch: iter_time=8.539e-05, forward_time=0.147, loss_ctc=59.968, loss_att=48.873, acc=0.743, loss=52.202, backward_time=0.285, grad_norm=62.528, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.134, optim0_lr0=6.236e-05, train_time=1.290
+[gpua005:0/64] 2023-12-19 19:05:43,828 (trainer:737) INFO: 42epoch:train:2301-2400batch: iter_time=8.234e-05, forward_time=0.147, loss_ctc=72.658, loss_att=46.909, acc=0.742, loss=54.634, backward_time=0.289, grad_norm=91.754, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.134, optim0_lr0=6.236e-05, train_time=1.324
+[gpua005:0/64] 2023-12-19 19:08:07,157 (trainer:737) INFO: 42epoch:train:2401-2500batch: iter_time=8.308e-05, forward_time=0.147, loss_ctc=68.123, loss_att=53.231, acc=0.738, loss=57.698, backward_time=0.286, grad_norm=129.677, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.133, optim0_lr0=6.235e-05, train_time=1.433
+[gpua005:0/64] 2023-12-19 19:08:27,185 (multiple_iter_factory:32) INFO: Building 2th iter-factory...
+[gpua005:0/64] 2023-12-19 19:08:45,904 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 19:08:49,612 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.1", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.1", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.1", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.1", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f0be2f22020>)
+[gpua005:0/64] 2023-12-19 19:08:49,612 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.1, 
+[gpua005:0/64] 2023-12-19 19:08:49,616 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 19:15:15,340 (trainer:737) INFO: 42epoch:train:2501-2600batch: iter_time=2.704, forward_time=0.159, loss_ctc=58.452, loss_att=45.504, acc=0.739, loss=49.389, backward_time=0.287, grad_norm=76.775, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.134, optim0_lr0=6.235e-05, train_time=4.281
+[gpua005:0/64] 2023-12-19 19:17:23,661 (trainer:737) INFO: 42epoch:train:2601-2700batch: iter_time=7.911e-05, forward_time=0.147, loss_ctc=60.060, loss_att=48.378, acc=0.745, loss=51.882, backward_time=0.280, grad_norm=139.622, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.134, optim0_lr0=6.234e-05, train_time=1.283
+[gpua005:0/64] 2023-12-19 19:19:41,755 (trainer:737) INFO: 42epoch:train:2701-2800batch: iter_time=8.051e-05, forward_time=0.157, loss_ctc=70.529, loss_att=62.377, acc=0.732, loss=64.823, backward_time=0.295, grad_norm=126.782, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.136, optim0_lr0=6.234e-05, train_time=1.381
+[gpua005:0/64] 2023-12-19 19:22:14,074 (trainer:737) INFO: 42epoch:train:2801-2900batch: iter_time=8.592e-05, forward_time=0.148, loss_ctc=65.105, loss_att=47.245, acc=0.714, loss=52.603, backward_time=0.294, grad_norm=90.699, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.134, optim0_lr0=6.233e-05, train_time=1.523
+[gpua005:0/64] 2023-12-19 19:24:47,731 (trainer:737) INFO: 42epoch:train:2901-3000batch: iter_time=8.468e-05, forward_time=0.165, loss_ctc=72.661, loss_att=58.334, acc=0.714, loss=62.632, backward_time=0.307, grad_norm=141.625, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.135, optim0_lr0=6.233e-05, train_time=1.536
+[gpua005:0/64] 2023-12-19 19:27:14,965 (trainer:737) INFO: 42epoch:train:3001-3100batch: iter_time=8.166e-05, forward_time=0.172, loss_ctc=68.030, loss_att=54.527, acc=0.733, loss=58.578, backward_time=0.302, grad_norm=100.804, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.134, optim0_lr0=6.232e-05, train_time=1.472
+[gpua005:0/64] 2023-12-19 19:29:28,550 (trainer:737) INFO: 42epoch:train:3101-3200batch: iter_time=8.652e-05, forward_time=0.153, loss_ctc=68.812, loss_att=52.962, acc=0.743, loss=57.717, backward_time=0.286, grad_norm=76.664, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.133, optim0_lr0=6.232e-05, train_time=1.336
+[gpua005:0/64] 2023-12-19 19:31:44,541 (trainer:737) INFO: 42epoch:train:3201-3300batch: iter_time=8.434e-05, forward_time=0.147, loss_ctc=60.354, loss_att=45.719, acc=0.750, loss=50.110, backward_time=0.299, grad_norm=71.422, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.133, optim0_lr0=6.231e-05, train_time=1.360
+[gpua005:0/64] 2023-12-19 19:34:05,775 (trainer:737) INFO: 42epoch:train:3301-3400batch: iter_time=8.255e-05, forward_time=0.147, loss_ctc=78.698, loss_att=58.317, acc=0.720, loss=64.431, backward_time=0.318, grad_norm=100.948, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.133, optim0_lr0=6.231e-05, train_time=1.412
+[gpua005:0/64] 2023-12-19 19:36:28,253 (trainer:737) INFO: 42epoch:train:3401-3500batch: iter_time=8.326e-05, forward_time=0.147, loss_ctc=64.976, loss_att=49.816, acc=0.741, loss=54.364, backward_time=0.286, grad_norm=81.235, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.133, optim0_lr0=6.230e-05, train_time=1.425
+[gpua005:0/64] 2023-12-19 19:39:03,192 (trainer:737) INFO: 42epoch:train:3501-3600batch: iter_time=8.363e-05, forward_time=0.158, loss_ctc=69.766, loss_att=47.430, acc=0.748, loss=54.131, backward_time=0.326, grad_norm=104.294, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.135, optim0_lr0=6.230e-05, train_time=1.549
+[gpua005:0/64] 2023-12-19 19:41:32,663 (trainer:737) INFO: 42epoch:train:3601-3700batch: iter_time=7.766e-05, forward_time=0.148, loss_ctc=66.084, loss_att=52.341, acc=0.731, loss=56.464, backward_time=0.309, grad_norm=200.474, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.134, optim0_lr0=6.229e-05, train_time=1.495
+[gpua005:0/64] 2023-12-19 19:43:05,604 (multiple_iter_factory:32) INFO: Building 3th iter-factory...
+[gpua005:0/64] 2023-12-19 19:43:23,904 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 19:43:27,367 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.2", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.2", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.2", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.2", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f072b0f8910>)
+[gpua005:0/64] 2023-12-19 19:43:27,367 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.2, 
+[gpua005:0/64] 2023-12-19 19:43:27,370 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 19:48:34,446 (trainer:737) INFO: 42epoch:train:3701-3800batch: iter_time=2.464, forward_time=0.189, loss_ctc=57.741, loss_att=42.919, acc=0.754, loss=47.366, backward_time=0.288, grad_norm=71.012, clip=100.000, loss_scale=1.754e+31, optim_step_time=0.135, optim0_lr0=6.229e-05, train_time=4.218
+[gpua005:0/64] 2023-12-19 19:50:37,082 (trainer:737) INFO: 42epoch:train:3801-3900batch: iter_time=8.740e-05, forward_time=0.147, loss_ctc=61.993, loss_att=49.383, acc=0.740, loss=53.166, backward_time=0.280, grad_norm=110.011, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.228e-05, train_time=1.226
+[gpua005:0/64] 2023-12-19 19:52:41,018 (trainer:737) INFO: 42epoch:train:3901-4000batch: iter_time=8.276e-05, forward_time=0.147, loss_ctc=69.106, loss_att=60.458, acc=0.746, loss=63.052, backward_time=0.282, grad_norm=97.327, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.135, optim0_lr0=6.228e-05, train_time=1.239
+[gpua005:0/64] 2023-12-19 19:55:11,883 (trainer:737) INFO: 42epoch:train:4001-4100batch: iter_time=8.360e-05, forward_time=0.147, loss_ctc=67.131, loss_att=53.350, acc=0.727, loss=57.484, backward_time=0.323, grad_norm=123.454, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.135, optim0_lr0=6.227e-05, train_time=1.508
+[gpua005:0/64] 2023-12-19 19:57:49,946 (trainer:737) INFO: 42epoch:train:4101-4200batch: iter_time=9.701e-05, forward_time=0.147, loss_ctc=68.382, loss_att=53.171, acc=0.730, loss=57.734, backward_time=0.293, grad_norm=106.489, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.227e-05, train_time=1.580
+[gpua005:0/64] 2023-12-19 20:00:16,413 (trainer:737) INFO: 42epoch:train:4201-4300batch: iter_time=9.283e-05, forward_time=0.171, loss_ctc=68.167, loss_att=54.843, acc=0.737, loss=58.840, backward_time=0.306, grad_norm=106.846, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.226e-05, train_time=1.464
+[gpua005:0/64] 2023-12-19 20:02:28,049 (trainer:737) INFO: 42epoch:train:4301-4400batch: iter_time=9.167e-05, forward_time=0.148, loss_ctc=69.275, loss_att=52.356, acc=0.746, loss=57.432, backward_time=0.284, grad_norm=120.480, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.226e-05, train_time=1.316
+[gpua005:0/64] 2023-12-19 20:05:16,328 (trainer:737) INFO: 42epoch:train:4401-4500batch: iter_time=8.888e-05, forward_time=0.147, loss_ctc=58.005, loss_att=46.984, acc=0.756, loss=50.291, backward_time=0.318, grad_norm=151.348, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.225e-05, train_time=1.683
+[gpua005:0/64] 2023-12-19 20:08:10,199 (trainer:737) INFO: 42epoch:train:4501-4600batch: iter_time=8.752e-05, forward_time=0.248, loss_ctc=67.555, loss_att=55.882, acc=0.742, loss=59.384, backward_time=0.312, grad_norm=86.198, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.137, optim0_lr0=6.225e-05, train_time=1.738
+[gpua005:0/64] 2023-12-19 20:11:00,739 (trainer:737) INFO: 42epoch:train:4601-4700batch: iter_time=8.425e-05, forward_time=0.149, loss_ctc=75.409, loss_att=56.594, acc=0.721, loss=62.238, backward_time=0.321, grad_norm=88.799, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.224e-05, train_time=1.706
+[gpua005:0/64] 2023-12-19 20:13:44,968 (trainer:737) INFO: 42epoch:train:4701-4800batch: iter_time=9.176e-05, forward_time=0.147, loss_ctc=59.974, loss_att=49.947, acc=0.751, loss=52.955, backward_time=0.357, grad_norm=67.191, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.224e-05, train_time=1.642
+[gpua005:0/64] 2023-12-19 20:16:18,880 (trainer:737) INFO: 42epoch:train:4801-4900batch: iter_time=9.358e-05, forward_time=0.147, loss_ctc=72.052, loss_att=46.238, acc=0.757, loss=53.982, backward_time=0.293, grad_norm=79.447, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.133, optim0_lr0=6.223e-05, train_time=1.539
+[gpua005:0/64] 2023-12-19 20:18:27,612 (trainer:737) INFO: 42epoch:train:4901-5000batch: iter_time=8.814e-05, forward_time=0.147, loss_ctc=67.040, loss_att=54.449, acc=0.745, loss=58.226, backward_time=0.279, grad_norm=84.495, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.223e-05, train_time=1.287
+[gpua005:0/64] 2023-12-19 20:18:47,640 (multiple_iter_factory:32) INFO: Building 4th iter-factory...
+[gpua005:0/64] 2023-12-19 20:19:06,064 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 20:19:09,512 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.9", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.9", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.9", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.9", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f0783b53850>)
+[gpua005:0/64] 2023-12-19 20:19:09,512 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.9, 
+[gpua005:0/64] 2023-12-19 20:19:09,524 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 20:25:59,443 (trainer:737) INFO: 42epoch:train:5001-5100batch: iter_time=3.198, forward_time=0.162, loss_ctc=56.835, loss_att=45.544, acc=0.739, loss=48.931, backward_time=0.281, grad_norm=84.331, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.222e-05, train_time=4.518
+[gpua005:0/64] 2023-12-19 20:28:17,530 (trainer:737) INFO: 42epoch:train:5101-5200batch: iter_time=9.479e-05, forward_time=0.148, loss_ctc=59.503, loss_att=48.498, acc=0.745, loss=51.800, backward_time=0.326, grad_norm=96.989, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.222e-05, train_time=1.381
+[gpua005:0/64] 2023-12-19 20:30:31,192 (trainer:737) INFO: 42epoch:train:5201-5300batch: iter_time=9.838e-05, forward_time=0.148, loss_ctc=70.969, loss_att=62.249, acc=0.733, loss=64.865, backward_time=0.284, grad_norm=95.301, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.221e-05, train_time=1.336
+[gpua005:0/64] 2023-12-19 20:33:24,537 (trainer:737) INFO: 42epoch:train:5301-5400batch: iter_time=1.050e-04, forward_time=0.148, loss_ctc=64.155, loss_att=46.972, acc=0.716, loss=52.127, backward_time=0.309, grad_norm=129.048, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.221e-05, train_time=1.733
+[gpua005:0/64] 2023-12-19 20:35:58,970 (trainer:737) INFO: 42epoch:train:5401-5500batch: iter_time=9.646e-05, forward_time=0.147, loss_ctc=72.369, loss_att=57.859, acc=0.714, loss=62.212, backward_time=0.303, grad_norm=96.342, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.220e-05, train_time=1.544
+[gpua005:0/64] 2023-12-19 20:38:23,876 (trainer:737) INFO: 42epoch:train:5501-5600batch: iter_time=8.689e-05, forward_time=0.234, loss_ctc=67.882, loss_att=54.502, acc=0.735, loss=58.516, backward_time=0.307, grad_norm=96.161, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.137, optim0_lr0=6.220e-05, train_time=1.449
+[gpua005:0/64] 2023-12-19 20:40:47,720 (trainer:737) INFO: 42epoch:train:5601-5700batch: iter_time=9.576e-05, forward_time=0.149, loss_ctc=68.313, loss_att=52.411, acc=0.745, loss=57.182, backward_time=0.307, grad_norm=68.650, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.134, optim0_lr0=6.219e-05, train_time=1.439
+[gpua005:0/64] 2023-12-19 20:42:53,230 (trainer:737) INFO: 42epoch:train:5701-5800batch: iter_time=9.074e-05, forward_time=0.149, loss_ctc=60.466, loss_att=45.739, acc=0.752, loss=50.157, backward_time=0.284, grad_norm=59.954, clip=100.000, loss_scale=3.509e+31, optim_step_time=0.134, optim0_lr0=6.219e-05, train_time=1.255
+[gpua005:0/64] 2023-12-19 20:45:37,466 (trainer:737) INFO: 42epoch:train:5801-5900batch: iter_time=8.712e-05, forward_time=0.148, loss_ctc=76.927, loss_att=57.915, acc=0.722, loss=63.618, backward_time=0.333, grad_norm=91.996, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.134, optim0_lr0=6.218e-05, train_time=1.642
+[gpua005:0/64] 2023-12-19 20:48:13,544 (trainer:737) INFO: 42epoch:train:5901-6000batch: iter_time=9.390e-05, forward_time=0.148, loss_ctc=64.624, loss_att=49.428, acc=0.743, loss=53.987, backward_time=0.305, grad_norm=69.705, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.134, optim0_lr0=6.218e-05, train_time=1.561
+[gpua005:0/64] 2023-12-19 20:50:43,477 (trainer:737) INFO: 42epoch:train:6001-6100batch: iter_time=9.781e-05, forward_time=0.163, loss_ctc=69.376, loss_att=46.962, acc=0.750, loss=53.686, backward_time=0.304, grad_norm=77.233, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.134, optim0_lr0=6.217e-05, train_time=1.499
+[gpua005:0/64] 2023-12-19 20:52:59,294 (trainer:737) INFO: 42epoch:train:6101-6200batch: iter_time=8.814e-05, forward_time=0.147, loss_ctc=65.070, loss_att=51.364, acc=0.734, loss=55.476, backward_time=0.297, grad_norm=67.453, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.134, optim0_lr0=6.217e-05, train_time=1.358
+[gpua005:0/64] 2023-12-19 20:54:19,770 (multiple_iter_factory:32) INFO: Building 5th iter-factory...
+[gpua005:0/64] 2023-12-19 20:54:38,554 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 20:54:42,042 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.3", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.3", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.3", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.3", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f06b206ba30>)
+[gpua005:0/64] 2023-12-19 20:54:42,042 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.3, 
+[gpua005:0/64] 2023-12-19 20:54:42,045 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 21:00:28,907 (trainer:737) INFO: 42epoch:train:6201-6300batch: iter_time=3.172, forward_time=0.179, loss_ctc=57.386, loss_att=42.071, acc=0.753, loss=46.665, backward_time=0.286, grad_norm=68.176, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.135, optim0_lr0=6.216e-05, train_time=4.496
+[gpua005:0/64] 2023-12-19 21:02:35,600 (trainer:737) INFO: 42epoch:train:6301-6400batch: iter_time=7.775e-05, forward_time=0.146, loss_ctc=60.520, loss_att=47.805, acc=0.741, loss=51.619, backward_time=0.278, grad_norm=74.345, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.133, optim0_lr0=6.216e-05, train_time=1.267
+[gpua005:0/64] 2023-12-19 21:04:36,248 (trainer:737) INFO: 42epoch:train:6401-6500batch: iter_time=8.403e-05, forward_time=0.147, loss_ctc=67.740, loss_att=59.697, acc=0.746, loss=62.110, backward_time=0.280, grad_norm=74.487, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.134, optim0_lr0=6.215e-05, train_time=1.206
+[gpua005:0/64] 2023-12-19 21:07:15,822 (trainer:737) INFO: 42epoch:train:6501-6600batch: iter_time=8.679e-05, forward_time=0.148, loss_ctc=66.699, loss_att=53.280, acc=0.717, loss=57.306, backward_time=0.296, grad_norm=113.355, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.135, optim0_lr0=6.215e-05, train_time=1.596
+[gpua005:0/64] 2023-12-19 21:09:38,651 (trainer:737) INFO: 42epoch:train:6601-6700batch: iter_time=8.785e-05, forward_time=0.148, loss_ctc=67.988, loss_att=52.891, acc=0.721, loss=57.420, backward_time=0.290, grad_norm=121.992, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.135, optim0_lr0=6.214e-05, train_time=1.428
+[gpua005:0/64] 2023-12-19 21:11:57,948 (trainer:737) INFO: 42epoch:train:6701-6800batch: iter_time=8.438e-05, forward_time=0.148, loss_ctc=67.972, loss_att=55.227, acc=0.726, loss=59.050, backward_time=0.294, grad_norm=109.414, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.135, optim0_lr0=6.214e-05, train_time=1.393
+[gpua005:0/64] 2023-12-19 21:14:34,329 (trainer:737) INFO: 42epoch:train:6801-6900batch: iter_time=8.142e-05, forward_time=0.158, loss_ctc=69.635, loss_att=51.545, acc=0.746, loss=56.972, backward_time=0.307, grad_norm=99.257, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.138, optim0_lr0=6.213e-05, train_time=1.564
+[gpua005:0/64] 2023-12-19 21:17:01,554 (trainer:737) INFO: 42epoch:train:6901-7000batch: iter_time=8.183e-05, forward_time=0.148, loss_ctc=58.096, loss_att=44.293, acc=0.756, loss=48.434, backward_time=0.285, grad_norm=73.478, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.135, optim0_lr0=6.213e-05, train_time=1.472
+[gpua005:0/64] 2023-12-19 21:19:33,447 (trainer:737) INFO: 42epoch:train:7001-7100batch: iter_time=8.225e-05, forward_time=0.150, loss_ctc=67.423, loss_att=55.391, acc=0.735, loss=59.000, backward_time=0.290, grad_norm=70.737, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.135, optim0_lr0=6.212e-05, train_time=1.519
+[gpua005:0/64] 2023-12-19 21:23:51,165 (trainer:737) INFO: 42epoch:train:7101-7200batch: iter_time=9.012e-05, forward_time=0.183, loss_ctc=75.264, loss_att=54.548, acc=0.719, loss=60.763, backward_time=0.430, grad_norm=90.447, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.138, optim0_lr0=6.212e-05, train_time=2.576
+[gpua005:0/64] 2023-12-19 21:28:07,533 (trainer:737) INFO: 42epoch:train:7201-7300batch: iter_time=9.151e-05, forward_time=0.347, loss_ctc=59.700, loss_att=48.515, acc=0.748, loss=51.870, backward_time=0.330, grad_norm=63.859, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.144, optim0_lr0=6.211e-05, train_time=2.564
+[gpua005:0/64] 2023-12-19 21:31:38,379 (trainer:737) INFO: 42epoch:train:7301-7400batch: iter_time=9.064e-05, forward_time=0.182, loss_ctc=71.768, loss_att=46.167, acc=0.746, loss=53.847, backward_time=0.363, grad_norm=72.678, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.146, optim0_lr0=6.211e-05, train_time=2.109
+[gpua005:0/64] 2023-12-19 21:33:53,828 (trainer:737) INFO: 42epoch:train:7401-7500batch: iter_time=9.580e-05, forward_time=0.148, loss_ctc=66.756, loss_att=52.688, acc=0.743, loss=56.909, backward_time=0.293, grad_norm=69.872, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.135, optim0_lr0=6.210e-05, train_time=1.354
+[gpua005:0/64] 2023-12-19 21:34:13,857 (multiple_iter_factory:32) INFO: Building 6th iter-factory...
+[gpua005:0/64] 2023-12-19 21:34:32,241 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 21:34:36,142 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.4", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.4", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.4", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.4", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f07641f2230>)
+[gpua005:0/64] 2023-12-19 21:34:36,142 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.4, 
+[gpua005:0/64] 2023-12-19 21:34:36,145 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 21:41:03,765 (trainer:737) INFO: 42epoch:train:7501-7600batch: iter_time=2.976, forward_time=0.161, loss_ctc=56.514, loss_att=46.909, acc=0.740, loss=49.791, backward_time=0.281, grad_norm=73.378, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.134, optim0_lr0=6.210e-05, train_time=4.299
+[gpua005:0/64] 2023-12-19 21:43:03,786 (trainer:737) INFO: 42epoch:train:7601-7700batch: iter_time=9.520e-05, forward_time=0.147, loss_ctc=59.557, loss_att=48.068, acc=0.756, loss=51.515, backward_time=0.278, grad_norm=64.935, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.133, optim0_lr0=6.209e-05, train_time=1.200
+[gpua005:0/64] 2023-12-19 21:45:16,768 (trainer:737) INFO: 42epoch:train:7701-7800batch: iter_time=8.198e-05, forward_time=0.149, loss_ctc=70.362, loss_att=61.641, acc=0.744, loss=64.257, backward_time=0.308, grad_norm=66.695, clip=100.000, loss_scale=7.018e+31, optim_step_time=0.134, optim0_lr0=6.209e-05, train_time=1.330
+[gpua005:0/64] 2023-12-19 21:47:58,825 (trainer:737) INFO: 42epoch:train:7801-7900batch: iter_time=9.334e-05, forward_time=0.148, loss_ctc=62.870, loss_att=46.432, acc=0.731, loss=51.363, backward_time=0.286, grad_norm=66.507, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.134, optim0_lr0=6.208e-05, train_time=1.620
+[gpua005:0/64] 2023-12-19 21:50:39,656 (trainer:737) INFO: 42epoch:train:7901-8000batch: iter_time=1.001e-04, forward_time=0.152, loss_ctc=71.664, loss_att=58.383, acc=0.728, loss=62.368, backward_time=0.295, grad_norm=81.059, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.136, optim0_lr0=6.208e-05, train_time=1.608
+[gpua005:0/64] 2023-12-19 21:53:03,694 (trainer:737) INFO: 42epoch:train:8001-8100batch: iter_time=9.059e-05, forward_time=0.195, loss_ctc=67.677, loss_att=53.925, acc=0.745, loss=58.050, backward_time=0.303, grad_norm=81.204, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.144, optim0_lr0=6.207e-05, train_time=1.440
+[gpua005:0/64] 2023-12-19 21:55:38,099 (trainer:737) INFO: 42epoch:train:8101-8200batch: iter_time=8.478e-04, forward_time=0.164, loss_ctc=68.001, loss_att=53.881, acc=0.746, loss=58.117, backward_time=0.306, grad_norm=72.927, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.135, optim0_lr0=6.207e-05, train_time=1.543
+[gpua005:0/64] 2023-12-19 21:57:51,417 (trainer:737) INFO: 42epoch:train:8201-8300batch: iter_time=1.041e-04, forward_time=0.146, loss_ctc=59.847, loss_att=46.677, acc=0.759, loss=50.628, backward_time=0.291, grad_norm=77.436, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.133, optim0_lr0=6.206e-05, train_time=1.334
+[gpua005:0/64] 2023-12-19 22:00:39,394 (trainer:737) INFO: 42epoch:train:8301-8400batch: iter_time=9.213e-05, forward_time=0.163, loss_ctc=76.601, loss_att=59.345, acc=0.724, loss=64.522, backward_time=0.345, grad_norm=104.841, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.134, optim0_lr0=6.206e-05, train_time=1.680
+[gpua005:0/64] 2023-12-19 22:03:21,368 (trainer:737) INFO: 42epoch:train:8401-8500batch: iter_time=8.977e-05, forward_time=0.149, loss_ctc=64.564, loss_att=51.915, acc=0.746, loss=55.710, backward_time=0.365, grad_norm=100.287, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.134, optim0_lr0=6.205e-05, train_time=1.620
+[gpua005:0/64] 2023-12-19 22:06:19,033 (trainer:737) INFO: 42epoch:train:8501-8600batch: iter_time=8.797e-05, forward_time=0.147, loss_ctc=68.965, loss_att=46.928, acc=0.758, loss=53.539, backward_time=0.333, grad_norm=85.672, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.134, optim0_lr0=6.205e-05, train_time=1.776
+[gpua005:0/64] 2023-12-19 22:08:27,395 (trainer:737) INFO: 42epoch:train:8601-8700batch: iter_time=9.116e-05, forward_time=0.146, loss_ctc=64.752, loss_att=52.062, acc=0.743, loss=55.869, backward_time=0.285, grad_norm=87.841, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.133, optim0_lr0=6.204e-05, train_time=1.283
+[gpua005:0/64] 2023-12-19 22:10:02,662 (multiple_iter_factory:32) INFO: Building 7th iter-factory...
+[gpua005:0/64] 2023-12-19 22:10:20,938 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 22:10:24,726 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.10", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.10", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.10", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.10", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f076322fd00>)
+[gpua005:0/64] 2023-12-19 22:10:24,726 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.10, 
+[gpua005:0/64] 2023-12-19 22:10:24,729 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 22:16:10,216 (trainer:737) INFO: 42epoch:train:8701-8800batch: iter_time=3.146, forward_time=0.169, loss_ctc=57.255, loss_att=43.635, acc=0.756, loss=47.721, backward_time=0.283, grad_norm=73.420, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.134, optim0_lr0=6.204e-05, train_time=4.628
+[gpua005:0/64] 2023-12-19 22:18:15,791 (trainer:737) INFO: 42epoch:train:8801-8900batch: iter_time=8.108e-05, forward_time=0.148, loss_ctc=59.433, loss_att=47.896, acc=0.746, loss=51.357, backward_time=0.281, grad_norm=74.708, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.134, optim0_lr0=6.203e-05, train_time=1.256
+[gpua005:0/64] 2023-12-19 22:20:26,840 (trainer:737) INFO: 42epoch:train:8901-9000batch: iter_time=8.332e-05, forward_time=0.148, loss_ctc=68.104, loss_att=60.070, acc=0.749, loss=62.480, backward_time=0.283, grad_norm=73.192, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.134, optim0_lr0=6.203e-05, train_time=1.310
+[gpua005:0/64] 2023-12-19 22:23:12,483 (trainer:737) INFO: 42epoch:train:9001-9100batch: iter_time=8.988e-05, forward_time=0.147, loss_ctc=66.937, loss_att=52.762, acc=0.731, loss=57.015, backward_time=0.293, grad_norm=66.498, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.134, optim0_lr0=6.202e-05, train_time=1.656
+[gpua005:0/64] 2023-12-19 22:26:27,848 (trainer:737) INFO: 42epoch:train:9101-9200batch: iter_time=8.792e-05, forward_time=0.155, loss_ctc=67.430, loss_att=52.498, acc=0.733, loss=56.977, backward_time=0.323, grad_norm=71.796, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.135, optim0_lr0=6.202e-05, train_time=1.953
+[gpua005:0/64] 2023-12-19 22:29:07,432 (trainer:737) INFO: 42epoch:train:9201-9300batch: iter_time=8.147e-05, forward_time=0.148, loss_ctc=67.217, loss_att=55.304, acc=0.738, loss=58.878, backward_time=0.308, grad_norm=88.069, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.134, optim0_lr0=6.201e-05, train_time=1.596
+[gpua005:0/64] 2023-12-19 22:31:43,712 (trainer:737) INFO: 42epoch:train:9301-9400batch: iter_time=7.788e-05, forward_time=0.147, loss_ctc=69.159, loss_att=52.190, acc=0.747, loss=57.280, backward_time=0.311, grad_norm=75.763, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.134, optim0_lr0=6.201e-05, train_time=1.563
+[gpua005:0/64] 2023-12-19 22:34:21,636 (trainer:737) INFO: 42epoch:train:9401-9500batch: iter_time=8.576e-05, forward_time=0.179, loss_ctc=57.746, loss_att=46.910, acc=0.757, loss=50.161, backward_time=0.376, grad_norm=63.826, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.139, optim0_lr0=6.200e-05, train_time=1.579
+[gpua005:0/64] 2023-12-19 22:37:07,951 (trainer:737) INFO: 42epoch:train:9501-9600batch: iter_time=8.890e-05, forward_time=0.148, loss_ctc=67.172, loss_att=55.235, acc=0.744, loss=58.816, backward_time=0.323, grad_norm=67.766, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.134, optim0_lr0=6.200e-05, train_time=1.663
+[gpua005:0/64] 2023-12-19 22:39:44,299 (trainer:737) INFO: 42epoch:train:9601-9700batch: iter_time=8.729e-05, forward_time=0.148, loss_ctc=74.663, loss_att=56.666, acc=0.722, loss=62.065, backward_time=0.301, grad_norm=81.242, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.133, optim0_lr0=6.199e-05, train_time=1.563
+[gpua005:0/64] 2023-12-19 22:42:34,977 (trainer:737) INFO: 42epoch:train:9701-9800batch: iter_time=8.487e-05, forward_time=0.148, loss_ctc=59.164, loss_att=49.474, acc=0.752, loss=52.381, backward_time=0.304, grad_norm=60.363, clip=100.000, loss_scale=1.404e+32, optim_step_time=0.133, optim0_lr0=6.199e-05, train_time=1.707
+[gpua005:0/64] 2023-12-19 22:45:02,466 (trainer:737) INFO: 42epoch:train:9801-9900batch: iter_time=8.375e-05, forward_time=0.160, loss_ctc=71.113, loss_att=45.785, acc=0.759, loss=53.383, backward_time=0.300, grad_norm=71.237, clip=100.000, loss_scale=1.623e+32, optim_step_time=0.133, optim0_lr0=6.198e-05, train_time=1.475
+[gpua005:0/64] 2023-12-19 22:47:29,468 (trainer:737) INFO: 42epoch:train:9901-10000batch: iter_time=8.738e-05, forward_time=0.151, loss_ctc=66.629, loss_att=54.253, acc=0.748, loss=57.966, backward_time=0.311, grad_norm=73.309, clip=100.000, loss_scale=1.623e+32, optim_step_time=0.134, optim0_lr0=6.198e-05, train_time=1.470
+[gpua005:0/64] 2023-12-19 22:47:49,497 (multiple_iter_factory:32) INFO: Building 8th iter-factory...
+[gpua005:0/64] 2023-12-19 22:48:07,729 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
+[gpua005:0/64] 2023-12-19 22:48:11,513 (abs_task:1616) INFO: [train] dataset:
+ESPnetDataset(
+  speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.6", "type": "kaldi_ark"}
+  text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.6", "type": "text"}
+  text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.6", "type": "text"}
+  text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.6", "type": "text"}
+  preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f0702803fd0>)
+[gpua005:0/64] 2023-12-19 22:48:11,513 (abs_task:1617) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.6, 
+[gpua005:0/64] 2023-12-19 22:48:11,517 (abs_task:1618) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257
+[gpua005:0/64] 2023-12-19 22:55:04,041 (trainer:737) INFO: 42epoch:train:10001-10100batch: iter_time=3.167, forward_time=0.148, loss_ctc=55.574, loss_att=44.922, acc=0.746, loss=48.117, backward_time=0.282, grad_norm=70.120, clip=100.000, loss_scale=1.623e+32, optim_step_time=0.134, optim0_lr0=6.197e-05, train_time=4.546
+[gpua005:0/64] 2023-12-19 22:57:06,096 (trainer:737) INFO: 42epoch:train:10101-10200batch: iter_time=8.231e-05, forward_time=0.148, loss_ctc=59.053, loss_att=47.879, acc=0.757, loss=51.231, backward_time=0.284, grad_norm=66.857, clip=100.000, loss_scale=1.623e+32, optim_step_time=0.134, optim0_lr0=6.197e-05, train_time=1.220
+Process SpawnProcess-1:
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
+    self.run()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
+    self._target(*self._args, **self._kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1393, in main_worker
+    cls.trainer.run(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 290, in run
+    all_steps_are_invalid = cls.train_one_epoch(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 572, in train_one_epoch
+    retval = model(**batch)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1040, in forward
+    output = self._run_ddp_forward(*inputs, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/parallel/distributed.py", line 1000, in _run_ddp_forward
+    return module_to_run(*inputs[0], **kwargs[0])
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/s2t/espnet_model.py", line 225, in forward
+    loss_att, acc_att, cer_att, wer_att = self._calc_att_loss(
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/s2t/espnet_model.py", line 396, in _calc_att_loss
+    loss_att = self.criterion_att(decoder_out, ys_out_pad)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet/nets/pytorch_backend/transformer/label_smoothing_loss.py", line 61, in forward
+    kl = self.criterion(torch.log_softmax(x, dim=1), true_dist)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
+    return forward_call(*input, **kwargs)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/modules/loss.py", line 471, in forward
+    return F.kl_div(input, target, reduction=self.reduction, log_target=self.log_target)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/nn/functional.py", line 2928, in kl_div
+    reduced = torch.kl_div(input, target, reduction_enum, log_target=log_target)
+torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 524.00 MiB (GPU 0; 39.39 GiB total capacity; 37.17 GiB already allocated; 276.75 MiB free; 38.49 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
+gpua005:3000819:3000897 [0] NCCL INFO [Service thread] Connection closed by localRank 0
+gpua005:3000819:3000819 [0] NCCL INFO comm 0xec8b50a0 rank 0 nranks 64 cudaDev 0 busId 7000 - Abort COMPLETE
+Traceback (most recent call last):
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main
+    return _run_code(code, main_globals, None,
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code
+    exec(code, run_globals)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in <module>
+    main()
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main
+    S2TTask.main(cmd=cmd)
+  File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1134, in main
+    while not ProcessContext(processes, error_queues).join():
+  File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join
+    raise ProcessExitedException(
+torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with exit code 1
+srun: error: gpua005: task 0: Exited with exit code 1
+srun: Job step aborted: Waiting up to 32 seconds for job step to finish.