diff --git "a/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/train.4.log" "b/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/train.4.log" new file mode 100644--- /dev/null +++ "b/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/train.4.log" @@ -0,0 +1,3101 @@ +# Running on gpua006.delta.ncsa.illinois.edu +# Started at Sun Feb 11 12:45:35 CST 2024 +# SLURMD_NODENAME=gpua006 +# SLURM_CLUSTER_NAME=delta +# SLURM_CONF=/var/spool/slurmd/conf-cache/slurm.conf +# SLURM_CPUS_ON_NODE=64 +# SLURM_CPUS_PER_TASK=64 +# SLURM_EXPORT_ENV=PATH +# SLURM_GET_USER_ENV=1 +# SLURM_GPUS_ON_NODE=4 +# SLURM_GTIDS=0 +# SLURM_JOBID=2972492 +# SLURM_JOB_ACCOUNT=bbjs-delta-gpu +# SLURM_JOB_CPUS_PER_NODE='64(x16)' +# SLURM_JOB_END_TIME=1707754521 +# SLURM_JOB_GID=202 +# SLURM_JOB_GPUS=0,1,2,3 +# SLURM_JOB_ID=2972492 +# SLURM_JOB_NAME=exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/train.log +# SLURM_JOB_NODELIST='gpua[006,012,016,033,038-040,042,049,054-055,057,079-080,085,089]' +# SLURM_JOB_NUM_NODES=16 +# SLURM_JOB_PARTITION=gpuA100x4 +# SLURM_JOB_QOS=bbjs-delta-gpu +# SLURM_JOB_RESERVATION=bbjs +# SLURM_JOB_START_TIME=1707677121 +# SLURM_JOB_UID=68077 +# SLURM_JOB_USER=peng6 +# SLURM_LOCALID=0 +# SLURM_MEM_PER_NODE=240000 +# SLURM_MPI_TYPE=pmi2 +# SLURM_NNODES=16 +# SLURM_NODEID=0 +# SLURM_NODELIST='gpua[006,012,016,033,038-040,042,049,054-055,057,079-080,085,089]' +# SLURM_NODE_ALIASES='(null)' +# SLURM_OPEN_MODE=a +# SLURM_PRIO_PROCESS=0 +# SLURM_PROCID=0 +# SLURM_SUBMIT_DIR=/scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1 +# SLURM_SUBMIT_HOST=dt-login03.delta.ncsa.illinois.edu +# SLURM_TASKS_PER_NODE='1(x16)' +# SLURM_TASK_PID=720587 +# SLURM_TOPOLOGY_ADDR=ss00.ss05.gpua006 +# SLURM_TOPOLOGY_ADDR_PATTERN=switch.switch.node +# SLURM_WORKING_CLUSTER=delta:dt-sched:6817:9984:109 +# srun --export=ALL python3 -m espnet2.bin.s2t_train --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_5d65f79e-0afb-4171-b482-84bc82633098 +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_5d65f79e-0afb-4171-b482-84bc82633098 +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_5d65f79e-0afb-4171-b482-84bc82633098 +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_st/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_5d65f79e-0afb-4171-b482-84bc82633098 +ats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_5d65f79e-0afb-4171-b482-84bc82633098 +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_st/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_st/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_5d65f79e-0afb-4171-b482-84bc82633098 +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_5d65f79e-0afb-4171-b482-84bc82633098 +ats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_5d65f79e-0afb-4171-b482-84bc82633098 +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_5d65f79e-0afb-4171-b482-84bc82633098 +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_st/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_5d65f79e-0afb-4171-b482-84bc82633098 +ats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_5d65f79e-0afb-4171-b482-84bc82633098 +ats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_5d65f79e-0afb-4171-b482-84bc82633098 +[gpua006:0/64] 2024-02-11 12:45:50,790 (distributed_c10d:319) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[gpua006:0/64] 2024-02-11 12:46:00,873 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=44, timeout=0:30:00) +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_5d65f79e-0afb-4171-b482-84bc82633098 +[gpua006:0/64] 2024-02-11 12:46:10,924 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=48, timeout=0:30:00) +[gpua006:0/64] 2024-02-11 12:46:20,929 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=48, timeout=0:30:00) +[gpua006:0/64] 2024-02-11 12:46:30,991 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=48, timeout=0:30:00) +[gpua006:0/64] 2024-02-11 12:46:41,084 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=48, timeout=0:30:00) +[gpua006:0/64] 2024-02-11 12:46:51,171 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=48, timeout=0:30:00) +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_5d65f79e-0afb-4171-b482-84bc82633098 +[gpua006:0/64] 2024-02-11 12:47:01,264 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=52, timeout=0:30:00) +[gpua006:0/64] 2024-02-11 12:47:11,341 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=52, timeout=0:30:00) +[gpua006:0/64] 2024-02-11 12:47:21,363 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=52, timeout=0:30:00) +[gpua006:0/64] 2024-02-11 12:47:31,381 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=52, timeout=0:30:00) +[gpua006:0/64] 2024-02-11 12:47:41,391 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=52, timeout=0:30:00) +[gpua006:0/64] 2024-02-11 12:47:51,477 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=52, timeout=0:30:00) +[gpua006:0/64] 2024-02-11 12:48:01,516 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=52, timeout=0:30:00) +[gpua006:0/64] 2024-02-11 12:48:11,544 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=52, timeout=0:30:00) +[gpua006:0/64] 2024-02-11 12:48:21,635 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=52, timeout=0:30:00) +[gpua006:0/64] 2024-02-11 12:48:31,649 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=52, timeout=0:30:00) +[gpua006:0/64] 2024-02-11 12:48:41,666 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=52, timeout=0:30:00) +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_st/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_st/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_5d65f79e-0afb-4171-b482-84bc82633098 +ats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_5d65f79e-0afb-4171-b482-84bc82633098 +ats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_5d65f79e-0afb-4171-b482-84bc82633098 +[gpua006:0/64] 2024-02-11 12:48:54,562 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=64, worker_count=64, timeout=0:30:00) +[gpua006:0/64] 2024-02-11 12:48:54,562 (distributed_c10d:353) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 64 nodes. +[gpua006:0/64] 2024-02-11 12:48:54,614 (s2t:420) INFO: Vocabulary size: 50002 +[gpua006:0/64] 2024-02-11 12:49:06,333 (abs_task:1270) INFO: pytorch.version=1.13.1, cuda.available=True, cudnn.version=8500, cudnn.benchmark=False, cudnn.deterministic=True +[gpua006:0/64] 2024-02-11 12:49:06,344 (abs_task:1271) INFO: Model structure: +ESPnetS2TCTCModel( + (frontend): DefaultFrontend( + (stft): Stft(n_fft=512, win_length=400, hop_length=160, center=True, normalized=False, onesided=True) + (frontend): Frontend() + (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) + ) + (specaug): SpecAug( + (freq_mask): MaskAlongAxis(mask_width_range=[0, 27], num_mask=2, axis=freq) + (time_mask): MaskAlongAxisVariableMaxWidth(mask_width_ratio_range=[0.0, 0.05], num_mask=10, axis=time) + ) + (normalize): GlobalMVN(stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz, norm_means=True, norm_vars=True) + (encoder): EBranchformerCTCEncoder( + (embed): Conv2dSubsampling8( + (conv): Sequential( + (0): Conv2d(1, 1024, kernel_size=(3, 3), stride=(2, 2)) + (1): ReLU() + (2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2)) + (3): ReLU() + (4): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2)) + (5): ReLU() + ) + (out): Linear(in_features=9216, out_features=1024, bias=True) + (pos_enc): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (1): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (2): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (3): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (4): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (5): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (6): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (7): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (8): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (9): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (10): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (11): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (12): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (13): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (14): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (15): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (16): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (17): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (18): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (19): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (20): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (21): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (22): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (23): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (24): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (25): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (26): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + ) + (after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (conditioning_layer): Linear(in_features=50002, out_features=1024, bias=True) + ) + (prompt_encoder): TransformerEncoder( + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=512, out_features=512, bias=True) + (linear_k): Linear(in_features=512, out_features=512, bias=True) + (linear_v): Linear(in_features=512, out_features=512, bias=True) + (linear_out): Linear(in_features=512, out_features=512, bias=True) + (dropout): Identity() + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=512, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=512, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=512, out_features=512, bias=True) + (linear_k): Linear(in_features=512, out_features=512, bias=True) + (linear_v): Linear(in_features=512, out_features=512, bias=True) + (linear_out): Linear(in_features=512, out_features=512, bias=True) + (dropout): Identity() + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=512, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=512, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=512, out_features=512, bias=True) + (linear_k): Linear(in_features=512, out_features=512, bias=True) + (linear_v): Linear(in_features=512, out_features=512, bias=True) + (linear_out): Linear(in_features=512, out_features=512, bias=True) + (dropout): Identity() + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=512, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=512, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=512, out_features=512, bias=True) + (linear_k): Linear(in_features=512, out_features=512, bias=True) + (linear_v): Linear(in_features=512, out_features=512, bias=True) + (linear_out): Linear(in_features=512, out_features=512, bias=True) + (dropout): Identity() + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=512, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=512, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + ) + (embed): Embedding(50002, 512) + (pos_enc): PositionalEncoding( + (dropout): Dropout(p=0.0, inplace=False) + ) + (embed_proj): Linear(in_features=512, out_features=1024, bias=True) + (prompt_proj): Linear(in_features=512, out_features=1024, bias=True) + (ctc): CTC( + (ctc_lo): Linear(in_features=1024, out_features=50002, bias=True) + (ctc_loss): CTCLoss() + ) +) + +Model summary: + Class Name: ESPnetS2TCTCModel + Total Number of model parameters: 1.01 B + Number of trainable parameters: 1.01 B (100.0%) + Size: 4.02 GB + Type: torch.float32 +[gpua006:0/64] 2024-02-11 12:49:06,345 (abs_task:1274) INFO: Optimizer: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.9, 0.98] + capturable: False + eps: 1e-06 + foreach: None + initial_lr: 0.0002 + lr: 1.6666666666666667e-09 + maximize: False + weight_decay: 0.0 +) +[gpua006:0/64] 2024-02-11 12:49:06,345 (abs_task:1275) INFO: Scheduler: PiecewiseLinearWarmupLR(warmup_steps_list=[0, 30000, 60000], warmup_lr_list=[0.0, 5e-05, 0.0002]) +[gpua006:0/64] 2024-02-11 12:49:06,347 (abs_task:1284) INFO: Saving the configuration in exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/config.yaml +[gpua006:0/64] 2024-02-11 12:49:12,115 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-11 12:49:13,057 (abs_task:1660) INFO: [valid] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev_v3/wav.scp", "type": "kaldi_ark"} + text_prev: {"path": "dump/raw/dev_v3/text.prev", "type": "text"} + text_ctc: {"path": "dump/raw/dev_v3/text.ctc", "type": "text"} + text: {"path": "dump/raw/dev_v3/text", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-11 12:49:13,057 (abs_task:1661) INFO: [valid] Batch sampler: UnsortedBatchSampler(N-batch=4671, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/valid/speech_shape, +[gpua006:0/64] 2024-02-11 12:49:13,063 (abs_task:1662) INFO: [valid] mini-batch sizes summary: N-batch=4671, mean=256.0, min=256, max=257 +[gpua006:0/64] 2024-02-11 12:49:42,808 (trainer:167) INFO: The training was resumed using exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/checkpoint.pth +gpua006:720654:720654 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.6<0> +gpua006:720654:720654 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua006:720654:720654 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua006:720654:720654 [0] NCCL INFO cudaDriverVersion 12020 +NCCL version 2.14.3+cuda11.7 +[gpua006:0/64] 2024-02-11 12:49:48,761 (trainer:301) INFO: 31/45epoch started +[gpua006:0/64] 2024-02-11 12:49:48,801 (multiple_iter_factory:32) INFO: Building 0th iter-factory... +[gpua006:0/64] 2024-02-11 12:50:07,039 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-11 12:50:10,315 (abs_task:1660) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.5", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.5", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.5", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.5", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-11 12:50:10,315 (abs_task:1661) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.5, +[gpua006:0/64] 2024-02-11 12:50:10,318 (abs_task:1662) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257 +gpua055:163211:163211 [2] NCCL INFO cudaDriverVersion 12020 +gpua055:163211:163211 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.55<0> +gpua055:163211:163211 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua055:163211:163211 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua055:163211:163272 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua055:163211:163272 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua055:163211:163272 [2] NCCL INFO Using network AWS Libfabric +gpua055:163211:163272 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua055:163211:163272 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua055:163211:163272 [2] NCCL INFO Trees [0] 43/-1/-1->42->41 [1] 43/-1/-1->42->41 +gpua055:163211:163272 [2] NCCL INFO Channel 00/0 : 42[85000] -> 43[c7000] via P2P/IPC/read +gpua055:163211:163272 [2] NCCL INFO Channel 01/0 : 42[85000] -> 43[c7000] via P2P/IPC/read +gpua055:163211:163272 [2] NCCL INFO Connected all rings +gpua055:163211:163272 [2] NCCL INFO Channel 00/0 : 42[85000] -> 41[46000] via P2P/IPC/read +gpua055:163211:163272 [2] NCCL INFO Channel 01/0 : 42[85000] -> 41[46000] via P2P/IPC/read +gpua055:163211:163272 [2] NCCL INFO Connected all trees +gpua055:163211:163272 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua055:163211:163272 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua055:163211:163272 [2] NCCL INFO comm 0x55ebcdc5c820 rank 42 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua040:3903231:3903231 [1] NCCL INFO cudaDriverVersion 12020 +gpua040:3903231:3903231 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.40<0> +gpua040:3903231:3903231 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua040:3903231:3903231 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua040:3903231:3903298 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua040:3903231:3903298 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua040:3903231:3903298 [1] NCCL INFO Using network AWS Libfabric +gpua040:3903231:3903298 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua040:3903231:3903298 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua040:3903231:3903298 [1] NCCL INFO Trees [0] 26/20/-1->25->24 [1] 26/-1/-1->25->24 +gpua040:3903231:3903298 [1] NCCL INFO Channel 00/0 : 25[46000] -> 26[85000] via P2P/IPC/read +gpua040:3903231:3903298 [1] NCCL INFO Channel 01/0 : 25[46000] -> 26[85000] via P2P/IPC/read +gpua042:289287:289287 [3] NCCL INFO cudaDriverVersion 12020 +gpua042:289287:289287 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.42<0> +gpua042:289287:289287 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua042:289287:289287 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua042:289287:289373 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua042:289287:289373 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua042:289287:289373 [3] NCCL INFO Using network AWS Libfabric +gpua042:289287:289373 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua042:289287:289373 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua042:289287:289373 [3] NCCL INFO Trees [0] -1/-1/-1->31->30 [1] -1/-1/-1->31->30 +gpua042:289287:289373 [3] NCCL INFO Channel 00/0 : 31[c7000] -> 32[7000] [send] via NET/AWS Libfabric/1 +gpua042:289287:289373 [3] NCCL INFO Channel 01/0 : 31[c7000] -> 32[7000] [send] via NET/AWS Libfabric/1 +gpua006:720654:720833 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua006:720654:720833 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua006:720654:720833 [0] NCCL INFO Using network AWS Libfabric +gpua006:720654:720833 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua006:720654:720833 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua006:720654:720833 [0] NCCL INFO Channel 00/02 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 +gpua006:720654:720833 [0] NCCL INFO Channel 01/02 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 +gpua006:720654:720833 [0] NCCL INFO Trees [0] 1/32/-1->0->-1 [1] 1/-1/-1->0->4 +gpua006:720654:720833 [0] NCCL INFO Channel 00/0 : 63[c7000] -> 0[7000] [receive] via NET/AWS Libfabric/1 +gpua006:720654:720833 [0] NCCL INFO Channel 01/0 : 63[c7000] -> 0[7000] [receive] via NET/AWS Libfabric/1 +gpua006:720654:720833 [0] NCCL INFO Channel 00/0 : 0[7000] -> 1[46000] via P2P/IPC/read +gpua012:2768202:2768202 [3] NCCL INFO cudaDriverVersion 12020 +gpua012:2768202:2768202 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.12<0> +gpua012:2768202:2768202 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua012:2768202:2768202 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua012:2768202:2768274 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua012:2768202:2768274 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua012:2768202:2768274 [3] NCCL INFO Using network AWS Libfabric +gpua012:2768202:2768274 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua012:2768202:2768274 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua012:2768202:2768274 [3] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 +gpua012:2768202:2768274 [3] NCCL INFO Channel 00/0 : 7[c7000] -> 8[7000] [send] via NET/AWS Libfabric/1 +gpua012:2768202:2768274 [3] NCCL INFO Channel 01/0 : 7[c7000] -> 8[7000] [send] via NET/AWS Libfabric/1 +gpua039:3705386:3705386 [2] NCCL INFO cudaDriverVersion 12020 +gpua039:3705386:3705386 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.39<0> +gpua039:3705386:3705386 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua039:3705386:3705386 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua039:3705386:3705441 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua039:3705386:3705441 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua039:3705386:3705441 [2] NCCL INFO Using network AWS Libfabric +gpua039:3705386:3705441 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua039:3705386:3705441 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua039:3705386:3705441 [2] NCCL INFO Trees [0] 23/-1/-1->22->21 [1] 23/-1/-1->22->21 +gpua039:3705386:3705441 [2] NCCL INFO Channel 00/0 : 22[85000] -> 23[c7000] via P2P/IPC/read +gpua039:3705386:3705441 [2] NCCL INFO Channel 01/0 : 22[85000] -> 23[c7000] via P2P/IPC/read +gpua033:1688531:1688531 [3] NCCL INFO cudaDriverVersion 12020 +gpua033:1688531:1688531 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.33<0> +gpua033:1688531:1688531 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua033:1688531:1688531 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua033:1688531:1688608 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua033:1688531:1688608 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua033:1688531:1688608 [3] NCCL INFO Using network AWS Libfabric +gpua033:1688531:1688608 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua033:1688531:1688608 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua033:1688531:1688608 [3] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] -1/-1/-1->15->14 +gpua033:1688531:1688608 [3] NCCL INFO Channel 00/0 : 15[c7000] -> 16[7000] [send] via NET/AWS Libfabric/1 +gpua033:1688531:1688608 [3] NCCL INFO Channel 01/0 : 15[c7000] -> 16[7000] [send] via NET/AWS Libfabric/1 +gpua054:464655:464655 [2] NCCL INFO cudaDriverVersion 12020 +gpua054:464655:464655 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.54<0> +gpua054:464655:464655 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua054:464655:464655 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua054:464655:464738 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua054:464655:464738 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua054:464655:464738 [2] NCCL INFO Using network AWS Libfabric +gpua054:464655:464738 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua054:464655:464738 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua054:464655:464738 [2] NCCL INFO Trees [0] 39/-1/-1->38->37 [1] 39/-1/-1->38->37 +gpua054:464655:464738 [2] NCCL INFO Channel 00/0 : 38[85000] -> 39[c7000] via P2P/IPC/read +gpua054:464655:464738 [2] NCCL INFO Channel 01/0 : 38[85000] -> 39[c7000] via P2P/IPC/read +gpua054:464655:464738 [2] NCCL INFO Connected all rings +gpua079:3737285:3737285 [2] NCCL INFO cudaDriverVersion 12020 +gpua079:3737285:3737285 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.79<0> +gpua079:3737285:3737285 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua079:3737285:3737285 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua079:3737285:3737365 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua079:3737285:3737365 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua079:3737285:3737365 [2] NCCL INFO Using network AWS Libfabric +gpua079:3737285:3737365 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua079:3737285:3737365 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua079:3737285:3737365 [2] NCCL INFO Trees [0] 51/-1/-1->50->49 [1] 51/-1/-1->50->49 +gpua079:3737285:3737365 [2] NCCL INFO Channel 00/0 : 50[85000] -> 51[c7000] via P2P/IPC/read +gpua079:3737285:3737365 [2] NCCL INFO Channel 01/0 : 50[85000] -> 51[c7000] via P2P/IPC/read +gpua049:4092061:4092061 [2] NCCL INFO cudaDriverVersion 12020 +gpua049:4092061:4092061 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.49<0> +gpua049:4092061:4092061 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua049:4092061:4092061 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua049:4092061:4092141 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua049:4092061:4092141 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua049:4092061:4092141 [2] NCCL INFO Using network AWS Libfabric +gpua049:4092061:4092141 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua049:4092061:4092141 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua049:4092061:4092141 [2] NCCL INFO Trees [0] 35/-1/-1->34->33 [1] 35/-1/-1->34->33 +gpua049:4092061:4092141 [2] NCCL INFO Channel 00/0 : 34[85000] -> 35[c7000] via P2P/IPC/read +gpua049:4092061:4092141 [2] NCCL INFO Channel 01/0 : 34[85000] -> 35[c7000] via P2P/IPC/read +gpua057:3929639:3929639 [3] NCCL INFO cudaDriverVersion 12020 +gpua057:3929639:3929639 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.57<0> +gpua057:3929639:3929639 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua057:3929639:3929639 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua057:3929639:3929711 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua057:3929639:3929711 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua057:3929639:3929711 [3] NCCL INFO Using network AWS Libfabric +gpua057:3929639:3929711 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua057:3929639:3929711 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua057:3929639:3929711 [3] NCCL INFO Trees [0] -1/-1/-1->47->46 [1] -1/-1/-1->47->46 +gpua057:3929639:3929711 [3] NCCL INFO Channel 00/0 : 47[c7000] -> 48[7000] [send] via NET/AWS Libfabric/1 +gpua057:3929639:3929711 [3] NCCL INFO Channel 01/0 : 47[c7000] -> 48[7000] [send] via NET/AWS Libfabric/1 +gpua055:163212:163212 [3] NCCL INFO cudaDriverVersion 12020 +gpua055:163212:163212 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.55<0> +gpua055:163212:163212 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua055:163212:163212 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua055:163212:163274 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua055:163212:163274 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua055:163212:163274 [3] NCCL INFO Using network AWS Libfabric +gpua055:163212:163274 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua055:163212:163274 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua055:163212:163274 [3] NCCL INFO Trees [0] -1/-1/-1->43->42 [1] -1/-1/-1->43->42 +gpua055:163212:163274 [3] NCCL INFO Channel 00/0 : 43[c7000] -> 44[7000] [send] via NET/AWS Libfabric/1 +gpua055:163212:163274 [3] NCCL INFO Channel 01/0 : 43[c7000] -> 44[7000] [send] via NET/AWS Libfabric/1 +gpua038:223222:223222 [2] NCCL INFO cudaDriverVersion 12020 +gpua038:223222:223222 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.38<0> +gpua038:223222:223222 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua038:223222:223222 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua038:223222:223301 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua038:223222:223301 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua038:223222:223301 [2] NCCL INFO Using network AWS Libfabric +gpua038:223222:223301 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua038:223222:223301 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua038:223222:223301 [2] NCCL INFO Trees [0] 19/-1/-1->18->17 [1] 19/-1/-1->18->17 +gpua038:223222:223301 [2] NCCL INFO Channel 00/0 : 18[85000] -> 19[c7000] via P2P/IPC/read +gpua038:223222:223301 [2] NCCL INFO Channel 01/0 : 18[85000] -> 19[c7000] via P2P/IPC/read +gpua038:223222:223301 [2] NCCL INFO Connected all rings +gpua016:626219:626219 [1] NCCL INFO cudaDriverVersion 12020 +gpua016:626219:626219 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.16<0> +gpua016:626219:626219 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua016:626219:626219 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua016:626219:626293 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua016:626219:626293 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua016:626219:626293 [1] NCCL INFO Using network AWS Libfabric +gpua016:626219:626293 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua016:626219:626293 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua016:626219:626293 [1] NCCL INFO Trees [0] 10/4/-1->9->8 [1] 10/-1/-1->9->8 +gpua016:626219:626293 [1] NCCL INFO Channel 00/0 : 9[46000] -> 10[85000] via P2P/IPC/read +gpua016:626219:626293 [1] NCCL INFO Channel 01/0 : 9[46000] -> 10[85000] via P2P/IPC/read +gpua016:626219:626293 [1] NCCL INFO Connected all rings +gpua085:4061404:4061404 [2] NCCL INFO cudaDriverVersion 12020 +gpua085:4061404:4061404 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.85<0> +gpua085:4061404:4061404 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua085:4061404:4061404 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua085:4061404:4061984 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua085:4061404:4061984 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua085:4061404:4061984 [2] NCCL INFO Using network AWS Libfabric +gpua085:4061404:4061984 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua085:4061404:4061984 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua085:4061404:4061984 [2] NCCL INFO Trees [0] 59/-1/-1->58->57 [1] 59/-1/-1->58->57 +gpua085:4061404:4061984 [2] NCCL INFO Channel 00/0 : 58[85000] -> 59[c7000] via P2P/IPC/read +gpua085:4061404:4061984 [2] NCCL INFO Channel 01/0 : 58[85000] -> 59[c7000] via P2P/IPC/read +gpua080:3313503:3313503 [3] NCCL INFO cudaDriverVersion 12020 +gpua080:3313503:3313503 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.80<0> +gpua080:3313503:3313503 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua080:3313503:3313503 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua080:3313503:3313585 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua080:3313503:3313585 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua080:3313503:3313585 [3] NCCL INFO Using network AWS Libfabric +gpua080:3313503:3313585 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua080:3313503:3313585 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua080:3313503:3313585 [3] NCCL INFO Trees [0] -1/-1/-1->55->54 [1] -1/-1/-1->55->54 +gpua080:3313503:3313585 [3] NCCL INFO Channel 00/0 : 55[c7000] -> 56[7000] [send] via NET/AWS Libfabric/1 +gpua080:3313503:3313585 [3] NCCL INFO Channel 01/0 : 55[c7000] -> 56[7000] [send] via NET/AWS Libfabric/1 +gpua089:894162:894162 [3] NCCL INFO cudaDriverVersion 12020 +gpua089:894162:894162 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.89<0> +gpua089:894162:894162 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua089:894162:894162 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua089:894162:894233 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua089:894162:894233 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua089:894162:894233 [3] NCCL INFO Using network AWS Libfabric +gpua089:894162:894233 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua089:894162:894233 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua089:894162:894233 [3] NCCL INFO Trees [0] -1/-1/-1->63->62 [1] -1/-1/-1->63->62 +gpua089:894162:894233 [3] NCCL INFO Channel 00/0 : 63[c7000] -> 0[7000] [send] via NET/AWS Libfabric/1 +gpua089:894162:894233 [3] NCCL INFO Channel 01/0 : 63[c7000] -> 0[7000] [send] via NET/AWS Libfabric/1 +gpua040:3903231:3903298 [1] NCCL INFO Connected all rings +gpua040:3903231:3903298 [1] NCCL INFO Channel 00/0 : 20[7000] -> 25[46000] [receive] via NET/AWS Libfabric/1 +gpua040:3903231:3903298 [1] NCCL INFO Channel 00/0 : 25[46000] -> 20[7000] [send] via NET/AWS Libfabric/1 +gpua040:3903231:3903298 [1] NCCL INFO Channel 00/0 : 25[46000] -> 24[7000] via P2P/IPC/read +gpua040:3903231:3903298 [1] NCCL INFO Channel 01/0 : 25[46000] -> 24[7000] via P2P/IPC/read +gpua040:3903231:3903298 [1] NCCL INFO Connected all trees +gpua040:3903231:3903298 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua040:3903231:3903298 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua040:3903231:3903298 [1] NCCL INFO comm 0x55bfd6f5ba90 rank 25 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua042:289287:289373 [3] NCCL INFO Connected all rings +gpua042:289287:289373 [3] NCCL INFO Channel 00/0 : 31[c7000] -> 30[85000] via P2P/IPC/read +gpua042:289287:289373 [3] NCCL INFO Channel 01/0 : 31[c7000] -> 30[85000] via P2P/IPC/read +gpua042:289287:289373 [3] NCCL INFO Connected all trees +gpua042:289287:289373 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua042:289287:289373 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua042:289287:289373 [3] NCCL INFO comm 0x55b51889dd40 rank 31 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua006:720654:720833 [0] NCCL INFO Channel 01/0 : 0[7000] -> 1[46000] via P2P/IPC/read +gpua006:720654:720833 [0] NCCL INFO Connected all rings +gpua006:720654:720833 [0] NCCL INFO Channel 01/0 : 0[7000] -> 4[7000] [send] via NET/AWS Libfabric/1 +gpua006:720654:720833 [0] NCCL INFO Channel 00/0 : 32[7000] -> 0[7000] [receive] via NET/AWS Libfabric/1 +gpua006:720654:720833 [0] NCCL INFO Channel 00/0 : 0[7000] -> 32[7000] [send] via NET/AWS Libfabric/1 +gpua006:720654:720833 [0] NCCL INFO Channel 01/0 : 4[7000] -> 0[7000] [receive] via NET/AWS Libfabric/1 +gpua006:720654:720833 [0] NCCL INFO Connected all trees +gpua006:720654:720833 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua006:720654:720833 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua006:720654:720833 [0] NCCL INFO comm 0x55cf75a68990 rank 0 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua012:2768202:2768274 [3] NCCL INFO Connected all rings +gpua012:2768202:2768274 [3] NCCL INFO Channel 00/0 : 7[c7000] -> 6[85000] via P2P/IPC/read +gpua012:2768202:2768274 [3] NCCL INFO Channel 01/0 : 7[c7000] -> 6[85000] via P2P/IPC/read +gpua012:2768202:2768274 [3] NCCL INFO Connected all trees +gpua012:2768202:2768274 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua012:2768202:2768274 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua012:2768202:2768274 [3] NCCL INFO comm 0x55dd6326b8a0 rank 7 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua012:2768200:2768200 [1] NCCL INFO cudaDriverVersion 12020 +gpua012:2768200:2768200 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.12<0> +gpua012:2768200:2768200 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua012:2768200:2768200 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua012:2768200:2768276 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua039:3705386:3705441 [2] NCCL INFO Connected all rings +gpua039:3705386:3705441 [2] NCCL INFO Channel 00/0 : 22[85000] -> 21[46000] via P2P/IPC/read +gpua039:3705386:3705441 [2] NCCL INFO Channel 01/0 : 22[85000] -> 21[46000] via P2P/IPC/read +gpua039:3705386:3705441 [2] NCCL INFO Connected all trees +gpua039:3705386:3705441 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua039:3705386:3705441 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua039:3705386:3705441 [2] NCCL INFO comm 0x560317496ae0 rank 22 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua033:1688531:1688608 [3] NCCL INFO Connected all rings +gpua033:1688531:1688608 [3] NCCL INFO Channel 00/0 : 15[c7000] -> 14[85000] via P2P/IPC/read +gpua033:1688531:1688608 [3] NCCL INFO Channel 01/0 : 15[c7000] -> 14[85000] via P2P/IPC/read +gpua033:1688531:1688608 [3] NCCL INFO Connected all trees +gpua033:1688531:1688608 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua033:1688531:1688608 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua033:1688531:1688608 [3] NCCL INFO comm 0x55734f2ddf10 rank 15 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua054:464655:464738 [2] NCCL INFO Channel 00/0 : 38[85000] -> 37[46000] via P2P/IPC/read +gpua054:464655:464738 [2] NCCL INFO Channel 01/0 : 38[85000] -> 37[46000] via P2P/IPC/read +gpua054:464655:464738 [2] NCCL INFO Connected all trees +gpua054:464655:464738 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua054:464655:464738 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua054:464655:464738 [2] NCCL INFO comm 0x55c057332c10 rank 38 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua079:3737285:3737365 [2] NCCL INFO Connected all rings +gpua079:3737285:3737365 [2] NCCL INFO Channel 00/0 : 50[85000] -> 49[46000] via P2P/IPC/read +gpua079:3737285:3737365 [2] NCCL INFO Channel 01/0 : 50[85000] -> 49[46000] via P2P/IPC/read +gpua079:3737285:3737365 [2] NCCL INFO Connected all trees +gpua079:3737285:3737365 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua079:3737285:3737365 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua079:3737285:3737365 [2] NCCL INFO comm 0x55cfd7a707c0 rank 50 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua049:4092061:4092141 [2] NCCL INFO Connected all rings +gpua049:4092061:4092141 [2] NCCL INFO Channel 00/0 : 34[85000] -> 33[46000] via P2P/IPC/read +gpua049:4092061:4092141 [2] NCCL INFO Channel 01/0 : 34[85000] -> 33[46000] via P2P/IPC/read +gpua049:4092061:4092141 [2] NCCL INFO Connected all trees +gpua049:4092061:4092141 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua049:4092061:4092141 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua049:4092061:4092141 [2] NCCL INFO comm 0x561043ee68e0 rank 34 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua057:3929639:3929711 [3] NCCL INFO Connected all rings +gpua057:3929639:3929711 [3] NCCL INFO Channel 00/0 : 47[c7000] -> 46[85000] via P2P/IPC/read +gpua057:3929639:3929711 [3] NCCL INFO Channel 01/0 : 47[c7000] -> 46[85000] via P2P/IPC/read +gpua057:3929639:3929711 [3] NCCL INFO Connected all trees +gpua057:3929639:3929711 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua057:3929639:3929711 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua057:3929639:3929711 [3] NCCL INFO comm 0x563ff93d1990 rank 47 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua055:163212:163274 [3] NCCL INFO Connected all rings +gpua055:163212:163274 [3] NCCL INFO Channel 00/0 : 43[c7000] -> 42[85000] via P2P/IPC/read +gpua055:163212:163274 [3] NCCL INFO Channel 01/0 : 43[c7000] -> 42[85000] via P2P/IPC/read +gpua055:163212:163274 [3] NCCL INFO Connected all trees +gpua055:163212:163274 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua055:163212:163274 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua055:163212:163274 [3] NCCL INFO comm 0x5610d5226080 rank 43 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua038:223222:223301 [2] NCCL INFO Channel 00/0 : 18[85000] -> 17[46000] via P2P/IPC/read +gpua038:223222:223301 [2] NCCL INFO Channel 01/0 : 18[85000] -> 17[46000] via P2P/IPC/read +gpua038:223222:223301 [2] NCCL INFO Connected all trees +gpua038:223222:223301 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua038:223222:223301 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua038:223222:223301 [2] NCCL INFO comm 0x5625fda8edf0 rank 18 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua016:626219:626293 [1] NCCL INFO Channel 00/0 : 4[7000] -> 9[46000] [receive] via NET/AWS Libfabric/1 +gpua016:626219:626293 [1] NCCL INFO Channel 00/0 : 9[46000] -> 4[7000] [send] via NET/AWS Libfabric/1 +gpua016:626219:626293 [1] NCCL INFO Channel 00/0 : 9[46000] -> 8[7000] via P2P/IPC/read +gpua016:626219:626293 [1] NCCL INFO Channel 01/0 : 9[46000] -> 8[7000] via P2P/IPC/read +gpua016:626219:626293 [1] NCCL INFO Connected all trees +gpua016:626219:626293 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua016:626219:626293 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua016:626219:626293 [1] NCCL INFO comm 0x5596c262b320 rank 9 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua085:4061404:4061984 [2] NCCL INFO Connected all rings +gpua085:4061404:4061984 [2] NCCL INFO Channel 00/0 : 58[85000] -> 57[46000] via P2P/IPC/read +gpua085:4061404:4061984 [2] NCCL INFO Channel 01/0 : 58[85000] -> 57[46000] via P2P/IPC/read +gpua085:4061404:4061984 [2] NCCL INFO Connected all trees +gpua085:4061404:4061984 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua085:4061404:4061984 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua085:4061404:4061984 [2] NCCL INFO comm 0x56297fdb1940 rank 58 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua080:3313503:3313585 [3] NCCL INFO Connected all rings +gpua080:3313503:3313585 [3] NCCL INFO Channel 00/0 : 55[c7000] -> 54[85000] via P2P/IPC/read +gpua080:3313503:3313585 [3] NCCL INFO Channel 01/0 : 55[c7000] -> 54[85000] via P2P/IPC/read +gpua080:3313503:3313585 [3] NCCL INFO Connected all trees +gpua080:3313503:3313585 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua080:3313503:3313585 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua080:3313503:3313585 [3] NCCL INFO comm 0x5592ec197ac0 rank 55 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua089:894162:894233 [3] NCCL INFO Connected all rings +gpua089:894162:894233 [3] NCCL INFO Channel 00/0 : 63[c7000] -> 62[85000] via P2P/IPC/read +gpua089:894162:894233 [3] NCCL INFO Channel 01/0 : 63[c7000] -> 62[85000] via P2P/IPC/read +gpua089:894162:894233 [3] NCCL INFO Connected all trees +gpua089:894162:894233 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua089:894162:894233 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua089:894162:894233 [3] NCCL INFO comm 0x55925505dc80 rank 63 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua040:3903232:3903232 [2] NCCL INFO cudaDriverVersion 12020 +gpua040:3903232:3903232 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.40<0> +gpua040:3903232:3903232 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua040:3903232:3903232 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua040:3903232:3903300 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua040:3903232:3903300 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua040:3903232:3903300 [2] NCCL INFO Using network AWS Libfabric +gpua040:3903232:3903300 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua040:3903232:3903300 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua040:3903232:3903300 [2] NCCL INFO Trees [0] 27/-1/-1->26->25 [1] 27/-1/-1->26->25 +gpua040:3903232:3903300 [2] NCCL INFO Channel 00/0 : 26[85000] -> 27[c7000] via P2P/IPC/read +gpua040:3903232:3903300 [2] NCCL INFO Channel 01/0 : 26[85000] -> 27[c7000] via P2P/IPC/read +gpua042:289285:289285 [1] NCCL INFO cudaDriverVersion 12020 +gpua042:289285:289285 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.42<0> +gpua042:289285:289285 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua042:289285:289285 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua042:289285:289371 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua042:289285:289371 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua042:289285:289371 [1] NCCL INFO Using network AWS Libfabric +gpua042:289285:289371 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua042:289285:289371 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua042:289285:289371 [1] NCCL INFO Trees [0] 30/-1/-1->29->28 [1] 30/44/-1->29->28 +gpua042:289285:289371 [1] NCCL INFO Channel 00/0 : 29[46000] -> 30[85000] via P2P/IPC/read +gpua042:289285:289371 [1] NCCL INFO Channel 01/0 : 29[46000] -> 30[85000] via P2P/IPC/read +gpua042:289285:289371 [1] NCCL INFO Connected all rings +gpua006:720656:720656 [2] NCCL INFO cudaDriverVersion 12020 +gpua006:720656:720656 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.6<0> +gpua006:720656:720656 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua006:720656:720656 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua006:720656:720836 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua006:720656:720836 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua006:720656:720836 [2] NCCL INFO Using network AWS Libfabric +gpua006:720656:720836 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua006:720656:720836 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua006:720656:720836 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +gpua006:720656:720836 [2] NCCL INFO Channel 00/0 : 2[85000] -> 3[c7000] via P2P/IPC/read +gpua006:720656:720836 [2] NCCL INFO Channel 01/0 : 2[85000] -> 3[c7000] via P2P/IPC/read +gpua006:720656:720836 [2] NCCL INFO Connected all rings +gpua012:2768200:2768276 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua012:2768200:2768276 [1] NCCL INFO Using network AWS Libfabric +gpua012:2768200:2768276 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua012:2768200:2768276 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua012:2768200:2768276 [1] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/8/-1->5->4 +gpua012:2768200:2768276 [1] NCCL INFO Channel 00/0 : 5[46000] -> 6[85000] via P2P/IPC/read +gpua012:2768200:2768276 [1] NCCL INFO Channel 01/0 : 5[46000] -> 6[85000] via P2P/IPC/read +gpua012:2768200:2768276 [1] NCCL INFO Connected all rings +gpua012:2768200:2768276 [1] NCCL INFO Channel 01/0 : 5[46000] -> 8[7000] [send] via NET/AWS Libfabric/1 +gpua012:2768200:2768276 [1] NCCL INFO Channel 01/0 : 8[7000] -> 5[46000] [receive] via NET/AWS Libfabric/1 +gpua012:2768200:2768276 [1] NCCL INFO Channel 00/0 : 5[46000] -> 4[7000] via P2P/IPC/read +gpua012:2768200:2768276 [1] NCCL INFO Channel 01/0 : 5[46000] -> 4[7000] via P2P/IPC/read +gpua039:3705387:3705387 [3] NCCL INFO cudaDriverVersion 12020 +gpua039:3705387:3705387 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.39<0> +gpua039:3705387:3705387 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua039:3705387:3705387 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua039:3705387:3705444 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua039:3705387:3705444 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua039:3705387:3705444 [3] NCCL INFO Using network AWS Libfabric +gpua039:3705387:3705444 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua039:3705387:3705444 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua039:3705387:3705444 [3] NCCL INFO Trees [0] -1/-1/-1->23->22 [1] -1/-1/-1->23->22 +gpua039:3705387:3705444 [3] NCCL INFO Channel 00/0 : 23[c7000] -> 24[7000] [send] via NET/AWS Libfabric/1 +gpua039:3705387:3705444 [3] NCCL INFO Channel 01/0 : 23[c7000] -> 24[7000] [send] via NET/AWS Libfabric/1 +gpua033:1688530:1688530 [2] NCCL INFO cudaDriverVersion 12020 +gpua033:1688530:1688530 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.33<0> +gpua033:1688530:1688530 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua033:1688530:1688530 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua033:1688530:1688609 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua033:1688530:1688609 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua033:1688530:1688609 [2] NCCL INFO Using network AWS Libfabric +gpua033:1688530:1688609 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua033:1688530:1688609 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua033:1688530:1688609 [2] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 +gpua033:1688530:1688609 [2] NCCL INFO Channel 00/0 : 14[85000] -> 15[c7000] via P2P/IPC/read +gpua033:1688530:1688609 [2] NCCL INFO Channel 01/0 : 14[85000] -> 15[c7000] via P2P/IPC/read +gpua054:464654:464654 [1] NCCL INFO cudaDriverVersion 12020 +gpua054:464654:464654 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.54<0> +gpua054:464654:464654 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua054:464654:464654 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua054:464654:464736 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua054:464654:464736 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua054:464654:464736 [1] NCCL INFO Using network AWS Libfabric +gpua054:464654:464736 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua054:464654:464736 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua054:464654:464736 [1] NCCL INFO Trees [0] 38/-1/-1->37->36 [1] 38/40/-1->37->36 +gpua054:464654:464736 [1] NCCL INFO Channel 00/0 : 37[46000] -> 38[85000] via P2P/IPC/read +gpua054:464654:464736 [1] NCCL INFO Channel 01/0 : 37[46000] -> 38[85000] via P2P/IPC/read +gpua054:464654:464736 [1] NCCL INFO Connected all rings +gpua079:3737284:3737284 [1] NCCL INFO cudaDriverVersion 12020 +gpua079:3737284:3737284 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.79<0> +gpua079:3737284:3737284 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua079:3737284:3737284 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua079:3737284:3737363 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua079:3737284:3737363 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua079:3737284:3737363 [1] NCCL INFO Using network AWS Libfabric +gpua079:3737284:3737363 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua079:3737284:3737363 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua079:3737284:3737363 [1] NCCL INFO Trees [0] 50/40/-1->49->48 [1] 50/-1/-1->49->48 +gpua079:3737284:3737363 [1] NCCL INFO Channel 00/0 : 49[46000] -> 50[85000] via P2P/IPC/read +gpua079:3737284:3737363 [1] NCCL INFO Channel 01/0 : 49[46000] -> 50[85000] via P2P/IPC/read +gpua049:4092059:4092059 [0] NCCL INFO cudaDriverVersion 12020 +gpua049:4092059:4092059 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.49<0> +gpua049:4092059:4092059 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua049:4092059:4092059 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua049:4092059:4092139 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua049:4092059:4092139 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua049:4092059:4092139 [0] NCCL INFO Using network AWS Libfabric +gpua049:4092059:4092139 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua049:4092059:4092139 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua049:4092059:4092139 [0] NCCL INFO Trees [0] 33/48/-1->32->0 [1] 33/-1/-1->32->36 +gpua049:4092059:4092139 [0] NCCL INFO Channel 00/0 : 31[c7000] -> 32[7000] [receive] via NET/AWS Libfabric/1 +gpua057:3929638:3929638 [2] NCCL INFO cudaDriverVersion 12020 +gpua057:3929638:3929638 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.57<0> +gpua057:3929638:3929638 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua057:3929638:3929638 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua057:3929638:3929712 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua057:3929638:3929712 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua057:3929638:3929712 [2] NCCL INFO Using network AWS Libfabric +gpua057:3929638:3929712 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua057:3929638:3929712 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua057:3929638:3929712 [2] NCCL INFO Trees [0] 47/-1/-1->46->45 [1] 47/-1/-1->46->45 +gpua057:3929638:3929712 [2] NCCL INFO Channel 00/0 : 46[85000] -> 47[c7000] via P2P/IPC/read +gpua057:3929638:3929712 [2] NCCL INFO Channel 01/0 : 46[85000] -> 47[c7000] via P2P/IPC/read +gpua055:163209:163209 [0] NCCL INFO cudaDriverVersion 12020 +gpua055:163209:163209 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.55<0> +gpua055:163209:163209 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua055:163209:163209 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua055:163209:163271 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua055:163209:163271 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua055:163209:163271 [0] NCCL INFO Using network AWS Libfabric +gpua055:163209:163271 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua055:163209:163271 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua055:163209:163271 [0] NCCL INFO Trees [0] 41/44/-1->40->49 [1] 41/-1/-1->40->37 +gpua055:163209:163271 [0] NCCL INFO Channel 00/0 : 39[c7000] -> 40[7000] [receive] via NET/AWS Libfabric/1 +gpua055:163209:163271 [0] NCCL INFO Channel 01/0 : 39[c7000] -> 40[7000] [receive] via NET/AWS Libfabric/1 +gpua038:223221:223221 [1] NCCL INFO cudaDriverVersion 12020 +gpua038:223221:223221 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.38<0> +gpua038:223221:223221 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua038:223221:223221 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua038:223221:223304 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua038:223221:223304 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua038:223221:223304 [1] NCCL INFO Using network AWS Libfabric +gpua038:223221:223304 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua038:223221:223304 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua038:223221:223304 [1] NCCL INFO Trees [0] 18/8/-1->17->16 [1] 18/-1/-1->17->16 +gpua038:223221:223304 [1] NCCL INFO Channel 00/0 : 17[46000] -> 18[85000] via P2P/IPC/read +gpua038:223221:223304 [1] NCCL INFO Channel 01/0 : 17[46000] -> 18[85000] via P2P/IPC/read +gpua038:223221:223304 [1] NCCL INFO Connected all rings +gpua016:626221:626221 [3] NCCL INFO cudaDriverVersion 12020 +gpua016:626221:626221 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.16<0> +gpua016:626221:626221 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua016:626221:626221 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua016:626221:626292 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua016:626221:626292 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua016:626221:626292 [3] NCCL INFO Using network AWS Libfabric +gpua016:626221:626292 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua016:626221:626292 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua016:626221:626292 [3] NCCL INFO Trees [0] -1/-1/-1->11->10 [1] -1/-1/-1->11->10 +gpua016:626221:626292 [3] NCCL INFO Channel 00/0 : 11[c7000] -> 12[7000] [send] via NET/AWS Libfabric/1 +gpua016:626221:626292 [3] NCCL INFO Channel 01/0 : 11[c7000] -> 12[7000] [send] via NET/AWS Libfabric/1 +gpua085:4061402:4061402 [0] NCCL INFO cudaDriverVersion 12020 +gpua085:4061402:4061402 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.85<0> +gpua085:4061402:4061402 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua085:4061402:4061402 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua085:4061402:4061983 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua085:4061402:4061983 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua085:4061402:4061983 [0] NCCL INFO Using network AWS Libfabric +gpua085:4061402:4061983 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua085:4061402:4061983 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua085:4061402:4061983 [0] NCCL INFO Trees [0] 57/60/-1->56->48 [1] 57/-1/-1->56->53 +gpua085:4061402:4061983 [0] NCCL INFO Channel 00/0 : 55[c7000] -> 56[7000] [receive] via NET/AWS Libfabric/1 +gpua080:3313502:3313502 [2] NCCL INFO cudaDriverVersion 12020 +gpua080:3313502:3313502 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.80<0> +gpua080:3313502:3313502 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua080:3313502:3313502 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua080:3313502:3313583 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua080:3313502:3313583 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua080:3313502:3313583 [2] NCCL INFO Using network AWS Libfabric +gpua080:3313502:3313583 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua080:3313502:3313583 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua080:3313502:3313583 [2] NCCL INFO Trees [0] 55/-1/-1->54->53 [1] 55/-1/-1->54->53 +gpua080:3313502:3313583 [2] NCCL INFO Channel 00/0 : 54[85000] -> 55[c7000] via P2P/IPC/read +gpua080:3313502:3313583 [2] NCCL INFO Channel 01/0 : 54[85000] -> 55[c7000] via P2P/IPC/read +gpua089:894161:894161 [2] NCCL INFO cudaDriverVersion 12020 +gpua089:894161:894161 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.89<0> +gpua089:894161:894161 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua089:894161:894161 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua089:894161:894234 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua089:894161:894234 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua089:894161:894234 [2] NCCL INFO Using network AWS Libfabric +gpua089:894161:894234 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua089:894161:894234 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua089:894161:894234 [2] NCCL INFO Trees [0] 63/-1/-1->62->61 [1] 63/-1/-1->62->61 +gpua089:894161:894234 [2] NCCL INFO Channel 00/0 : 62[85000] -> 63[c7000] via P2P/IPC/read +gpua089:894161:894234 [2] NCCL INFO Channel 01/0 : 62[85000] -> 63[c7000] via P2P/IPC/read +gpua089:894161:894234 [2] NCCL INFO Connected all rings +gpua040:3903232:3903300 [2] NCCL INFO Connected all rings +gpua040:3903232:3903300 [2] NCCL INFO Channel 00/0 : 26[85000] -> 25[46000] via P2P/IPC/read +gpua040:3903232:3903300 [2] NCCL INFO Channel 01/0 : 26[85000] -> 25[46000] via P2P/IPC/read +gpua040:3903232:3903300 [2] NCCL INFO Connected all trees +gpua040:3903232:3903300 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua040:3903232:3903300 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua040:3903232:3903300 [2] NCCL INFO comm 0x55f711aebb20 rank 26 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua042:289285:289371 [1] NCCL INFO Channel 01/0 : 29[46000] -> 44[7000] [send] via NET/AWS Libfabric/1 +gpua042:289285:289371 [1] NCCL INFO Channel 01/0 : 44[7000] -> 29[46000] [receive] via NET/AWS Libfabric/1 +gpua042:289285:289371 [1] NCCL INFO Channel 00/0 : 29[46000] -> 28[7000] via P2P/IPC/read +gpua042:289285:289371 [1] NCCL INFO Channel 01/0 : 29[46000] -> 28[7000] via P2P/IPC/read +gpua042:289285:289371 [1] NCCL INFO Connected all trees +gpua042:289285:289371 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua042:289285:289371 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua042:289285:289371 [1] NCCL INFO comm 0x55e6c02682e0 rank 29 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua006:720656:720836 [2] NCCL INFO Channel 00/0 : 2[85000] -> 1[46000] via P2P/IPC/read +gpua006:720656:720836 [2] NCCL INFO Channel 01/0 : 2[85000] -> 1[46000] via P2P/IPC/read +gpua006:720656:720836 [2] NCCL INFO Connected all trees +gpua006:720656:720836 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua006:720656:720836 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua006:720656:720836 [2] NCCL INFO comm 0x55a92a12f410 rank 2 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua012:2768200:2768276 [1] NCCL INFO Connected all trees +gpua012:2768200:2768276 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua012:2768200:2768276 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua012:2768200:2768276 [1] NCCL INFO comm 0x5643ca08edd0 rank 5 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua039:3705387:3705444 [3] NCCL INFO Connected all rings +gpua039:3705387:3705444 [3] NCCL INFO Channel 00/0 : 23[c7000] -> 22[85000] via P2P/IPC/read +gpua039:3705387:3705444 [3] NCCL INFO Channel 01/0 : 23[c7000] -> 22[85000] via P2P/IPC/read +gpua039:3705387:3705444 [3] NCCL INFO Connected all trees +gpua039:3705387:3705444 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua039:3705387:3705444 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua039:3705387:3705444 [3] NCCL INFO comm 0x55d6d7c3df00 rank 23 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua033:1688530:1688609 [2] NCCL INFO Connected all rings +gpua033:1688530:1688609 [2] NCCL INFO Channel 00/0 : 14[85000] -> 13[46000] via P2P/IPC/read +gpua033:1688530:1688609 [2] NCCL INFO Channel 01/0 : 14[85000] -> 13[46000] via P2P/IPC/read +gpua033:1688530:1688609 [2] NCCL INFO Connected all trees +gpua033:1688530:1688609 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua033:1688530:1688609 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua033:1688530:1688609 [2] NCCL INFO comm 0x55a0fd8a5fe0 rank 14 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua054:464654:464736 [1] NCCL INFO Channel 01/0 : 37[46000] -> 40[7000] [send] via NET/AWS Libfabric/1 +gpua054:464654:464736 [1] NCCL INFO Channel 01/0 : 40[7000] -> 37[46000] [receive] via NET/AWS Libfabric/1 +gpua054:464654:464736 [1] NCCL INFO Channel 00/0 : 37[46000] -> 36[7000] via P2P/IPC/read +gpua054:464654:464736 [1] NCCL INFO Channel 01/0 : 37[46000] -> 36[7000] via P2P/IPC/read +gpua054:464654:464736 [1] NCCL INFO Connected all trees +gpua054:464654:464736 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua054:464654:464736 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua054:464654:464736 [1] NCCL INFO comm 0x56070000ca20 rank 37 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua079:3737284:3737363 [1] NCCL INFO Connected all rings +gpua079:3737284:3737363 [1] NCCL INFO Channel 00/0 : 40[7000] -> 49[46000] [receive] via NET/AWS Libfabric/1 +gpua079:3737284:3737363 [1] NCCL INFO Channel 00/0 : 49[46000] -> 40[7000] [send] via NET/AWS Libfabric/1 +gpua079:3737284:3737363 [1] NCCL INFO Channel 00/0 : 49[46000] -> 48[7000] via P2P/IPC/read +gpua079:3737284:3737363 [1] NCCL INFO Channel 01/0 : 49[46000] -> 48[7000] via P2P/IPC/read +gpua079:3737284:3737363 [1] NCCL INFO Connected all trees +gpua079:3737284:3737363 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua079:3737284:3737363 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua079:3737284:3737363 [1] NCCL INFO comm 0x5592b1d901c0 rank 49 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua049:4092059:4092139 [0] NCCL INFO Channel 01/0 : 31[c7000] -> 32[7000] [receive] via NET/AWS Libfabric/1 +gpua049:4092059:4092139 [0] NCCL INFO Channel 00/0 : 32[7000] -> 33[46000] via P2P/IPC/read +gpua049:4092059:4092139 [0] NCCL INFO Channel 01/0 : 32[7000] -> 33[46000] via P2P/IPC/read +gpua049:4092059:4092139 [0] NCCL INFO Connected all rings +gpua049:4092059:4092139 [0] NCCL INFO Channel 01/0 : 32[7000] -> 36[7000] [send] via NET/AWS Libfabric/1 +gpua049:4092059:4092139 [0] NCCL INFO Channel 00/0 : 32[7000] -> 48[7000] [send] via NET/AWS Libfabric/1 +gpua049:4092059:4092139 [0] NCCL INFO Channel 00/0 : 0[7000] -> 32[7000] [receive] via NET/AWS Libfabric/1 +gpua049:4092059:4092139 [0] NCCL INFO Channel 00/0 : 32[7000] -> 0[7000] [send] via NET/AWS Libfabric/1 +gpua049:4092059:4092139 [0] NCCL INFO Channel 00/0 : 48[7000] -> 32[7000] [receive] via NET/AWS Libfabric/1 +gpua049:4092059:4092139 [0] NCCL INFO Channel 01/0 : 36[7000] -> 32[7000] [receive] via NET/AWS Libfabric/1 +gpua057:3929638:3929712 [2] NCCL INFO Connected all rings +gpua057:3929638:3929712 [2] NCCL INFO Channel 00/0 : 46[85000] -> 45[46000] via P2P/IPC/read +gpua057:3929638:3929712 [2] NCCL INFO Channel 01/0 : 46[85000] -> 45[46000] via P2P/IPC/read +gpua057:3929638:3929712 [2] NCCL INFO Connected all trees +gpua057:3929638:3929712 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua057:3929638:3929712 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua057:3929638:3929712 [2] NCCL INFO comm 0x55570c2dcbe0 rank 46 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua055:163209:163271 [0] NCCL INFO Channel 00/0 : 40[7000] -> 41[46000] via P2P/IPC/read +gpua055:163209:163271 [0] NCCL INFO Channel 01/0 : 40[7000] -> 41[46000] via P2P/IPC/read +gpua055:163209:163271 [0] NCCL INFO Connected all rings +gpua055:163209:163271 [0] NCCL INFO Channel 01/0 : 37[46000] -> 40[7000] [receive] via NET/AWS Libfabric/1 +gpua055:163209:163271 [0] NCCL INFO Channel 00/0 : 40[7000] -> 44[7000] [send] via NET/AWS Libfabric/1 +gpua055:163209:163271 [0] NCCL INFO Channel 00/0 : 40[7000] -> 49[46000] [send] via NET/AWS Libfabric/1 +gpua055:163209:163271 [0] NCCL INFO Channel 00/0 : 49[46000] -> 40[7000] [receive] via NET/AWS Libfabric/1 +gpua055:163209:163271 [0] NCCL INFO Channel 00/0 : 44[7000] -> 40[7000] [receive] via NET/AWS Libfabric/1 +gpua055:163209:163271 [0] NCCL INFO Channel 01/0 : 40[7000] -> 37[46000] [send] via NET/AWS Libfabric/1 +gpua055:163209:163271 [0] NCCL INFO Connected all trees +gpua055:163209:163271 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua038:223221:223304 [1] NCCL INFO Channel 00/0 : 8[7000] -> 17[46000] [receive] via NET/AWS Libfabric/1 +gpua038:223221:223304 [1] NCCL INFO Channel 00/0 : 17[46000] -> 8[7000] [send] via NET/AWS Libfabric/1 +gpua038:223221:223304 [1] NCCL INFO Channel 00/0 : 17[46000] -> 16[7000] via P2P/IPC/read +gpua038:223221:223304 [1] NCCL INFO Channel 01/0 : 17[46000] -> 16[7000] via P2P/IPC/read +gpua038:223221:223304 [1] NCCL INFO Connected all trees +gpua038:223221:223304 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua038:223221:223304 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua038:223221:223304 [1] NCCL INFO comm 0x55955847d1d0 rank 17 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua016:626221:626292 [3] NCCL INFO Connected all rings +gpua016:626221:626292 [3] NCCL INFO Channel 00/0 : 11[c7000] -> 10[85000] via P2P/IPC/read +gpua016:626221:626292 [3] NCCL INFO Channel 01/0 : 11[c7000] -> 10[85000] via P2P/IPC/read +gpua016:626221:626292 [3] NCCL INFO Connected all trees +gpua016:626221:626292 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua016:626221:626292 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua016:626221:626292 [3] NCCL INFO comm 0x55ed25fa81b0 rank 11 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua085:4061402:4061983 [0] NCCL INFO Channel 01/0 : 55[c7000] -> 56[7000] [receive] via NET/AWS Libfabric/1 +gpua085:4061402:4061983 [0] NCCL INFO Channel 00/0 : 56[7000] -> 57[46000] via P2P/IPC/read +gpua085:4061402:4061983 [0] NCCL INFO Channel 01/0 : 56[7000] -> 57[46000] via P2P/IPC/read +gpua085:4061402:4061983 [0] NCCL INFO Connected all rings +gpua085:4061402:4061983 [0] NCCL INFO Channel 01/0 : 53[46000] -> 56[7000] [receive] via NET/AWS Libfabric/1 +gpua085:4061402:4061983 [0] NCCL INFO Channel 00/0 : 56[7000] -> 60[7000] [send] via NET/AWS Libfabric/1 +gpua085:4061402:4061983 [0] NCCL INFO Channel 00/0 : 48[7000] -> 56[7000] [receive] via NET/AWS Libfabric/1 +gpua085:4061402:4061983 [0] NCCL INFO Channel 00/0 : 56[7000] -> 48[7000] [send] via NET/AWS Libfabric/1 +gpua085:4061402:4061983 [0] NCCL INFO Channel 00/0 : 60[7000] -> 56[7000] [receive] via NET/AWS Libfabric/1 +gpua085:4061402:4061983 [0] NCCL INFO Channel 01/0 : 56[7000] -> 53[46000] [send] via NET/AWS Libfabric/1 +gpua080:3313502:3313583 [2] NCCL INFO Connected all rings +gpua080:3313502:3313583 [2] NCCL INFO Channel 00/0 : 54[85000] -> 53[46000] via P2P/IPC/read +gpua080:3313502:3313583 [2] NCCL INFO Channel 01/0 : 54[85000] -> 53[46000] via P2P/IPC/read +gpua080:3313502:3313583 [2] NCCL INFO Connected all trees +gpua080:3313502:3313583 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua080:3313502:3313583 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua080:3313502:3313583 [2] NCCL INFO comm 0x5621698f8050 rank 54 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua089:894161:894234 [2] NCCL INFO Channel 00/0 : 62[85000] -> 61[46000] via P2P/IPC/read +gpua089:894161:894234 [2] NCCL INFO Channel 01/0 : 62[85000] -> 61[46000] via P2P/IPC/read +gpua089:894161:894234 [2] NCCL INFO Connected all trees +gpua089:894161:894234 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua089:894161:894234 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua089:894161:894234 [2] NCCL INFO comm 0x5616714af160 rank 62 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua040:3903230:3903230 [0] NCCL INFO cudaDriverVersion 12020 +gpua040:3903230:3903230 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.40<0> +gpua040:3903230:3903230 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua040:3903230:3903230 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua040:3903230:3903301 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua040:3903230:3903301 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua040:3903230:3903301 [0] NCCL INFO Using network AWS Libfabric +gpua040:3903230:3903301 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua040:3903230:3903301 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua040:3903230:3903301 [0] NCCL INFO Trees [0] 25/28/-1->24->16 [1] 25/-1/-1->24->21 +gpua040:3903230:3903301 [0] NCCL INFO Channel 00/0 : 23[c7000] -> 24[7000] [receive] via NET/AWS Libfabric/1 +gpua042:289286:289286 [2] NCCL INFO cudaDriverVersion 12020 +gpua042:289286:289286 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.42<0> +gpua042:289286:289286 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua042:289286:289286 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua042:289286:289372 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua042:289286:289372 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua042:289286:289372 [2] NCCL INFO Using network AWS Libfabric +gpua042:289286:289372 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua042:289286:289372 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua042:289286:289372 [2] NCCL INFO Trees [0] 31/-1/-1->30->29 [1] 31/-1/-1->30->29 +gpua042:289286:289372 [2] NCCL INFO Channel 00/0 : 30[85000] -> 31[c7000] via P2P/IPC/read +gpua042:289286:289372 [2] NCCL INFO Channel 01/0 : 30[85000] -> 31[c7000] via P2P/IPC/read +gpua042:289286:289372 [2] NCCL INFO Connected all rings +gpua006:720655:720655 [1] NCCL INFO cudaDriverVersion 12020 +gpua006:720655:720655 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.6<0> +gpua006:720655:720655 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua006:720655:720655 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua006:720655:720835 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua006:720655:720835 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua006:720655:720835 [1] NCCL INFO Using network AWS Libfabric +gpua006:720655:720835 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua006:720655:720835 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua006:720655:720835 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +gpua006:720655:720835 [1] NCCL INFO Channel 00/0 : 1[46000] -> 2[85000] via P2P/IPC/read +gpua006:720655:720835 [1] NCCL INFO Channel 01/0 : 1[46000] -> 2[85000] via P2P/IPC/read +gpua006:720655:720835 [1] NCCL INFO Connected all rings +gpua012:2768199:2768199 [0] NCCL INFO cudaDriverVersion 12020 +gpua012:2768199:2768199 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.12<0> +gpua012:2768199:2768199 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua012:2768199:2768199 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua012:2768199:2768275 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua012:2768199:2768275 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua012:2768199:2768275 [0] NCCL INFO Using network AWS Libfabric +gpua012:2768199:2768275 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua012:2768199:2768275 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua012:2768199:2768275 [0] NCCL INFO Trees [0] 5/-1/-1->4->9 [1] 5/0/-1->4->12 +gpua012:2768199:2768275 [0] NCCL INFO Channel 00/0 : 3[c7000] -> 4[7000] [receive] via NET/AWS Libfabric/1 +gpua012:2768199:2768275 [0] NCCL INFO Channel 01/0 : 3[c7000] -> 4[7000] [receive] via NET/AWS Libfabric/1 +gpua039:3705385:3705385 [1] NCCL INFO cudaDriverVersion 12020 +gpua039:3705385:3705385 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.39<0> +gpua039:3705385:3705385 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua039:3705385:3705385 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua039:3705385:3705443 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua039:3705385:3705443 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua039:3705385:3705443 [1] NCCL INFO Using network AWS Libfabric +gpua039:3705385:3705443 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua039:3705385:3705443 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua039:3705385:3705443 [1] NCCL INFO Trees [0] 22/-1/-1->21->20 [1] 22/24/-1->21->20 +gpua039:3705385:3705443 [1] NCCL INFO Channel 00/0 : 21[46000] -> 22[85000] via P2P/IPC/read +gpua039:3705385:3705443 [1] NCCL INFO Channel 01/0 : 21[46000] -> 22[85000] via P2P/IPC/read +gpua033:1688529:1688529 [1] NCCL INFO cudaDriverVersion 12020 +gpua033:1688529:1688529 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.33<0> +gpua033:1688529:1688529 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua033:1688529:1688529 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua033:1688529:1688607 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua033:1688529:1688607 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua033:1688529:1688607 [1] NCCL INFO Using network AWS Libfabric +gpua033:1688529:1688607 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua033:1688529:1688607 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua033:1688529:1688607 [1] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/20/-1->13->12 +gpua033:1688529:1688607 [1] NCCL INFO Channel 00/0 : 13[46000] -> 14[85000] via P2P/IPC/read +gpua033:1688529:1688607 [1] NCCL INFO Channel 01/0 : 13[46000] -> 14[85000] via P2P/IPC/read +gpua054:464653:464653 [0] NCCL INFO cudaDriverVersion 12020 +gpua054:464653:464653 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.54<0> +gpua054:464653:464653 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua054:464653:464653 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua054:464653:464735 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua054:464653:464735 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua054:464653:464735 [0] NCCL INFO Using network AWS Libfabric +gpua054:464653:464735 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua054:464653:464735 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua054:464653:464735 [0] NCCL INFO Trees [0] 37/-1/-1->36->41 [1] 37/32/-1->36->44 +gpua054:464653:464735 [0] NCCL INFO Channel 00/0 : 35[c7000] -> 36[7000] [receive] via NET/AWS Libfabric/1 +gpua054:464653:464735 [0] NCCL INFO Channel 01/0 : 35[c7000] -> 36[7000] [receive] via NET/AWS Libfabric/1 +gpua079:3737286:3737286 [3] NCCL INFO cudaDriverVersion 12020 +gpua079:3737286:3737286 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.79<0> +gpua079:3737286:3737286 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua079:3737286:3737286 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua079:3737286:3737366 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua079:3737286:3737366 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua079:3737286:3737366 [3] NCCL INFO Using network AWS Libfabric +gpua079:3737286:3737366 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua079:3737286:3737366 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua079:3737286:3737366 [3] NCCL INFO Trees [0] -1/-1/-1->51->50 [1] -1/-1/-1->51->50 +gpua079:3737286:3737366 [3] NCCL INFO Channel 00/0 : 51[c7000] -> 52[7000] [send] via NET/AWS Libfabric/1 +gpua079:3737286:3737366 [3] NCCL INFO Channel 01/0 : 51[c7000] -> 52[7000] [send] via NET/AWS Libfabric/1 +gpua049:4092059:4092139 [0] NCCL INFO Connected all trees +gpua049:4092059:4092139 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua049:4092059:4092139 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua049:4092059:4092139 [0] NCCL INFO comm 0x558f8d29bc50 rank 32 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua057:3929636:3929636 [0] NCCL INFO cudaDriverVersion 12020 +gpua057:3929636:3929636 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.57<0> +gpua057:3929636:3929636 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua057:3929636:3929636 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua057:3929636:3929709 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua057:3929636:3929709 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua057:3929636:3929709 [0] NCCL INFO Using network AWS Libfabric +gpua057:3929636:3929709 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua057:3929636:3929709 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua057:3929636:3929709 [0] NCCL INFO Trees [0] 45/-1/-1->44->40 [1] 45/36/-1->44->29 +gpua057:3929636:3929709 [0] NCCL INFO Channel 00/0 : 43[c7000] -> 44[7000] [receive] via NET/AWS Libfabric/1 +gpua055:163209:163271 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua055:163209:163271 [0] NCCL INFO comm 0x559d73f55d20 rank 40 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua038:223220:223220 [0] NCCL INFO cudaDriverVersion 12020 +gpua038:223220:223220 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.38<0> +gpua038:223220:223220 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua038:223220:223220 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua038:223220:223302 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua038:223220:223302 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua038:223220:223302 [0] NCCL INFO Using network AWS Libfabric +gpua038:223220:223302 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua038:223220:223302 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua038:223220:223302 [0] NCCL INFO Trees [0] 17/24/-1->16->33 [1] 17/-1/-1->16->20 +gpua038:223220:223302 [0] NCCL INFO Channel 00/0 : 15[c7000] -> 16[7000] [receive] via NET/AWS Libfabric/1 +gpua038:223220:223302 [0] NCCL INFO Channel 01/0 : 15[c7000] -> 16[7000] [receive] via NET/AWS Libfabric/1 +gpua016:626218:626218 [0] NCCL INFO cudaDriverVersion 12020 +gpua016:626218:626218 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.16<0> +gpua016:626218:626218 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua016:626218:626218 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua016:626218:626295 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua016:626218:626295 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua016:626218:626295 [0] NCCL INFO Using network AWS Libfabric +gpua016:626218:626295 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua016:626218:626295 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua016:626218:626295 [0] NCCL INFO Trees [0] 9/12/-1->8->17 [1] 9/-1/-1->8->5 +gpua016:626218:626295 [0] NCCL INFO Channel 00/0 : 7[c7000] -> 8[7000] [receive] via NET/AWS Libfabric/1 +gpua016:626218:626295 [0] NCCL INFO Channel 01/0 : 7[c7000] -> 8[7000] [receive] via NET/AWS Libfabric/1 +gpua085:4061402:4061983 [0] NCCL INFO Connected all trees +gpua085:4061402:4061983 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua085:4061402:4061983 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua085:4061402:4061983 [0] NCCL INFO comm 0x559940c55b80 rank 56 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua080:3313500:3313500 [0] NCCL INFO cudaDriverVersion 12020 +gpua080:3313500:3313500 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.80<0> +gpua080:3313500:3313500 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua080:3313500:3313500 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua080:3313500:3313584 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua080:3313500:3313584 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua080:3313500:3313584 [0] NCCL INFO Using network AWS Libfabric +gpua080:3313500:3313584 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua080:3313500:3313584 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua080:3313500:3313584 [0] NCCL INFO Trees [0] 53/-1/-1->52->57 [1] 53/48/-1->52->45 +gpua080:3313500:3313584 [0] NCCL INFO Channel 00/0 : 51[c7000] -> 52[7000] [receive] via NET/AWS Libfabric/1 +gpua089:894159:894159 [0] NCCL INFO cudaDriverVersion 12020 +gpua089:894159:894159 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.89<0> +gpua089:894159:894159 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua089:894159:894159 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua089:894159:894232 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua089:894159:894232 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua089:894159:894232 [0] NCCL INFO Using network AWS Libfabric +gpua089:894159:894232 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua089:894159:894232 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua089:894159:894232 [0] NCCL INFO Trees [0] 61/-1/-1->60->56 [1] 61/28/-1->60->-1 +gpua089:894159:894232 [0] NCCL INFO Channel 00/0 : 59[c7000] -> 60[7000] [receive] via NET/AWS Libfabric/1 +gpua089:894159:894232 [0] NCCL INFO Channel 01/0 : 59[c7000] -> 60[7000] [receive] via NET/AWS Libfabric/1 +gpua040:3903230:3903301 [0] NCCL INFO Channel 01/0 : 23[c7000] -> 24[7000] [receive] via NET/AWS Libfabric/1 +gpua040:3903230:3903301 [0] NCCL INFO Channel 00/0 : 24[7000] -> 25[46000] via P2P/IPC/read +gpua040:3903230:3903301 [0] NCCL INFO Channel 01/0 : 24[7000] -> 25[46000] via P2P/IPC/read +gpua040:3903230:3903301 [0] NCCL INFO Connected all rings +gpua040:3903230:3903301 [0] NCCL INFO Channel 01/0 : 21[46000] -> 24[7000] [receive] via NET/AWS Libfabric/1 +gpua040:3903230:3903301 [0] NCCL INFO Channel 00/0 : 24[7000] -> 28[7000] [send] via NET/AWS Libfabric/1 +gpua040:3903230:3903301 [0] NCCL INFO Channel 00/0 : 16[7000] -> 24[7000] [receive] via NET/AWS Libfabric/1 +gpua040:3903230:3903301 [0] NCCL INFO Channel 00/0 : 24[7000] -> 16[7000] [send] via NET/AWS Libfabric/1 +gpua040:3903230:3903301 [0] NCCL INFO Channel 00/0 : 28[7000] -> 24[7000] [receive] via NET/AWS Libfabric/1 +gpua040:3903230:3903301 [0] NCCL INFO Channel 01/0 : 24[7000] -> 21[46000] [send] via NET/AWS Libfabric/1 +gpua042:289286:289372 [2] NCCL INFO Channel 00/0 : 30[85000] -> 29[46000] via P2P/IPC/read +gpua042:289286:289372 [2] NCCL INFO Channel 01/0 : 30[85000] -> 29[46000] via P2P/IPC/read +gpua042:289286:289372 [2] NCCL INFO Connected all trees +gpua042:289286:289372 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua042:289286:289372 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua042:289286:289372 [2] NCCL INFO comm 0x55f40dd0d450 rank 30 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua006:720655:720835 [1] NCCL INFO Channel 00/0 : 1[46000] -> 0[7000] via P2P/IPC/read +gpua006:720655:720835 [1] NCCL INFO Channel 01/0 : 1[46000] -> 0[7000] via P2P/IPC/read +gpua006:720655:720835 [1] NCCL INFO Connected all trees +gpua006:720655:720835 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua006:720655:720835 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua006:720655:720835 [1] NCCL INFO comm 0x55e9f108d2d0 rank 1 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua006:720657:720657 [3] NCCL INFO cudaDriverVersion 12020 +gpua006:720657:720657 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.6<0> +gpua006:720657:720657 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua006:720657:720657 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua006:720657:720834 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua006:720657:720834 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua012:2768199:2768275 [0] NCCL INFO Channel 00/0 : 4[7000] -> 5[46000] via P2P/IPC/read +gpua012:2768199:2768275 [0] NCCL INFO Channel 01/0 : 4[7000] -> 5[46000] via P2P/IPC/read +gpua012:2768199:2768275 [0] NCCL INFO Connected all rings +gpua012:2768199:2768275 [0] NCCL INFO Channel 01/0 : 0[7000] -> 4[7000] [receive] via NET/AWS Libfabric/1 +gpua012:2768199:2768275 [0] NCCL INFO Channel 00/0 : 4[7000] -> 9[46000] [send] via NET/AWS Libfabric/1 +gpua012:2768199:2768275 [0] NCCL INFO Channel 01/0 : 4[7000] -> 12[7000] [send] via NET/AWS Libfabric/1 +gpua012:2768199:2768275 [0] NCCL INFO Channel 01/0 : 12[7000] -> 4[7000] [receive] via NET/AWS Libfabric/1 +gpua012:2768199:2768275 [0] NCCL INFO Channel 00/0 : 9[46000] -> 4[7000] [receive] via NET/AWS Libfabric/1 +gpua012:2768199:2768275 [0] NCCL INFO Channel 01/0 : 4[7000] -> 0[7000] [send] via NET/AWS Libfabric/1 +gpua012:2768199:2768275 [0] NCCL INFO Connected all trees +gpua012:2768199:2768275 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua039:3705385:3705443 [1] NCCL INFO Connected all rings +gpua039:3705385:3705443 [1] NCCL INFO Channel 01/0 : 21[46000] -> 24[7000] [send] via NET/AWS Libfabric/1 +gpua039:3705385:3705443 [1] NCCL INFO Channel 01/0 : 24[7000] -> 21[46000] [receive] via NET/AWS Libfabric/1 +gpua039:3705385:3705443 [1] NCCL INFO Channel 00/0 : 21[46000] -> 20[7000] via P2P/IPC/read +gpua039:3705385:3705443 [1] NCCL INFO Channel 01/0 : 21[46000] -> 20[7000] via P2P/IPC/read +gpua039:3705385:3705443 [1] NCCL INFO Connected all trees +gpua039:3705385:3705443 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua039:3705385:3705443 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua039:3705385:3705443 [1] NCCL INFO comm 0x55cd51585fb0 rank 21 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua033:1688529:1688607 [1] NCCL INFO Connected all rings +gpua033:1688529:1688607 [1] NCCL INFO Channel 01/0 : 13[46000] -> 20[7000] [send] via NET/AWS Libfabric/1 +gpua033:1688529:1688607 [1] NCCL INFO Channel 01/0 : 20[7000] -> 13[46000] [receive] via NET/AWS Libfabric/1 +gpua033:1688529:1688607 [1] NCCL INFO Channel 00/0 : 13[46000] -> 12[7000] via P2P/IPC/read +gpua033:1688529:1688607 [1] NCCL INFO Channel 01/0 : 13[46000] -> 12[7000] via P2P/IPC/read +gpua033:1688529:1688607 [1] NCCL INFO Connected all trees +gpua033:1688529:1688607 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua033:1688529:1688607 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua033:1688529:1688607 [1] NCCL INFO comm 0x565309712f90 rank 13 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua054:464653:464735 [0] NCCL INFO Channel 00/0 : 36[7000] -> 37[46000] via P2P/IPC/read +gpua054:464653:464735 [0] NCCL INFO Channel 01/0 : 36[7000] -> 37[46000] via P2P/IPC/read +gpua054:464653:464735 [0] NCCL INFO Connected all rings +gpua054:464653:464735 [0] NCCL INFO Channel 01/0 : 32[7000] -> 36[7000] [receive] via NET/AWS Libfabric/1 +gpua054:464653:464735 [0] NCCL INFO Channel 00/0 : 36[7000] -> 41[46000] [send] via NET/AWS Libfabric/1 +gpua054:464653:464735 [0] NCCL INFO Channel 01/0 : 36[7000] -> 44[7000] [send] via NET/AWS Libfabric/1 +gpua054:464653:464735 [0] NCCL INFO Channel 01/0 : 44[7000] -> 36[7000] [receive] via NET/AWS Libfabric/1 +gpua054:464653:464735 [0] NCCL INFO Channel 00/0 : 41[46000] -> 36[7000] [receive] via NET/AWS Libfabric/1 +gpua054:464653:464735 [0] NCCL INFO Channel 01/0 : 36[7000] -> 32[7000] [send] via NET/AWS Libfabric/1 +gpua054:464653:464735 [0] NCCL INFO Connected all trees +gpua054:464653:464735 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua079:3737286:3737366 [3] NCCL INFO Connected all rings +gpua079:3737286:3737366 [3] NCCL INFO Channel 00/0 : 51[c7000] -> 50[85000] via P2P/IPC/read +gpua079:3737286:3737366 [3] NCCL INFO Channel 01/0 : 51[c7000] -> 50[85000] via P2P/IPC/read +gpua079:3737286:3737366 [3] NCCL INFO Connected all trees +gpua079:3737286:3737366 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua079:3737286:3737366 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua079:3737286:3737366 [3] NCCL INFO comm 0x55ec33f3ca60 rank 51 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua049:4092062:4092062 [3] NCCL INFO cudaDriverVersion 12020 +gpua049:4092062:4092062 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.49<0> +gpua049:4092062:4092062 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua049:4092062:4092062 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua049:4092062:4092142 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua049:4092062:4092142 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua049:4092062:4092142 [3] NCCL INFO Using network AWS Libfabric +gpua049:4092062:4092142 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua049:4092062:4092142 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua049:4092062:4092142 [3] NCCL INFO Trees [0] -1/-1/-1->35->34 [1] -1/-1/-1->35->34 +gpua049:4092062:4092142 [3] NCCL INFO Channel 00/0 : 35[c7000] -> 36[7000] [send] via NET/AWS Libfabric/1 +gpua049:4092062:4092142 [3] NCCL INFO Channel 01/0 : 35[c7000] -> 36[7000] [send] via NET/AWS Libfabric/1 +gpua057:3929636:3929709 [0] NCCL INFO Channel 01/0 : 43[c7000] -> 44[7000] [receive] via NET/AWS Libfabric/1 +gpua057:3929636:3929709 [0] NCCL INFO Channel 00/0 : 44[7000] -> 45[46000] via P2P/IPC/read +gpua057:3929636:3929709 [0] NCCL INFO Channel 01/0 : 44[7000] -> 45[46000] via P2P/IPC/read +gpua057:3929636:3929709 [0] NCCL INFO Connected all rings +gpua057:3929636:3929709 [0] NCCL INFO Channel 00/0 : 40[7000] -> 44[7000] [receive] via NET/AWS Libfabric/1 +gpua057:3929636:3929709 [0] NCCL INFO Channel 01/0 : 36[7000] -> 44[7000] [receive] via NET/AWS Libfabric/1 +gpua057:3929636:3929709 [0] NCCL INFO Channel 01/0 : 29[46000] -> 44[7000] [receive] via NET/AWS Libfabric/1 +gpua057:3929636:3929709 [0] NCCL INFO Channel 01/0 : 44[7000] -> 29[46000] [send] via NET/AWS Libfabric/1 +gpua057:3929636:3929709 [0] NCCL INFO Channel 01/0 : 44[7000] -> 36[7000] [send] via NET/AWS Libfabric/1 +gpua057:3929636:3929709 [0] NCCL INFO Channel 00/0 : 44[7000] -> 40[7000] [send] via NET/AWS Libfabric/1 +gpua055:163210:163210 [1] NCCL INFO cudaDriverVersion 12020 +gpua055:163210:163210 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.55<0> +gpua055:163210:163210 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua055:163210:163210 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua055:163210:163273 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua055:163210:163273 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua055:163210:163273 [1] NCCL INFO Using network AWS Libfabric +gpua055:163210:163273 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua055:163210:163273 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua055:163210:163273 [1] NCCL INFO Trees [0] 42/36/-1->41->40 [1] 42/-1/-1->41->40 +gpua055:163210:163273 [1] NCCL INFO Channel 00/0 : 41[46000] -> 42[85000] via P2P/IPC/read +gpua055:163210:163273 [1] NCCL INFO Channel 01/0 : 41[46000] -> 42[85000] via P2P/IPC/read +gpua055:163210:163273 [1] NCCL INFO Connected all rings +gpua038:223220:223302 [0] NCCL INFO Channel 00/0 : 16[7000] -> 17[46000] via P2P/IPC/read +gpua038:223220:223302 [0] NCCL INFO Channel 01/0 : 16[7000] -> 17[46000] via P2P/IPC/read +gpua038:223220:223302 [0] NCCL INFO Connected all rings +gpua038:223220:223302 [0] NCCL INFO Channel 01/0 : 16[7000] -> 20[7000] [send] via NET/AWS Libfabric/1 +gpua038:223220:223302 [0] NCCL INFO Channel 00/0 : 16[7000] -> 24[7000] [send] via NET/AWS Libfabric/1 +gpua038:223220:223302 [0] NCCL INFO Channel 00/0 : 16[7000] -> 33[46000] [send] via NET/AWS Libfabric/1 +gpua038:223220:223302 [0] NCCL INFO Channel 00/0 : 33[46000] -> 16[7000] [receive] via NET/AWS Libfabric/1 +gpua038:223220:223302 [0] NCCL INFO Channel 00/0 : 24[7000] -> 16[7000] [receive] via NET/AWS Libfabric/1 +gpua038:223220:223302 [0] NCCL INFO Channel 01/0 : 20[7000] -> 16[7000] [receive] via NET/AWS Libfabric/1 +gpua038:223220:223302 [0] NCCL INFO Connected all trees +gpua038:223220:223302 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua016:626218:626295 [0] NCCL INFO Channel 00/0 : 8[7000] -> 9[46000] via P2P/IPC/read +gpua016:626218:626295 [0] NCCL INFO Channel 01/0 : 8[7000] -> 9[46000] via P2P/IPC/read +gpua016:626218:626295 [0] NCCL INFO Connected all rings +gpua016:626218:626295 [0] NCCL INFO Channel 01/0 : 5[46000] -> 8[7000] [receive] via NET/AWS Libfabric/1 +gpua016:626218:626295 [0] NCCL INFO Channel 00/0 : 8[7000] -> 12[7000] [send] via NET/AWS Libfabric/1 +gpua016:626218:626295 [0] NCCL INFO Channel 00/0 : 8[7000] -> 17[46000] [send] via NET/AWS Libfabric/1 +gpua016:626218:626295 [0] NCCL INFO Channel 00/0 : 17[46000] -> 8[7000] [receive] via NET/AWS Libfabric/1 +gpua016:626218:626295 [0] NCCL INFO Channel 00/0 : 12[7000] -> 8[7000] [receive] via NET/AWS Libfabric/1 +gpua016:626218:626295 [0] NCCL INFO Channel 01/0 : 8[7000] -> 5[46000] [send] via NET/AWS Libfabric/1 +gpua016:626218:626295 [0] NCCL INFO Connected all trees +gpua016:626218:626295 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua085:4061403:4061403 [1] NCCL INFO cudaDriverVersion 12020 +gpua085:4061403:4061403 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.85<0> +gpua085:4061403:4061403 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua085:4061403:4061403 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua085:4061403:4061986 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua085:4061403:4061986 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua085:4061403:4061986 [1] NCCL INFO Using network AWS Libfabric +gpua085:4061403:4061986 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua085:4061403:4061986 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua085:4061403:4061986 [1] NCCL INFO Trees [0] 58/52/-1->57->56 [1] 58/-1/-1->57->56 +gpua085:4061403:4061986 [1] NCCL INFO Channel 00/0 : 57[46000] -> 58[85000] via P2P/IPC/read +gpua085:4061403:4061986 [1] NCCL INFO Channel 01/0 : 57[46000] -> 58[85000] via P2P/IPC/read +gpua080:3313500:3313584 [0] NCCL INFO Channel 01/0 : 51[c7000] -> 52[7000] [receive] via NET/AWS Libfabric/1 +gpua080:3313500:3313584 [0] NCCL INFO Channel 00/0 : 52[7000] -> 53[46000] via P2P/IPC/read +gpua080:3313500:3313584 [0] NCCL INFO Channel 01/0 : 52[7000] -> 53[46000] via P2P/IPC/read +gpua080:3313500:3313584 [0] NCCL INFO Connected all rings +gpua080:3313500:3313584 [0] NCCL INFO Channel 01/0 : 48[7000] -> 52[7000] [receive] via NET/AWS Libfabric/1 +gpua080:3313500:3313584 [0] NCCL INFO Channel 00/0 : 52[7000] -> 57[46000] [send] via NET/AWS Libfabric/1 +gpua080:3313500:3313584 [0] NCCL INFO Channel 01/0 : 45[46000] -> 52[7000] [receive] via NET/AWS Libfabric/1 +gpua080:3313500:3313584 [0] NCCL INFO Channel 01/0 : 52[7000] -> 45[46000] [send] via NET/AWS Libfabric/1 +gpua080:3313500:3313584 [0] NCCL INFO Channel 00/0 : 57[46000] -> 52[7000] [receive] via NET/AWS Libfabric/1 +gpua080:3313500:3313584 [0] NCCL INFO Channel 01/0 : 52[7000] -> 48[7000] [send] via NET/AWS Libfabric/1 +gpua089:894159:894232 [0] NCCL INFO Channel 00/0 : 60[7000] -> 61[46000] via P2P/IPC/read +gpua089:894159:894232 [0] NCCL INFO Channel 01/0 : 60[7000] -> 61[46000] via P2P/IPC/read +gpua089:894159:894232 [0] NCCL INFO Connected all rings +gpua089:894159:894232 [0] NCCL INFO Channel 00/0 : 56[7000] -> 60[7000] [receive] via NET/AWS Libfabric/1 +gpua089:894159:894232 [0] NCCL INFO Channel 01/0 : 28[7000] -> 60[7000] [receive] via NET/AWS Libfabric/1 +gpua089:894159:894232 [0] NCCL INFO Channel 01/0 : 60[7000] -> 28[7000] [send] via NET/AWS Libfabric/1 +gpua089:894159:894232 [0] NCCL INFO Channel 00/0 : 60[7000] -> 56[7000] [send] via NET/AWS Libfabric/1 +gpua089:894159:894232 [0] NCCL INFO Connected all trees +gpua089:894159:894232 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua089:894159:894232 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua089:894159:894232 [0] NCCL INFO comm 0x55c49103b770 rank 60 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua040:3903230:3903301 [0] NCCL INFO Connected all trees +gpua040:3903230:3903301 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua040:3903230:3903301 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua040:3903230:3903301 [0] NCCL INFO comm 0x561dbd6e8d50 rank 24 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua040:3903233:3903233 [3] NCCL INFO cudaDriverVersion 12020 +gpua040:3903233:3903233 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.40<0> +gpua040:3903233:3903233 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua040:3903233:3903233 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua040:3903233:3903299 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua040:3903233:3903299 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua040:3903233:3903299 [3] NCCL INFO Using network AWS Libfabric +gpua040:3903233:3903299 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua042:289284:289284 [0] NCCL INFO cudaDriverVersion 12020 +gpua042:289284:289284 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.42<0> +gpua042:289284:289284 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua042:289284:289284 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua042:289284:289374 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua042:289284:289374 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua042:289284:289374 [0] NCCL INFO Using network AWS Libfabric +gpua042:289284:289374 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua042:289284:289374 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua042:289284:289374 [0] NCCL INFO Trees [0] 29/-1/-1->28->24 [1] 29/12/-1->28->60 +gpua042:289284:289374 [0] NCCL INFO Channel 00/0 : 27[c7000] -> 28[7000] [receive] via NET/AWS Libfabric/1 +gpua042:289284:289374 [0] NCCL INFO Channel 01/0 : 27[c7000] -> 28[7000] [receive] via NET/AWS Libfabric/1 +gpua006:720657:720834 [3] NCCL INFO Using network AWS Libfabric +gpua006:720657:720834 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua006:720657:720834 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua006:720657:720834 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 +gpua006:720657:720834 [3] NCCL INFO Channel 00/0 : 3[c7000] -> 4[7000] [send] via NET/AWS Libfabric/1 +gpua006:720657:720834 [3] NCCL INFO Channel 01/0 : 3[c7000] -> 4[7000] [send] via NET/AWS Libfabric/1 +gpua006:720657:720834 [3] NCCL INFO Connected all rings +gpua006:720657:720834 [3] NCCL INFO Channel 00/0 : 3[c7000] -> 2[85000] via P2P/IPC/read +gpua006:720657:720834 [3] NCCL INFO Channel 01/0 : 3[c7000] -> 2[85000] via P2P/IPC/read +gpua006:720657:720834 [3] NCCL INFO Connected all trees +gpua006:720657:720834 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua006:720657:720834 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua012:2768199:2768275 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua012:2768199:2768275 [0] NCCL INFO comm 0x563967a02f70 rank 4 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua039:3705384:3705384 [0] NCCL INFO cudaDriverVersion 12020 +gpua039:3705384:3705384 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.39<0> +gpua039:3705384:3705384 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua039:3705384:3705384 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua039:3705384:3705442 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua039:3705384:3705442 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua039:3705384:3705442 [0] NCCL INFO Using network AWS Libfabric +gpua039:3705384:3705442 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua039:3705384:3705442 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua039:3705384:3705442 [0] NCCL INFO Trees [0] 21/-1/-1->20->25 [1] 21/16/-1->20->13 +gpua039:3705384:3705442 [0] NCCL INFO Channel 00/0 : 19[c7000] -> 20[7000] [receive] via NET/AWS Libfabric/1 +gpua033:1688528:1688528 [0] NCCL INFO cudaDriverVersion 12020 +gpua033:1688528:1688528 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.33<0> +gpua033:1688528:1688528 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua033:1688528:1688528 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua033:1688528:1688610 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua033:1688528:1688610 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua033:1688528:1688610 [0] NCCL INFO Using network AWS Libfabric +gpua033:1688528:1688610 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua033:1688528:1688610 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua033:1688528:1688610 [0] NCCL INFO Trees [0] 13/-1/-1->12->8 [1] 13/4/-1->12->28 +gpua033:1688528:1688610 [0] NCCL INFO Channel 00/0 : 11[c7000] -> 12[7000] [receive] via NET/AWS Libfabric/1 +gpua054:464653:464735 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua054:464653:464735 [0] NCCL INFO comm 0x557604e61db0 rank 36 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua079:3737283:3737283 [0] NCCL INFO cudaDriverVersion 12020 +gpua079:3737283:3737283 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.79<0> +gpua079:3737283:3737283 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua079:3737283:3737283 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua079:3737283:3737364 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua079:3737283:3737364 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua079:3737283:3737364 [0] NCCL INFO Using network AWS Libfabric +gpua079:3737283:3737364 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua079:3737283:3737364 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua079:3737283:3737364 [0] NCCL INFO Trees [0] 49/56/-1->48->32 [1] 49/-1/-1->48->52 +gpua079:3737283:3737364 [0] NCCL INFO Channel 00/0 : 47[c7000] -> 48[7000] [receive] via NET/AWS Libfabric/1 +gpua049:4092062:4092142 [3] NCCL INFO Connected all rings +gpua049:4092062:4092142 [3] NCCL INFO Channel 00/0 : 35[c7000] -> 34[85000] via P2P/IPC/read +gpua049:4092062:4092142 [3] NCCL INFO Channel 01/0 : 35[c7000] -> 34[85000] via P2P/IPC/read +gpua049:4092062:4092142 [3] NCCL INFO Connected all trees +gpua049:4092062:4092142 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua049:4092062:4092142 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua049:4092062:4092142 [3] NCCL INFO comm 0x56116fcaeec0 rank 35 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua057:3929636:3929709 [0] NCCL INFO Connected all trees +gpua057:3929636:3929709 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua057:3929636:3929709 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua057:3929636:3929709 [0] NCCL INFO comm 0x55bf04cabbe0 rank 44 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua055:163210:163273 [1] NCCL INFO Channel 00/0 : 36[7000] -> 41[46000] [receive] via NET/AWS Libfabric/1 +gpua055:163210:163273 [1] NCCL INFO Channel 00/0 : 41[46000] -> 36[7000] [send] via NET/AWS Libfabric/1 +gpua055:163210:163273 [1] NCCL INFO Channel 00/0 : 41[46000] -> 40[7000] via P2P/IPC/read +gpua055:163210:163273 [1] NCCL INFO Channel 01/0 : 41[46000] -> 40[7000] via P2P/IPC/read +gpua055:163210:163273 [1] NCCL INFO Connected all trees +gpua055:163210:163273 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua055:163210:163273 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua055:163210:163273 [1] NCCL INFO comm 0x55a27b1c8990 rank 41 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua038:223220:223302 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua038:223220:223302 [0] NCCL INFO comm 0x55edc0686b20 rank 16 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua016:626218:626295 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua016:626218:626295 [0] NCCL INFO comm 0x55b59c4ad8a0 rank 8 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua016:626220:626220 [2] NCCL INFO cudaDriverVersion 12020 +gpua016:626220:626220 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.16<0> +gpua016:626220:626220 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua016:626220:626220 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua016:626220:626294 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua016:626220:626294 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua016:626220:626294 [2] NCCL INFO Using network AWS Libfabric +gpua016:626220:626294 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua016:626220:626294 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua016:626220:626294 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9 +gpua085:4061403:4061986 [1] NCCL INFO Connected all rings +gpua085:4061403:4061986 [1] NCCL INFO Channel 00/0 : 52[7000] -> 57[46000] [receive] via NET/AWS Libfabric/1 +gpua085:4061403:4061986 [1] NCCL INFO Channel 00/0 : 57[46000] -> 52[7000] [send] via NET/AWS Libfabric/1 +gpua085:4061403:4061986 [1] NCCL INFO Channel 00/0 : 57[46000] -> 56[7000] via P2P/IPC/read +gpua085:4061403:4061986 [1] NCCL INFO Channel 01/0 : 57[46000] -> 56[7000] via P2P/IPC/read +gpua085:4061403:4061986 [1] NCCL INFO Connected all trees +gpua085:4061403:4061986 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua085:4061403:4061986 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua085:4061403:4061986 [1] NCCL INFO comm 0x55c6a24c9d90 rank 57 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua080:3313500:3313584 [0] NCCL INFO Connected all trees +gpua080:3313500:3313584 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua080:3313500:3313584 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua080:3313500:3313584 [0] NCCL INFO comm 0x555ace99a880 rank 52 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua089:894160:894160 [1] NCCL INFO cudaDriverVersion 12020 +gpua089:894160:894160 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.89<0> +gpua089:894160:894160 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua089:894160:894160 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua089:894160:894235 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua089:894160:894235 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua089:894160:894235 [1] NCCL INFO Using network AWS Libfabric +gpua089:894160:894235 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua089:894160:894235 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua089:894160:894235 [1] NCCL INFO Trees [0] 62/-1/-1->61->60 [1] 62/-1/-1->61->60 +gpua089:894160:894235 [1] NCCL INFO Channel 00/0 : 61[46000] -> 62[85000] via P2P/IPC/read +gpua089:894160:894235 [1] NCCL INFO Channel 01/0 : 61[46000] -> 62[85000] via P2P/IPC/read +gpua089:894160:894235 [1] NCCL INFO Connected all rings +gpua040:3903233:3903299 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua040:3903233:3903299 [3] NCCL INFO Trees [0] -1/-1/-1->27->26 [1] -1/-1/-1->27->26 +gpua040:3903233:3903299 [3] NCCL INFO Channel 00/0 : 27[c7000] -> 28[7000] [send] via NET/AWS Libfabric/1 +gpua040:3903233:3903299 [3] NCCL INFO Channel 01/0 : 27[c7000] -> 28[7000] [send] via NET/AWS Libfabric/1 +gpua040:3903233:3903299 [3] NCCL INFO Connected all rings +gpua040:3903233:3903299 [3] NCCL INFO Channel 00/0 : 27[c7000] -> 26[85000] via P2P/IPC/read +gpua040:3903233:3903299 [3] NCCL INFO Channel 01/0 : 27[c7000] -> 26[85000] via P2P/IPC/read +gpua040:3903233:3903299 [3] NCCL INFO Connected all trees +gpua040:3903233:3903299 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua040:3903233:3903299 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua040:3903233:3903299 [3] NCCL INFO comm 0x556121261dd0 rank 27 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua042:289284:289374 [0] NCCL INFO Channel 00/0 : 28[7000] -> 29[46000] via P2P/IPC/read +gpua042:289284:289374 [0] NCCL INFO Channel 01/0 : 28[7000] -> 29[46000] via P2P/IPC/read +gpua042:289284:289374 [0] NCCL INFO Connected all rings +gpua042:289284:289374 [0] NCCL INFO Channel 00/0 : 24[7000] -> 28[7000] [receive] via NET/AWS Libfabric/1 +gpua042:289284:289374 [0] NCCL INFO Channel 01/0 : 12[7000] -> 28[7000] [receive] via NET/AWS Libfabric/1 +gpua042:289284:289374 [0] NCCL INFO Channel 01/0 : 60[7000] -> 28[7000] [receive] via NET/AWS Libfabric/1 +gpua042:289284:289374 [0] NCCL INFO Channel 01/0 : 28[7000] -> 60[7000] [send] via NET/AWS Libfabric/1 +gpua042:289284:289374 [0] NCCL INFO Channel 01/0 : 28[7000] -> 12[7000] [send] via NET/AWS Libfabric/1 +gpua042:289284:289374 [0] NCCL INFO Channel 00/0 : 28[7000] -> 24[7000] [send] via NET/AWS Libfabric/1 +gpua042:289284:289374 [0] NCCL INFO Connected all trees +gpua042:289284:289374 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua006:720657:720834 [3] NCCL INFO comm 0x562aa60a0f20 rank 3 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua012:2768201:2768201 [2] NCCL INFO cudaDriverVersion 12020 +gpua012:2768201:2768201 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.12<0> +gpua012:2768201:2768201 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua012:2768201:2768201 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua012:2768201:2768273 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua012:2768201:2768273 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua012:2768201:2768273 [2] NCCL INFO Using network AWS Libfabric +gpua012:2768201:2768273 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua012:2768201:2768273 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua012:2768201:2768273 [2] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 +gpua012:2768201:2768273 [2] NCCL INFO Channel 00/0 : 6[85000] -> 7[c7000] via P2P/IPC/read +gpua012:2768201:2768273 [2] NCCL INFO Channel 01/0 : 6[85000] -> 7[c7000] via P2P/IPC/read +gpua039:3705384:3705442 [0] NCCL INFO Channel 01/0 : 19[c7000] -> 20[7000] [receive] via NET/AWS Libfabric/1 +gpua039:3705384:3705442 [0] NCCL INFO Channel 00/0 : 20[7000] -> 21[46000] via P2P/IPC/read +gpua039:3705384:3705442 [0] NCCL INFO Channel 01/0 : 20[7000] -> 21[46000] via P2P/IPC/read +gpua039:3705384:3705442 [0] NCCL INFO Connected all rings +gpua039:3705384:3705442 [0] NCCL INFO Channel 01/0 : 16[7000] -> 20[7000] [receive] via NET/AWS Libfabric/1 +gpua039:3705384:3705442 [0] NCCL INFO Channel 00/0 : 20[7000] -> 25[46000] [send] via NET/AWS Libfabric/1 +gpua039:3705384:3705442 [0] NCCL INFO Channel 01/0 : 13[46000] -> 20[7000] [receive] via NET/AWS Libfabric/1 +gpua039:3705384:3705442 [0] NCCL INFO Channel 01/0 : 20[7000] -> 13[46000] [send] via NET/AWS Libfabric/1 +gpua039:3705384:3705442 [0] NCCL INFO Channel 00/0 : 25[46000] -> 20[7000] [receive] via NET/AWS Libfabric/1 +gpua039:3705384:3705442 [0] NCCL INFO Channel 01/0 : 20[7000] -> 16[7000] [send] via NET/AWS Libfabric/1 +gpua033:1688528:1688610 [0] NCCL INFO Channel 01/0 : 11[c7000] -> 12[7000] [receive] via NET/AWS Libfabric/1 +gpua033:1688528:1688610 [0] NCCL INFO Channel 00/0 : 12[7000] -> 13[46000] via P2P/IPC/read +gpua033:1688528:1688610 [0] NCCL INFO Channel 01/0 : 12[7000] -> 13[46000] via P2P/IPC/read +gpua033:1688528:1688610 [0] NCCL INFO Connected all rings +gpua033:1688528:1688610 [0] NCCL INFO Channel 00/0 : 8[7000] -> 12[7000] [receive] via NET/AWS Libfabric/1 +gpua033:1688528:1688610 [0] NCCL INFO Channel 01/0 : 4[7000] -> 12[7000] [receive] via NET/AWS Libfabric/1 +gpua033:1688528:1688610 [0] NCCL INFO Channel 01/0 : 12[7000] -> 28[7000] [send] via NET/AWS Libfabric/1 +gpua033:1688528:1688610 [0] NCCL INFO Channel 01/0 : 28[7000] -> 12[7000] [receive] via NET/AWS Libfabric/1 +gpua033:1688528:1688610 [0] NCCL INFO Channel 01/0 : 12[7000] -> 4[7000] [send] via NET/AWS Libfabric/1 +gpua033:1688528:1688610 [0] NCCL INFO Channel 00/0 : 12[7000] -> 8[7000] [send] via NET/AWS Libfabric/1 +gpua054:464656:464656 [3] NCCL INFO cudaDriverVersion 12020 +gpua054:464656:464656 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.54<0> +gpua054:464656:464656 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua054:464656:464656 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua054:464656:464737 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua054:464656:464737 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua054:464656:464737 [3] NCCL INFO Using network AWS Libfabric +gpua054:464656:464737 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua054:464656:464737 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua054:464656:464737 [3] NCCL INFO Trees [0] -1/-1/-1->39->38 [1] -1/-1/-1->39->38 +gpua054:464656:464737 [3] NCCL INFO Channel 00/0 : 39[c7000] -> 40[7000] [send] via NET/AWS Libfabric/1 +gpua054:464656:464737 [3] NCCL INFO Channel 01/0 : 39[c7000] -> 40[7000] [send] via NET/AWS Libfabric/1 +gpua079:3737283:3737364 [0] NCCL INFO Channel 01/0 : 47[c7000] -> 48[7000] [receive] via NET/AWS Libfabric/1 +gpua079:3737283:3737364 [0] NCCL INFO Channel 00/0 : 48[7000] -> 49[46000] via P2P/IPC/read +gpua079:3737283:3737364 [0] NCCL INFO Channel 01/0 : 48[7000] -> 49[46000] via P2P/IPC/read +gpua079:3737283:3737364 [0] NCCL INFO Connected all rings +gpua079:3737283:3737364 [0] NCCL INFO Channel 01/0 : 48[7000] -> 52[7000] [send] via NET/AWS Libfabric/1 +gpua079:3737283:3737364 [0] NCCL INFO Channel 00/0 : 48[7000] -> 56[7000] [send] via NET/AWS Libfabric/1 +gpua079:3737283:3737364 [0] NCCL INFO Channel 00/0 : 32[7000] -> 48[7000] [receive] via NET/AWS Libfabric/1 +gpua079:3737283:3737364 [0] NCCL INFO Channel 00/0 : 48[7000] -> 32[7000] [send] via NET/AWS Libfabric/1 +gpua079:3737283:3737364 [0] NCCL INFO Channel 00/0 : 56[7000] -> 48[7000] [receive] via NET/AWS Libfabric/1 +gpua079:3737283:3737364 [0] NCCL INFO Channel 01/0 : 52[7000] -> 48[7000] [receive] via NET/AWS Libfabric/1 +gpua049:4092060:4092060 [1] NCCL INFO cudaDriverVersion 12020 +gpua049:4092060:4092060 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.49<0> +gpua049:4092060:4092060 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua049:4092060:4092060 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua049:4092060:4092140 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua049:4092060:4092140 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua049:4092060:4092140 [1] NCCL INFO Using network AWS Libfabric +gpua049:4092060:4092140 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua049:4092060:4092140 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua049:4092060:4092140 [1] NCCL INFO Trees [0] 34/16/-1->33->32 [1] 34/-1/-1->33->32 +gpua049:4092060:4092140 [1] NCCL INFO Channel 00/0 : 33[46000] -> 34[85000] via P2P/IPC/read +gpua049:4092060:4092140 [1] NCCL INFO Channel 01/0 : 33[46000] -> 34[85000] via P2P/IPC/read +gpua057:3929637:3929637 [1] NCCL INFO cudaDriverVersion 12020 +gpua057:3929637:3929637 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.57<0> +gpua057:3929637:3929637 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua057:3929637:3929637 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua057:3929637:3929710 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua057:3929637:3929710 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua057:3929637:3929710 [1] NCCL INFO Using network AWS Libfabric +gpua057:3929637:3929710 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua057:3929637:3929710 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua057:3929637:3929710 [1] NCCL INFO Trees [0] 46/-1/-1->45->44 [1] 46/52/-1->45->44 +gpua057:3929637:3929710 [1] NCCL INFO Channel 00/0 : 45[46000] -> 46[85000] via P2P/IPC/read +gpua057:3929637:3929710 [1] NCCL INFO Channel 01/0 : 45[46000] -> 46[85000] via P2P/IPC/read +gpua038:223223:223223 [3] NCCL INFO cudaDriverVersion 12020 +gpua038:223223:223223 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.38<0> +gpua038:223223:223223 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua038:223223:223223 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua038:223223:223303 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua038:223223:223303 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua038:223223:223303 [3] NCCL INFO Using network AWS Libfabric +gpua038:223223:223303 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua038:223223:223303 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua038:223223:223303 [3] NCCL INFO Trees [0] -1/-1/-1->19->18 [1] -1/-1/-1->19->18 +gpua038:223223:223303 [3] NCCL INFO Channel 00/0 : 19[c7000] -> 20[7000] [send] via NET/AWS Libfabric/1 +gpua038:223223:223303 [3] NCCL INFO Channel 01/0 : 19[c7000] -> 20[7000] [send] via NET/AWS Libfabric/1 +gpua016:626220:626294 [2] NCCL INFO Channel 00/0 : 10[85000] -> 11[c7000] via P2P/IPC/read +gpua016:626220:626294 [2] NCCL INFO Channel 01/0 : 10[85000] -> 11[c7000] via P2P/IPC/read +gpua016:626220:626294 [2] NCCL INFO Connected all rings +gpua016:626220:626294 [2] NCCL INFO Channel 00/0 : 10[85000] -> 9[46000] via P2P/IPC/read +gpua016:626220:626294 [2] NCCL INFO Channel 01/0 : 10[85000] -> 9[46000] via P2P/IPC/read +gpua016:626220:626294 [2] NCCL INFO Connected all trees +gpua016:626220:626294 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua016:626220:626294 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua016:626220:626294 [2] NCCL INFO comm 0x564f2f398270 rank 10 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua085:4061405:4061405 [3] NCCL INFO cudaDriverVersion 12020 +gpua085:4061405:4061405 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.85<0> +gpua085:4061405:4061405 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua085:4061405:4061405 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua085:4061405:4061985 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua085:4061405:4061985 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua085:4061405:4061985 [3] NCCL INFO Using network AWS Libfabric +gpua085:4061405:4061985 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua085:4061405:4061985 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua085:4061405:4061985 [3] NCCL INFO Trees [0] -1/-1/-1->59->58 [1] -1/-1/-1->59->58 +gpua085:4061405:4061985 [3] NCCL INFO Channel 00/0 : 59[c7000] -> 60[7000] [send] via NET/AWS Libfabric/1 +gpua085:4061405:4061985 [3] NCCL INFO Channel 01/0 : 59[c7000] -> 60[7000] [send] via NET/AWS Libfabric/1 +gpua080:3313501:3313501 [1] NCCL INFO cudaDriverVersion 12020 +gpua080:3313501:3313501 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.80<0> +gpua080:3313501:3313501 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua080:3313501:3313501 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua080:3313501:3313582 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua080:3313501:3313582 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua080:3313501:3313582 [1] NCCL INFO Using network AWS Libfabric +gpua080:3313501:3313582 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua080:3313501:3313582 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua080:3313501:3313582 [1] NCCL INFO Trees [0] 54/-1/-1->53->52 [1] 54/56/-1->53->52 +gpua080:3313501:3313582 [1] NCCL INFO Channel 00/0 : 53[46000] -> 54[85000] via P2P/IPC/read +gpua080:3313501:3313582 [1] NCCL INFO Channel 01/0 : 53[46000] -> 54[85000] via P2P/IPC/read +gpua089:894160:894235 [1] NCCL INFO Channel 00/0 : 61[46000] -> 60[7000] via P2P/IPC/read +gpua089:894160:894235 [1] NCCL INFO Channel 01/0 : 61[46000] -> 60[7000] via P2P/IPC/read +gpua089:894160:894235 [1] NCCL INFO Connected all trees +gpua089:894160:894235 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua089:894160:894235 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua089:894160:894235 [1] NCCL INFO comm 0x55965bc9da10 rank 61 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua042:289284:289374 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua042:289284:289374 [0] NCCL INFO comm 0x5565d740ae90 rank 28 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua012:2768201:2768273 [2] NCCL INFO Connected all rings +gpua012:2768201:2768273 [2] NCCL INFO Channel 00/0 : 6[85000] -> 5[46000] via P2P/IPC/read +gpua012:2768201:2768273 [2] NCCL INFO Channel 01/0 : 6[85000] -> 5[46000] via P2P/IPC/read +gpua012:2768201:2768273 [2] NCCL INFO Connected all trees +gpua012:2768201:2768273 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua012:2768201:2768273 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua012:2768201:2768273 [2] NCCL INFO comm 0x5599632d7ee0 rank 6 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua039:3705384:3705442 [0] NCCL INFO Connected all trees +gpua039:3705384:3705442 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua039:3705384:3705442 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua039:3705384:3705442 [0] NCCL INFO comm 0x55fb9fb93190 rank 20 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua033:1688528:1688610 [0] NCCL INFO Connected all trees +gpua033:1688528:1688610 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua033:1688528:1688610 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua033:1688528:1688610 [0] NCCL INFO comm 0x55f7abf5f7e0 rank 12 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua054:464656:464737 [3] NCCL INFO Connected all rings +gpua054:464656:464737 [3] NCCL INFO Channel 00/0 : 39[c7000] -> 38[85000] via P2P/IPC/read +gpua054:464656:464737 [3] NCCL INFO Channel 01/0 : 39[c7000] -> 38[85000] via P2P/IPC/read +gpua054:464656:464737 [3] NCCL INFO Connected all trees +gpua054:464656:464737 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua054:464656:464737 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua054:464656:464737 [3] NCCL INFO comm 0x55d719875f60 rank 39 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua079:3737283:3737364 [0] NCCL INFO Connected all trees +gpua079:3737283:3737364 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua079:3737283:3737364 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua079:3737283:3737364 [0] NCCL INFO comm 0x55e2e1a83770 rank 48 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua049:4092060:4092140 [1] NCCL INFO Connected all rings +gpua049:4092060:4092140 [1] NCCL INFO Channel 00/0 : 16[7000] -> 33[46000] [receive] via NET/AWS Libfabric/1 +gpua049:4092060:4092140 [1] NCCL INFO Channel 00/0 : 33[46000] -> 16[7000] [send] via NET/AWS Libfabric/1 +gpua049:4092060:4092140 [1] NCCL INFO Channel 00/0 : 33[46000] -> 32[7000] via P2P/IPC/read +gpua049:4092060:4092140 [1] NCCL INFO Channel 01/0 : 33[46000] -> 32[7000] via P2P/IPC/read +gpua049:4092060:4092140 [1] NCCL INFO Connected all trees +gpua049:4092060:4092140 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua049:4092060:4092140 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua049:4092060:4092140 [1] NCCL INFO comm 0x55887d2e9880 rank 33 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua057:3929637:3929710 [1] NCCL INFO Connected all rings +gpua057:3929637:3929710 [1] NCCL INFO Channel 01/0 : 45[46000] -> 52[7000] [send] via NET/AWS Libfabric/1 +gpua057:3929637:3929710 [1] NCCL INFO Channel 01/0 : 52[7000] -> 45[46000] [receive] via NET/AWS Libfabric/1 +gpua057:3929637:3929710 [1] NCCL INFO Channel 00/0 : 45[46000] -> 44[7000] via P2P/IPC/read +gpua057:3929637:3929710 [1] NCCL INFO Channel 01/0 : 45[46000] -> 44[7000] via P2P/IPC/read +gpua057:3929637:3929710 [1] NCCL INFO Connected all trees +gpua057:3929637:3929710 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua057:3929637:3929710 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua057:3929637:3929710 [1] NCCL INFO comm 0x564156e6f410 rank 45 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua038:223223:223303 [3] NCCL INFO Connected all rings +gpua038:223223:223303 [3] NCCL INFO Channel 00/0 : 19[c7000] -> 18[85000] via P2P/IPC/read +gpua038:223223:223303 [3] NCCL INFO Channel 01/0 : 19[c7000] -> 18[85000] via P2P/IPC/read +gpua038:223223:223303 [3] NCCL INFO Connected all trees +gpua038:223223:223303 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua038:223223:223303 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua038:223223:223303 [3] NCCL INFO comm 0x55ba3cc654a0 rank 19 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua085:4061405:4061985 [3] NCCL INFO Connected all rings +gpua085:4061405:4061985 [3] NCCL INFO Channel 00/0 : 59[c7000] -> 58[85000] via P2P/IPC/read +gpua085:4061405:4061985 [3] NCCL INFO Channel 01/0 : 59[c7000] -> 58[85000] via P2P/IPC/read +gpua085:4061405:4061985 [3] NCCL INFO Connected all trees +gpua085:4061405:4061985 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua085:4061405:4061985 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua085:4061405:4061985 [3] NCCL INFO comm 0x55a417244cf0 rank 59 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua080:3313501:3313582 [1] NCCL INFO Connected all rings +gpua080:3313501:3313582 [1] NCCL INFO Channel 01/0 : 53[46000] -> 56[7000] [send] via NET/AWS Libfabric/1 +gpua080:3313501:3313582 [1] NCCL INFO Channel 01/0 : 56[7000] -> 53[46000] [receive] via NET/AWS Libfabric/1 +gpua080:3313501:3313582 [1] NCCL INFO Channel 00/0 : 53[46000] -> 52[7000] via P2P/IPC/read +gpua080:3313501:3313582 [1] NCCL INFO Channel 01/0 : 53[46000] -> 52[7000] via P2P/IPC/read +gpua080:3313501:3313582 [1] NCCL INFO Connected all trees +gpua080:3313501:3313582 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua080:3313501:3313582 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua080:3313501:3313582 [1] NCCL INFO comm 0x55fa186104b0 rank 53 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +[gpua006:0/64] 2024-02-11 12:56:49,881 (distributed:1027) INFO: Reducer buckets have been rebuilt in this iteration. +[gpua006:0/64] 2024-02-11 12:58:23,912 (trainer:756) INFO: 31epoch:train:1-100batch: iter_time=1.282, forward_time=0.216, loss_ctc=79.402, loss_interctc_layer6=91.192, loss_interctc_layer12=75.901, loss_interctc_layer15=69.673, loss_interctc_layer21=82.124, loss=79.658, backward_time=0.222, grad_norm=66.659, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.142, optim0_lr0=7.303e-05, train_time=5.150 +[gpua006:0/64] 2024-02-11 13:00:24,014 (trainer:756) INFO: 31epoch:train:101-200batch: iter_time=9.256e-05, forward_time=0.143, loss_ctc=90.702, loss_interctc_layer6=100.905, loss_interctc_layer12=84.413, loss_interctc_layer15=77.813, loss_interctc_layer21=93.709, loss=89.508, backward_time=0.206, grad_norm=79.018, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.140, optim0_lr0=7.303e-05, train_time=1.201 +[gpua006:0/64] 2024-02-11 13:02:31,809 (trainer:756) INFO: 31epoch:train:201-300batch: iter_time=9.662e-05, forward_time=0.141, loss_ctc=80.124, loss_interctc_layer6=87.350, loss_interctc_layer12=72.660, loss_interctc_layer15=66.857, loss_interctc_layer21=82.814, loss=77.961, backward_time=0.205, grad_norm=69.961, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.140, optim0_lr0=7.302e-05, train_time=1.278 +[gpua006:0/64] 2024-02-11 13:05:27,691 (trainer:756) INFO: 31epoch:train:301-400batch: iter_time=9.372e-05, forward_time=0.140, loss_ctc=80.682, loss_interctc_layer6=84.948, loss_interctc_layer12=70.876, loss_interctc_layer15=65.196, loss_interctc_layer21=83.852, loss=77.111, backward_time=0.201, grad_norm=77.051, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.140, optim0_lr0=7.301e-05, train_time=1.759 +[gpua006:0/64] 2024-02-11 13:08:09,060 (trainer:756) INFO: 31epoch:train:401-500batch: iter_time=1.002e-04, forward_time=0.140, loss_ctc=74.368, loss_interctc_layer6=90.609, loss_interctc_layer12=76.739, loss_interctc_layer15=71.286, loss_interctc_layer21=77.054, loss=78.011, backward_time=0.203, grad_norm=78.478, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.140, optim0_lr0=7.300e-05, train_time=1.613 +[gpua006:0/64] 2024-02-11 13:10:07,826 (trainer:756) INFO: 31epoch:train:501-600batch: iter_time=1.003e-04, forward_time=0.141, loss_ctc=61.446, loss_interctc_layer6=69.829, loss_interctc_layer12=57.774, loss_interctc_layer15=52.923, loss_interctc_layer21=63.476, loss=61.090, backward_time=0.207, grad_norm=66.379, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.140, optim0_lr0=7.299e-05, train_time=1.187 +[gpua006:0/64] 2024-02-11 13:12:16,157 (trainer:756) INFO: 31epoch:train:601-700batch: iter_time=9.850e-05, forward_time=0.142, loss_ctc=83.895, loss_interctc_layer6=91.476, loss_interctc_layer12=76.466, loss_interctc_layer15=70.385, loss_interctc_layer21=86.645, loss=81.773, backward_time=0.205, grad_norm=85.548, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.140, optim0_lr0=7.298e-05, train_time=1.283 +[gpua006:0/64] 2024-02-11 13:14:37,475 (trainer:756) INFO: 31epoch:train:701-800batch: iter_time=1.016e-04, forward_time=0.171, loss_ctc=67.632, loss_interctc_layer6=79.478, loss_interctc_layer12=65.316, loss_interctc_layer15=59.641, loss_interctc_layer21=69.875, loss=68.388, backward_time=0.216, grad_norm=67.442, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.142, optim0_lr0=7.298e-05, train_time=1.412 +[gpua006:0/64] 2024-02-11 13:16:36,244 (trainer:756) INFO: 31epoch:train:801-900batch: iter_time=1.056e-04, forward_time=0.207, loss_ctc=68.473, loss_interctc_layer6=84.084, loss_interctc_layer12=69.654, loss_interctc_layer15=63.760, loss_interctc_layer21=70.955, loss=71.385, backward_time=0.230, grad_norm=88.654, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.143, optim0_lr0=7.297e-05, train_time=1.189 +[gpua006:0/64] 2024-02-11 13:18:58,263 (trainer:756) INFO: 31epoch:train:901-1000batch: iter_time=1.072e-04, forward_time=0.141, loss_ctc=82.970, loss_interctc_layer6=84.204, loss_interctc_layer12=69.645, loss_interctc_layer15=63.792, loss_interctc_layer21=85.883, loss=77.299, backward_time=0.204, grad_norm=91.100, clip=100.000, loss_scale=3.529e+31, optim_step_time=0.140, optim0_lr0=7.296e-05, train_time=1.420 +[gpua006:0/64] 2024-02-11 13:21:11,461 (trainer:756) INFO: 31epoch:train:1001-1100batch: iter_time=9.252e-05, forward_time=0.140, loss_ctc=78.696, loss_interctc_layer6=82.437, loss_interctc_layer12=68.452, loss_interctc_layer15=62.848, loss_interctc_layer21=81.634, loss=74.813, backward_time=0.205, grad_norm=74.250, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.295e-05, train_time=1.332 +[gpua006:0/64] 2024-02-11 13:23:07,531 (trainer:756) INFO: 31epoch:train:1101-1200batch: iter_time=8.497e-05, forward_time=0.142, loss_ctc=67.309, loss_interctc_layer6=71.734, loss_interctc_layer12=59.438, loss_interctc_layer15=54.535, loss_interctc_layer21=69.829, loss=64.569, backward_time=0.207, grad_norm=67.615, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.294e-05, train_time=1.160 +[gpua006:0/64] 2024-02-11 13:24:37,578 (multiple_iter_factory:32) INFO: Building 1th iter-factory... +[gpua006:0/64] 2024-02-11 13:24:56,683 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-11 13:25:00,104 (abs_task:1660) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.9", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.9", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.9", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.9", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-11 13:25:00,104 (abs_task:1661) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.9, +[gpua006:0/64] 2024-02-11 13:25:00,108 (abs_task:1662) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257 +[gpua006:0/64] 2024-02-11 13:36:16,657 (trainer:756) INFO: 31epoch:train:1201-1300batch: iter_time=2.846, forward_time=0.142, loss_ctc=78.525, loss_interctc_layer6=83.314, loss_interctc_layer12=69.493, loss_interctc_layer15=63.961, loss_interctc_layer21=81.475, loss=75.354, backward_time=0.208, grad_norm=71.943, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.140, optim0_lr0=7.294e-05, train_time=7.891 +[gpua006:0/64] 2024-02-11 13:37:56,095 (trainer:756) INFO: 31epoch:train:1301-1400batch: iter_time=8.769e-05, forward_time=0.144, loss_ctc=78.823, loss_interctc_layer6=100.012, loss_interctc_layer12=83.188, loss_interctc_layer15=76.410, loss_interctc_layer21=81.274, loss=83.941, backward_time=0.209, grad_norm=76.950, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.293e-05, train_time=0.994 +[gpua006:0/64] 2024-02-11 13:40:12,855 (trainer:756) INFO: 31epoch:train:1401-1500batch: iter_time=8.416e-05, forward_time=0.142, loss_ctc=87.604, loss_interctc_layer6=90.411, loss_interctc_layer12=74.904, loss_interctc_layer15=68.586, loss_interctc_layer21=90.641, loss=82.429, backward_time=0.206, grad_norm=74.968, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.292e-05, train_time=1.367 +[gpua006:0/64] 2024-02-11 13:42:54,356 (trainer:756) INFO: 31epoch:train:1501-1600batch: iter_time=8.938e-05, forward_time=0.143, loss_ctc=74.186, loss_interctc_layer6=83.757, loss_interctc_layer12=69.665, loss_interctc_layer15=63.991, loss_interctc_layer21=76.992, loss=73.718, backward_time=0.207, grad_norm=94.228, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.291e-05, train_time=1.615 +[gpua006:0/64] 2024-02-11 13:45:43,988 (trainer:756) INFO: 31epoch:train:1601-1700batch: iter_time=8.588e-05, forward_time=0.143, loss_ctc=82.512, loss_interctc_layer6=93.733, loss_interctc_layer12=78.859, loss_interctc_layer15=72.617, loss_interctc_layer21=85.550, loss=82.654, backward_time=0.206, grad_norm=85.723, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.290e-05, train_time=1.696 +[gpua006:0/64] 2024-02-11 13:47:23,706 (trainer:756) INFO: 31epoch:train:1701-1800batch: iter_time=8.905e-05, forward_time=0.142, loss_ctc=67.928, loss_interctc_layer6=75.106, loss_interctc_layer12=62.866, loss_interctc_layer15=57.738, loss_interctc_layer21=69.951, loss=66.718, backward_time=0.208, grad_norm=68.724, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.290e-05, train_time=0.997 +[gpua006:0/64] 2024-02-11 13:49:21,653 (trainer:756) INFO: 31epoch:train:1801-1900batch: iter_time=8.744e-05, forward_time=0.145, loss_ctc=57.352, loss_interctc_layer6=73.286, loss_interctc_layer12=60.940, loss_interctc_layer15=55.940, loss_interctc_layer21=59.059, loss=61.315, backward_time=0.206, grad_norm=64.144, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.289e-05, train_time=1.179 +[gpua006:0/64] 2024-02-11 13:51:24,349 (trainer:756) INFO: 31epoch:train:1901-2000batch: iter_time=8.478e-05, forward_time=0.146, loss_ctc=85.723, loss_interctc_layer6=89.904, loss_interctc_layer12=74.516, loss_interctc_layer15=68.329, loss_interctc_layer21=88.813, loss=81.457, backward_time=0.208, grad_norm=72.798, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.288e-05, train_time=1.227 +[gpua006:0/64] 2024-02-11 13:53:39,124 (trainer:756) INFO: 31epoch:train:2001-2100batch: iter_time=8.133e-05, forward_time=0.223, loss_ctc=68.818, loss_interctc_layer6=85.362, loss_interctc_layer12=70.515, loss_interctc_layer15=64.458, loss_interctc_layer21=71.076, loss=72.046, backward_time=0.252, grad_norm=74.396, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.143, optim0_lr0=7.287e-05, train_time=1.347 +[gpua006:0/64] 2024-02-11 13:55:50,983 (trainer:756) INFO: 31epoch:train:2101-2200batch: iter_time=8.221e-05, forward_time=0.145, loss_ctc=68.141, loss_interctc_layer6=75.319, loss_interctc_layer12=62.231, loss_interctc_layer15=56.723, loss_interctc_layer21=70.619, loss=66.607, backward_time=0.209, grad_norm=67.965, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.286e-05, train_time=1.319 +[gpua006:0/64] 2024-02-11 13:58:17,585 (trainer:756) INFO: 31epoch:train:2201-2300batch: iter_time=8.564e-05, forward_time=0.142, loss_ctc=91.619, loss_interctc_layer6=94.816, loss_interctc_layer12=78.640, loss_interctc_layer15=72.067, loss_interctc_layer21=94.673, loss=86.363, backward_time=0.205, grad_norm=70.914, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.286e-05, train_time=1.466 +[gpua006:0/64] 2024-02-11 14:00:38,069 (trainer:756) INFO: 31epoch:train:2301-2400batch: iter_time=8.574e-05, forward_time=0.141, loss_ctc=70.742, loss_interctc_layer6=73.119, loss_interctc_layer12=60.535, loss_interctc_layer15=55.376, loss_interctc_layer21=73.666, loss=66.687, backward_time=0.205, grad_norm=75.859, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.285e-05, train_time=1.405 +[gpua006:0/64] 2024-02-11 14:02:41,924 (trainer:756) INFO: 31epoch:train:2401-2500batch: iter_time=8.440e-05, forward_time=0.141, loss_ctc=63.315, loss_interctc_layer6=70.284, loss_interctc_layer12=58.007, loss_interctc_layer15=53.033, loss_interctc_layer21=65.794, loss=62.086, backward_time=0.207, grad_norm=53.210, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.284e-05, train_time=1.238 +[gpua006:0/64] 2024-02-11 14:02:43,660 (multiple_iter_factory:32) INFO: Building 2th iter-factory... +[gpua006:0/64] 2024-02-11 14:03:02,440 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-11 14:03:05,877 (abs_task:1660) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.11", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.11", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.11", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.11", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-11 14:03:05,877 (abs_task:1661) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.11, +[gpua006:0/64] 2024-02-11 14:03:05,881 (abs_task:1662) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257 +[gpua006:0/64] 2024-02-11 14:12:11,906 (trainer:756) INFO: 31epoch:train:2501-2600batch: iter_time=1.314, forward_time=0.143, loss_ctc=77.832, loss_interctc_layer6=90.532, loss_interctc_layer12=75.192, loss_interctc_layer15=68.940, loss_interctc_layer21=80.316, loss=78.563, backward_time=0.209, grad_norm=62.513, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.283e-05, train_time=5.700 +[gpua006:0/64] 2024-02-11 14:14:02,309 (trainer:756) INFO: 31epoch:train:2601-2700batch: iter_time=7.602e-05, forward_time=0.145, loss_ctc=90.398, loss_interctc_layer6=99.255, loss_interctc_layer12=82.913, loss_interctc_layer15=76.317, loss_interctc_layer21=93.099, loss=88.397, backward_time=0.210, grad_norm=87.268, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.282e-05, train_time=1.104 +[gpua006:0/64] 2024-02-11 14:16:00,200 (trainer:756) INFO: 31epoch:train:2701-2800batch: iter_time=8.061e-05, forward_time=0.143, loss_ctc=78.753, loss_interctc_layer6=86.711, loss_interctc_layer12=71.968, loss_interctc_layer15=66.103, loss_interctc_layer21=81.465, loss=77.000, backward_time=0.209, grad_norm=80.779, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.282e-05, train_time=1.179 +[gpua006:0/64] 2024-02-11 14:17:57,393 (trainer:756) INFO: 31epoch:train:2801-2900batch: iter_time=8.676e-05, forward_time=0.143, loss_ctc=78.004, loss_interctc_layer6=83.373, loss_interctc_layer12=69.088, loss_interctc_layer15=63.299, loss_interctc_layer21=80.934, loss=74.940, backward_time=0.209, grad_norm=70.421, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.281e-05, train_time=1.172 +[gpua006:0/64] 2024-02-11 14:20:32,069 (trainer:756) INFO: 31epoch:train:2901-3000batch: iter_time=8.450e-05, forward_time=0.143, loss_ctc=73.067, loss_interctc_layer6=87.877, loss_interctc_layer12=74.192, loss_interctc_layer15=68.493, loss_interctc_layer21=75.788, loss=75.883, backward_time=0.208, grad_norm=83.155, clip=100.000, loss_scale=7.058e+31, optim_step_time=0.139, optim0_lr0=7.280e-05, train_time=1.547 +[gpua006:0/64] 2024-02-11 14:22:33,667 (trainer:756) INFO: 31epoch:train:3001-3100batch: iter_time=8.389e-05, forward_time=0.141, loss_ctc=62.101, loss_interctc_layer6=69.634, loss_interctc_layer12=57.490, loss_interctc_layer15=52.542, loss_interctc_layer21=64.237, loss=61.201, backward_time=0.208, grad_norm=50.496, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.139, optim0_lr0=7.279e-05, train_time=1.216 +[gpua006:0/64] 2024-02-11 14:24:59,669 (trainer:756) INFO: 31epoch:train:3101-3200batch: iter_time=8.449e-05, forward_time=0.143, loss_ctc=80.993, loss_interctc_layer6=88.804, loss_interctc_layer12=73.751, loss_interctc_layer15=67.685, loss_interctc_layer21=83.661, loss=78.979, backward_time=0.208, grad_norm=95.723, clip=100.000, loss_scale=8.113e+31, optim_step_time=0.140, optim0_lr0=7.278e-05, train_time=1.460 +[gpua006:0/64] 2024-02-11 14:25:41,998 (trainer:687) WARNING: The grad norm is nan. Skipping updating the model. +[gpua006:0/64] 2024-02-11 14:27:25,138 (trainer:756) INFO: 31epoch:train:3201-3300batch: iter_time=7.943e-05, forward_time=0.144, loss_ctc=67.058, loss_interctc_layer6=79.138, loss_interctc_layer12=64.810, loss_interctc_layer15=59.134, loss_interctc_layer21=69.307, loss=67.889, backward_time=0.208, grad_norm=67.118, clip=100.000, loss_scale=5.163e+31, optim_step_time=0.140, optim0_lr0=7.278e-05, train_time=1.454 +[gpua006:0/64] 2024-02-11 14:29:35,901 (trainer:756) INFO: 31epoch:train:3301-3400batch: iter_time=8.486e-05, forward_time=0.143, loss_ctc=68.197, loss_interctc_layer6=84.249, loss_interctc_layer12=69.871, loss_interctc_layer15=63.892, loss_interctc_layer21=70.357, loss=71.313, backward_time=0.208, grad_norm=73.450, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.140, optim0_lr0=7.277e-05, train_time=1.307 +[gpua006:0/64] 2024-02-11 14:31:20,941 (trainer:756) INFO: 31epoch:train:3401-3500batch: iter_time=8.675e-05, forward_time=0.142, loss_ctc=83.181, loss_interctc_layer6=83.568, loss_interctc_layer12=68.942, loss_interctc_layer15=63.038, loss_interctc_layer21=86.059, loss=76.958, backward_time=0.209, grad_norm=72.356, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.276e-05, train_time=1.050 +[gpua006:0/64] 2024-02-11 14:33:51,486 (trainer:756) INFO: 31epoch:train:3501-3600batch: iter_time=8.499e-05, forward_time=0.143, loss_ctc=76.298, loss_interctc_layer6=81.176, loss_interctc_layer12=67.459, loss_interctc_layer15=61.949, loss_interctc_layer21=79.306, loss=73.238, backward_time=0.208, grad_norm=102.753, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.140, optim0_lr0=7.275e-05, train_time=1.505 +[gpua006:0/64] 2024-02-11 14:35:52,103 (trainer:756) INFO: 31epoch:train:3601-3700batch: iter_time=8.116e-05, forward_time=0.142, loss_ctc=66.972, loss_interctc_layer6=71.032, loss_interctc_layer12=58.738, loss_interctc_layer15=53.779, loss_interctc_layer21=69.511, loss=64.006, backward_time=0.209, grad_norm=64.431, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.140, optim0_lr0=7.274e-05, train_time=1.206 +[gpua006:0/64] 2024-02-11 14:37:02,151 (multiple_iter_factory:32) INFO: Building 3th iter-factory... +[gpua006:0/64] 2024-02-11 14:37:21,087 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-11 14:37:24,461 (abs_task:1660) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.7", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.7", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.7", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.7", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-11 14:37:24,461 (abs_task:1661) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.7, +[gpua006:0/64] 2024-02-11 14:37:24,464 (abs_task:1662) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257 +[gpua006:0/64] 2024-02-11 14:42:44,283 (trainer:756) INFO: 31epoch:train:3701-3800batch: iter_time=2.888, forward_time=0.178, loss_ctc=77.829, loss_interctc_layer6=81.979, loss_interctc_layer12=68.220, loss_interctc_layer15=62.641, loss_interctc_layer21=80.625, loss=74.259, backward_time=0.218, grad_norm=68.751, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.141, optim0_lr0=7.274e-05, train_time=4.121 +[gpua006:0/64] 2024-02-11 14:44:21,623 (trainer:756) INFO: 31epoch:train:3801-3900batch: iter_time=9.032e-05, forward_time=0.145, loss_ctc=78.056, loss_interctc_layer6=99.137, loss_interctc_layer12=82.314, loss_interctc_layer15=75.480, loss_interctc_layer21=80.433, loss=83.084, backward_time=0.209, grad_norm=90.686, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.140, optim0_lr0=7.273e-05, train_time=0.974 +[gpua006:0/64] 2024-02-11 14:45:55,064 (trainer:756) INFO: 31epoch:train:3901-4000batch: iter_time=9.014e-05, forward_time=0.143, loss_ctc=88.510, loss_interctc_layer6=90.474, loss_interctc_layer12=75.026, loss_interctc_layer15=68.754, loss_interctc_layer21=91.610, loss=82.875, backward_time=0.208, grad_norm=76.432, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.272e-05, train_time=0.934 +[gpua006:0/64] 2024-02-11 14:48:06,536 (trainer:756) INFO: 31epoch:train:4001-4100batch: iter_time=9.507e-05, forward_time=0.142, loss_ctc=72.852, loss_interctc_layer6=82.626, loss_interctc_layer12=68.704, loss_interctc_layer15=63.026, loss_interctc_layer21=75.506, loss=72.543, backward_time=0.207, grad_norm=77.987, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.271e-05, train_time=1.315 +[gpua006:0/64] 2024-02-11 14:50:10,699 (trainer:756) INFO: 31epoch:train:4101-4200batch: iter_time=8.973e-05, forward_time=0.143, loss_ctc=82.626, loss_interctc_layer6=93.210, loss_interctc_layer12=78.296, loss_interctc_layer15=72.374, loss_interctc_layer21=85.640, loss=82.429, backward_time=0.208, grad_norm=82.477, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.270e-05, train_time=1.241 +[gpua006:0/64] 2024-02-11 14:52:24,084 (trainer:756) INFO: 31epoch:train:4201-4300batch: iter_time=9.341e-05, forward_time=0.141, loss_ctc=68.521, loss_interctc_layer6=73.279, loss_interctc_layer12=61.138, loss_interctc_layer15=55.921, loss_interctc_layer21=71.224, loss=66.017, backward_time=0.206, grad_norm=87.404, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.270e-05, train_time=1.334 +[gpua006:0/64] 2024-02-11 14:54:58,758 (trainer:756) INFO: 31epoch:train:4301-4400batch: iter_time=9.117e-05, forward_time=0.141, loss_ctc=57.086, loss_interctc_layer6=72.840, loss_interctc_layer12=60.621, loss_interctc_layer15=55.650, loss_interctc_layer21=58.934, loss=61.026, backward_time=0.205, grad_norm=76.512, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.269e-05, train_time=1.547 +[gpua006:0/64] 2024-02-11 14:56:53,093 (trainer:756) INFO: 31epoch:train:4401-4500batch: iter_time=8.781e-05, forward_time=0.142, loss_ctc=84.945, loss_interctc_layer6=88.674, loss_interctc_layer12=73.265, loss_interctc_layer15=67.173, loss_interctc_layer21=87.900, loss=80.391, backward_time=0.207, grad_norm=72.715, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.268e-05, train_time=1.143 +[gpua006:0/64] 2024-02-11 14:58:52,672 (trainer:756) INFO: 31epoch:train:4501-4600batch: iter_time=8.439e-05, forward_time=0.142, loss_ctc=68.026, loss_interctc_layer6=84.800, loss_interctc_layer12=69.834, loss_interctc_layer15=63.803, loss_interctc_layer21=70.366, loss=71.366, backward_time=0.206, grad_norm=74.467, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.267e-05, train_time=1.196 +[gpua006:0/64] 2024-02-11 15:00:46,339 (trainer:756) INFO: 31epoch:train:4601-4700batch: iter_time=7.994e-05, forward_time=0.144, loss_ctc=68.681, loss_interctc_layer6=75.150, loss_interctc_layer12=62.030, loss_interctc_layer15=56.665, loss_interctc_layer21=71.151, loss=66.735, backward_time=0.207, grad_norm=60.177, clip=100.000, loss_scale=4.056e+31, optim_step_time=0.139, optim0_lr0=7.266e-05, train_time=1.136 +[gpua006:0/64] 2024-02-11 15:02:08,959 (trainer:687) WARNING: The grad norm is nan. Skipping updating the model. +[gpua006:0/64] 2024-02-11 15:02:54,515 (trainer:756) INFO: 31epoch:train:4701-4800batch: iter_time=8.600e-05, forward_time=0.142, loss_ctc=90.940, loss_interctc_layer6=94.184, loss_interctc_layer12=77.990, loss_interctc_layer15=71.563, loss_interctc_layer21=94.464, loss=85.828, backward_time=0.206, grad_norm=83.378, clip=100.000, loss_scale=3.503e+31, optim_step_time=0.139, optim0_lr0=7.266e-05, train_time=1.282 +[gpua006:0/64] 2024-02-11 15:04:50,369 (trainer:756) INFO: 31epoch:train:4801-4900batch: iter_time=8.389e-05, forward_time=0.142, loss_ctc=68.802, loss_interctc_layer6=73.174, loss_interctc_layer12=60.463, loss_interctc_layer15=55.200, loss_interctc_layer21=71.558, loss=65.839, backward_time=0.207, grad_norm=72.154, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=7.265e-05, train_time=1.158 +[gpua006:0/64] 2024-02-11 15:07:01,907 (trainer:756) INFO: 31epoch:train:4901-5000batch: iter_time=9.037e-05, forward_time=0.142, loss_ctc=63.286, loss_interctc_layer6=69.313, loss_interctc_layer12=57.166, loss_interctc_layer15=52.181, loss_interctc_layer21=65.636, loss=61.516, backward_time=0.207, grad_norm=67.941, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=7.264e-05, train_time=1.315 +[gpua006:0/64] 2024-02-11 15:07:04,641 (multiple_iter_factory:32) INFO: Building 4th iter-factory... +[gpua006:0/64] 2024-02-11 15:07:23,619 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-11 15:07:27,014 (abs_task:1660) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.1", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.1", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.1", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.1", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-11 15:07:27,014 (abs_task:1661) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.1, +[gpua006:0/64] 2024-02-11 15:07:27,018 (abs_task:1662) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257 +[gpua006:0/64] 2024-02-11 15:15:52,756 (trainer:756) INFO: 31epoch:train:5001-5100batch: iter_time=1.343, forward_time=0.143, loss_ctc=77.296, loss_interctc_layer6=89.921, loss_interctc_layer12=74.371, loss_interctc_layer15=68.025, loss_interctc_layer21=79.842, loss=77.891, backward_time=0.208, grad_norm=75.547, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=7.263e-05, train_time=5.308 +[gpua006:0/64] 2024-02-11 15:17:26,388 (trainer:756) INFO: 31epoch:train:5101-5200batch: iter_time=8.132e-05, forward_time=0.142, loss_ctc=89.249, loss_interctc_layer6=98.469, loss_interctc_layer12=82.123, loss_interctc_layer15=75.456, loss_interctc_layer21=92.029, loss=87.465, backward_time=0.207, grad_norm=85.095, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=7.262e-05, train_time=0.936 +[gpua006:0/64] 2024-02-11 15:19:37,360 (trainer:756) INFO: 31epoch:train:5201-5300batch: iter_time=1.203e-04, forward_time=0.201, loss_ctc=78.550, loss_interctc_layer6=86.209, loss_interctc_layer12=71.495, loss_interctc_layer15=65.558, loss_interctc_layer21=81.363, loss=76.635, backward_time=0.220, grad_norm=70.830, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=7.262e-05, train_time=1.310 +[gpua006:0/64] 2024-02-11 15:21:23,894 (trainer:756) INFO: 31epoch:train:5301-5400batch: iter_time=8.967e-05, forward_time=0.141, loss_ctc=77.923, loss_interctc_layer6=81.986, loss_interctc_layer12=67.889, loss_interctc_layer15=62.157, loss_interctc_layer21=80.955, loss=74.182, backward_time=0.207, grad_norm=66.929, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.138, optim0_lr0=7.261e-05, train_time=1.065 +[gpua006:0/64] 2024-02-11 15:23:31,360 (trainer:756) INFO: 31epoch:train:5401-5500batch: iter_time=2.024e-04, forward_time=0.177, loss_ctc=72.662, loss_interctc_layer6=87.409, loss_interctc_layer12=73.715, loss_interctc_layer15=68.175, loss_interctc_layer21=74.930, loss=75.378, backward_time=0.212, grad_norm=76.975, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.140, optim0_lr0=7.260e-05, train_time=1.274 +[gpua006:0/64] 2024-02-11 15:25:09,646 (trainer:756) INFO: 31epoch:train:5501-5600batch: iter_time=8.350e-05, forward_time=0.141, loss_ctc=60.877, loss_interctc_layer6=69.266, loss_interctc_layer12=57.090, loss_interctc_layer15=52.198, loss_interctc_layer21=62.919, loss=60.470, backward_time=0.207, grad_norm=57.459, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.138, optim0_lr0=7.259e-05, train_time=0.984 +[gpua006:0/64] 2024-02-11 15:27:00,586 (trainer:756) INFO: 31epoch:train:5601-5700batch: iter_time=8.514e-05, forward_time=0.143, loss_ctc=81.239, loss_interctc_layer6=88.549, loss_interctc_layer12=73.521, loss_interctc_layer15=67.547, loss_interctc_layer21=84.102, loss=78.992, backward_time=0.208, grad_norm=148.977, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.138, optim0_lr0=7.258e-05, train_time=1.109 +[gpua006:0/64] 2024-02-11 15:29:51,170 (trainer:756) INFO: 31epoch:train:5701-5800batch: iter_time=9.365e-05, forward_time=0.146, loss_ctc=66.538, loss_interctc_layer6=78.245, loss_interctc_layer12=64.119, loss_interctc_layer15=58.451, loss_interctc_layer21=68.819, loss=67.235, backward_time=0.205, grad_norm=71.736, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=7.258e-05, train_time=1.706 +[gpua006:0/64] 2024-02-11 15:31:53,346 (trainer:756) INFO: 31epoch:train:5801-5900batch: iter_time=9.281e-05, forward_time=0.142, loss_ctc=68.348, loss_interctc_layer6=84.148, loss_interctc_layer12=69.698, loss_interctc_layer15=63.743, loss_interctc_layer21=70.586, loss=71.305, backward_time=0.207, grad_norm=66.316, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.138, optim0_lr0=7.257e-05, train_time=1.222 +[gpua006:0/64] 2024-02-11 15:33:45,800 (trainer:756) INFO: 31epoch:train:5901-6000batch: iter_time=8.443e-05, forward_time=0.142, loss_ctc=81.285, loss_interctc_layer6=82.929, loss_interctc_layer12=68.243, loss_interctc_layer15=62.380, loss_interctc_layer21=84.290, loss=75.825, backward_time=0.207, grad_norm=75.793, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.138, optim0_lr0=7.256e-05, train_time=1.124 +[gpua006:0/64] 2024-02-11 15:35:21,682 (trainer:756) INFO: 31epoch:train:6001-6100batch: iter_time=8.772e-05, forward_time=0.142, loss_ctc=76.941, loss_interctc_layer6=80.813, loss_interctc_layer12=66.976, loss_interctc_layer15=61.335, loss_interctc_layer21=79.780, loss=73.169, backward_time=0.208, grad_norm=84.478, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.138, optim0_lr0=7.255e-05, train_time=0.959 +[gpua006:0/64] 2024-02-11 15:37:24,714 (trainer:756) INFO: 31epoch:train:6101-6200batch: iter_time=8.415e-05, forward_time=0.142, loss_ctc=66.801, loss_interctc_layer6=70.750, loss_interctc_layer12=58.465, loss_interctc_layer15=53.433, loss_interctc_layer21=69.232, loss=63.736, backward_time=0.208, grad_norm=70.227, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.138, optim0_lr0=7.254e-05, train_time=1.230 +[gpua006:0/64] 2024-02-11 15:38:31,149 (multiple_iter_factory:32) INFO: Building 5th iter-factory... +[gpua006:0/64] 2024-02-11 15:38:50,279 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-11 15:38:53,671 (abs_task:1660) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.4", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.4", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.4", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.4", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-11 15:38:53,672 (abs_task:1661) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.4, +[gpua006:0/64] 2024-02-11 15:38:53,675 (abs_task:1662) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257 +[gpua006:0/64] 2024-02-11 15:44:02,092 (trainer:756) INFO: 31epoch:train:6201-6300batch: iter_time=2.890, forward_time=0.143, loss_ctc=80.785, loss_interctc_layer6=81.938, loss_interctc_layer12=68.194, loss_interctc_layer15=62.610, loss_interctc_layer21=83.775, loss=75.460, backward_time=0.208, grad_norm=87.858, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=7.254e-05, train_time=3.974 +[gpua006:0/64] 2024-02-11 15:45:50,211 (trainer:756) INFO: 31epoch:train:6301-6400batch: iter_time=8.910e-05, forward_time=0.145, loss_ctc=80.465, loss_interctc_layer6=98.725, loss_interctc_layer12=81.822, loss_interctc_layer15=74.911, loss_interctc_layer21=83.055, loss=83.796, backward_time=0.209, grad_norm=86.120, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.140, optim0_lr0=7.253e-05, train_time=1.081 +[gpua006:0/64] 2024-02-11 15:48:03,619 (trainer:756) INFO: 31epoch:train:6401-6500batch: iter_time=8.707e-05, forward_time=0.145, loss_ctc=89.688, loss_interctc_layer6=89.313, loss_interctc_layer12=73.755, loss_interctc_layer15=67.495, loss_interctc_layer21=92.933, loss=82.637, backward_time=0.208, grad_norm=67.820, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.140, optim0_lr0=7.252e-05, train_time=1.334 +[gpua006:0/64] 2024-02-11 15:48:44,694 (trainer:687) WARNING: The grad norm is nan. Skipping updating the model. +[gpua006:0/64] 2024-02-11 15:50:02,966 (trainer:756) INFO: 31epoch:train:6501-6600batch: iter_time=9.410e-05, forward_time=0.144, loss_ctc=78.873, loss_interctc_layer6=82.314, loss_interctc_layer12=68.298, loss_interctc_layer15=62.611, loss_interctc_layer21=81.886, loss=74.796, backward_time=0.209, grad_norm=93.124, clip=100.000, loss_scale=1.373e+31, optim_step_time=0.139, optim0_lr0=7.251e-05, train_time=1.193 +[gpua006:0/64] 2024-02-11 15:51:39,592 (trainer:756) INFO: 31epoch:train:6601-6700batch: iter_time=8.612e-05, forward_time=0.145, loss_ctc=87.097, loss_interctc_layer6=91.622, loss_interctc_layer12=76.719, loss_interctc_layer15=70.701, loss_interctc_layer21=90.440, loss=83.316, backward_time=0.210, grad_norm=84.610, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.250e-05, train_time=0.966 +[gpua006:0/64] 2024-02-11 15:53:39,086 (trainer:756) INFO: 31epoch:train:6701-6800batch: iter_time=8.269e-05, forward_time=0.144, loss_ctc=70.111, loss_interctc_layer6=74.093, loss_interctc_layer12=61.553, loss_interctc_layer15=56.734, loss_interctc_layer21=72.391, loss=66.977, backward_time=0.210, grad_norm=68.630, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.250e-05, train_time=1.195 +[gpua006:0/64] 2024-02-11 15:55:26,624 (trainer:756) INFO: 31epoch:train:6801-6900batch: iter_time=8.548e-05, forward_time=0.144, loss_ctc=56.843, loss_interctc_layer6=71.707, loss_interctc_layer12=59.421, loss_interctc_layer15=54.369, loss_interctc_layer21=58.683, loss=60.204, backward_time=0.210, grad_norm=87.938, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.249e-05, train_time=1.075 +[gpua006:0/64] 2024-02-11 15:57:25,413 (trainer:756) INFO: 31epoch:train:6901-7000batch: iter_time=8.450e-05, forward_time=0.145, loss_ctc=88.585, loss_interctc_layer6=87.808, loss_interctc_layer12=72.402, loss_interctc_layer15=66.363, loss_interctc_layer21=91.804, loss=81.392, backward_time=0.210, grad_norm=105.311, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.248e-05, train_time=1.188 +[gpua006:0/64] 2024-02-11 15:59:23,256 (trainer:756) INFO: 31epoch:train:7001-7100batch: iter_time=0.002, forward_time=0.189, loss_ctc=71.597, loss_interctc_layer6=84.561, loss_interctc_layer12=69.555, loss_interctc_layer15=63.493, loss_interctc_layer21=74.141, loss=72.670, backward_time=0.217, grad_norm=64.292, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.141, optim0_lr0=7.247e-05, train_time=1.178 +[gpua006:0/64] 2024-02-11 16:01:16,995 (trainer:756) INFO: 31epoch:train:7101-7200batch: iter_time=6.751e-04, forward_time=0.191, loss_ctc=71.805, loss_interctc_layer6=74.701, loss_interctc_layer12=61.528, loss_interctc_layer15=56.138, loss_interctc_layer21=74.421, loss=67.719, backward_time=0.218, grad_norm=67.677, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.146, optim0_lr0=7.246e-05, train_time=1.134 +[gpua006:0/64] 2024-02-11 16:03:34,284 (trainer:756) INFO: 31epoch:train:7201-7300batch: iter_time=1.091e-04, forward_time=0.145, loss_ctc=96.734, loss_interctc_layer6=92.919, loss_interctc_layer12=77.009, loss_interctc_layer15=70.770, loss_interctc_layer21=100.166, loss=87.519, backward_time=0.207, grad_norm=83.299, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.246e-05, train_time=1.374 +[gpua006:0/64] 2024-02-11 16:05:55,263 (trainer:756) INFO: 31epoch:train:7301-7400batch: iter_time=1.126e-04, forward_time=0.143, loss_ctc=73.049, loss_interctc_layer6=72.662, loss_interctc_layer12=59.817, loss_interctc_layer15=54.621, loss_interctc_layer21=76.074, loss=67.244, backward_time=0.207, grad_norm=66.487, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.245e-05, train_time=1.410 +[gpua006:0/64] 2024-02-11 16:08:16,228 (trainer:756) INFO: 31epoch:train:7401-7500batch: iter_time=1.140e-04, forward_time=0.149, loss_ctc=66.014, loss_interctc_layer6=70.187, loss_interctc_layer12=57.799, loss_interctc_layer15=52.806, loss_interctc_layer21=68.650, loss=63.091, backward_time=0.209, grad_norm=57.769, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.244e-05, train_time=1.409 +[gpua006:0/64] 2024-02-11 16:08:20,291 (multiple_iter_factory:32) INFO: Building 6th iter-factory... +[gpua006:0/64] 2024-02-11 16:08:39,126 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-11 16:08:42,567 (abs_task:1660) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.2", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.2", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.2", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.2", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-11 16:08:42,567 (abs_task:1661) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.2, +[gpua006:0/64] 2024-02-11 16:08:42,571 (abs_task:1662) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257 +[gpua006:0/64] 2024-02-11 16:18:29,847 (trainer:756) INFO: 31epoch:train:7501-7600batch: iter_time=1.371, forward_time=0.144, loss_ctc=82.034, loss_interctc_layer6=89.811, loss_interctc_layer12=74.225, loss_interctc_layer15=67.856, loss_interctc_layer21=84.873, loss=79.760, backward_time=0.209, grad_norm=67.316, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.141, optim0_lr0=7.243e-05, train_time=6.136 +[gpua006:0/64] 2024-02-11 16:20:03,162 (trainer:756) INFO: 31epoch:train:7601-7700batch: iter_time=7.227e-05, forward_time=0.144, loss_ctc=90.821, loss_interctc_layer6=97.876, loss_interctc_layer12=81.646, loss_interctc_layer15=74.876, loss_interctc_layer21=93.903, loss=87.825, backward_time=0.210, grad_norm=74.408, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.141, optim0_lr0=7.242e-05, train_time=0.933 +[gpua006:0/64] 2024-02-11 16:22:01,731 (trainer:756) INFO: 31epoch:train:7701-7800batch: iter_time=8.927e-05, forward_time=0.144, loss_ctc=82.012, loss_interctc_layer6=86.240, loss_interctc_layer12=71.423, loss_interctc_layer15=65.401, loss_interctc_layer21=84.937, loss=78.003, backward_time=0.209, grad_norm=77.159, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.141, optim0_lr0=7.242e-05, train_time=1.185 +[gpua006:0/64] 2024-02-11 16:24:24,098 (trainer:756) INFO: 31epoch:train:7801-7900batch: iter_time=9.200e-05, forward_time=0.143, loss_ctc=82.413, loss_interctc_layer6=82.277, loss_interctc_layer12=68.187, loss_interctc_layer15=62.494, loss_interctc_layer21=85.693, loss=76.213, backward_time=0.209, grad_norm=83.748, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.141, optim0_lr0=7.241e-05, train_time=1.423 +[gpua006:0/64] 2024-02-11 16:26:36,138 (trainer:756) INFO: 31epoch:train:7901-8000batch: iter_time=8.748e-05, forward_time=0.144, loss_ctc=75.579, loss_interctc_layer6=86.405, loss_interctc_layer12=72.734, loss_interctc_layer15=67.189, loss_interctc_layer21=78.130, loss=76.007, backward_time=0.209, grad_norm=69.617, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.141, optim0_lr0=7.240e-05, train_time=1.320 +[gpua006:0/64] 2024-02-11 16:28:38,086 (trainer:756) INFO: 31epoch:train:8001-8100batch: iter_time=8.294e-05, forward_time=0.143, loss_ctc=63.164, loss_interctc_layer6=68.988, loss_interctc_layer12=56.730, loss_interctc_layer15=51.807, loss_interctc_layer21=65.310, loss=61.200, backward_time=0.208, grad_norm=57.712, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.239e-05, train_time=1.219 +[gpua006:0/64] 2024-02-11 16:31:07,553 (trainer:756) INFO: 31epoch:train:8101-8200batch: iter_time=9.126e-05, forward_time=0.144, loss_ctc=83.379, loss_interctc_layer6=88.471, loss_interctc_layer12=73.494, loss_interctc_layer15=67.489, loss_interctc_layer21=86.143, loss=79.795, backward_time=0.207, grad_norm=309.826, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.239e-05, train_time=1.494 +[gpua006:0/64] 2024-02-11 16:33:27,203 (trainer:756) INFO: 31epoch:train:8201-8300batch: iter_time=9.465e-05, forward_time=0.164, loss_ctc=69.067, loss_interctc_layer6=78.597, loss_interctc_layer12=64.310, loss_interctc_layer15=58.615, loss_interctc_layer21=71.580, loss=68.434, backward_time=0.222, grad_norm=101.787, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.142, optim0_lr0=7.238e-05, train_time=1.396 +[gpua006:0/64] 2024-02-11 16:36:12,684 (trainer:756) INFO: 31epoch:train:8301-8400batch: iter_time=9.171e-05, forward_time=0.167, loss_ctc=72.564, loss_interctc_layer6=83.925, loss_interctc_layer12=69.556, loss_interctc_layer15=63.674, loss_interctc_layer21=75.063, loss=72.956, backward_time=0.214, grad_norm=83.097, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.237e-05, train_time=1.655 +[gpua006:0/64] 2024-02-11 16:38:29,891 (trainer:756) INFO: 31epoch:train:8401-8500batch: iter_time=9.087e-05, forward_time=0.148, loss_ctc=84.508, loss_interctc_layer6=82.712, loss_interctc_layer12=67.991, loss_interctc_layer15=62.063, loss_interctc_layer21=87.653, loss=76.985, backward_time=0.209, grad_norm=72.177, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.236e-05, train_time=1.372 +[gpua006:0/64] 2024-02-11 16:40:37,755 (trainer:756) INFO: 31epoch:train:8501-8600batch: iter_time=8.935e-05, forward_time=0.161, loss_ctc=82.679, loss_interctc_layer6=80.130, loss_interctc_layer12=66.366, loss_interctc_layer15=60.916, loss_interctc_layer21=86.030, loss=75.224, backward_time=0.239, grad_norm=79.446, clip=100.000, loss_scale=1.663e+31, optim_step_time=0.141, optim0_lr0=7.235e-05, train_time=1.278 +[gpua006:0/64] 2024-02-11 16:42:29,953 (trainer:756) INFO: 31epoch:train:8601-8700batch: iter_time=8.802e-05, forward_time=0.143, loss_ctc=69.166, loss_interctc_layer6=70.377, loss_interctc_layer12=57.992, loss_interctc_layer15=52.929, loss_interctc_layer21=71.822, loss=64.457, backward_time=0.209, grad_norm=54.578, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=7.235e-05, train_time=1.121 +[gpua006:0/64] 2024-02-11 16:43:57,212 (multiple_iter_factory:32) INFO: Building 7th iter-factory... +[gpua006:0/64] 2024-02-11 16:44:15,806 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-11 16:44:19,193 (abs_task:1660) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.8", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.8", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.8", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.8", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-11 16:44:19,193 (abs_task:1661) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.8, +[gpua006:0/64] 2024-02-11 16:44:19,196 (abs_task:1662) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257 +[gpua006:0/64] 2024-02-11 16:52:20,111 (trainer:756) INFO: 31epoch:train:8701-8800batch: iter_time=2.869, forward_time=0.152, loss_ctc=82.198, loss_interctc_layer6=81.979, loss_interctc_layer12=68.030, loss_interctc_layer15=62.364, loss_interctc_layer21=85.083, loss=75.931, backward_time=0.209, grad_norm=67.093, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.140, optim0_lr0=7.234e-05, train_time=5.903 +[gpua006:0/64] 2024-02-11 16:52:46,313 (trainer:687) WARNING: The grad norm is nan. Skipping updating the model. +[gpua006:0/64] 2024-02-11 16:54:10,321 (trainer:756) INFO: 31epoch:train:8801-8900batch: iter_time=8.038e-05, forward_time=0.145, loss_ctc=79.370, loss_interctc_layer6=97.896, loss_interctc_layer12=80.978, loss_interctc_layer15=74.183, loss_interctc_layer21=81.764, loss=82.838, backward_time=0.209, grad_norm=118.476, clip=100.000, loss_scale=1.270e+31, optim_step_time=0.140, optim0_lr0=7.233e-05, train_time=1.102 +[gpua006:0/64] 2024-02-11 16:55:59,506 (trainer:756) INFO: 31epoch:train:8901-9000batch: iter_time=8.142e-05, forward_time=0.144, loss_ctc=88.908, loss_interctc_layer6=89.801, loss_interctc_layer12=74.203, loss_interctc_layer15=68.050, loss_interctc_layer21=92.047, loss=82.602, backward_time=0.208, grad_norm=88.423, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.232e-05, train_time=1.092 +[gpua006:0/64] 2024-02-11 16:58:08,989 (trainer:756) INFO: 31epoch:train:9001-9100batch: iter_time=8.321e-05, forward_time=0.143, loss_ctc=78.224, loss_interctc_layer6=82.524, loss_interctc_layer12=68.279, loss_interctc_layer15=62.536, loss_interctc_layer21=81.090, loss=74.531, backward_time=0.209, grad_norm=109.601, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.231e-05, train_time=1.295 +[gpua006:0/64] 2024-02-11 17:00:19,643 (trainer:756) INFO: 31epoch:train:9101-9200batch: iter_time=8.226e-05, forward_time=0.143, loss_ctc=86.712, loss_interctc_layer6=91.282, loss_interctc_layer12=76.725, loss_interctc_layer15=70.647, loss_interctc_layer21=89.972, loss=83.068, backward_time=0.208, grad_norm=151.570, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.231e-05, train_time=1.306 +[gpua006:0/64] 2024-02-11 17:02:53,327 (trainer:756) INFO: 31epoch:train:9201-9300batch: iter_time=8.518e-05, forward_time=0.142, loss_ctc=69.545, loss_interctc_layer6=73.330, loss_interctc_layer12=61.137, loss_interctc_layer15=56.014, loss_interctc_layer21=71.774, loss=66.360, backward_time=0.207, grad_norm=64.483, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.230e-05, train_time=1.537 +[gpua006:0/64] 2024-02-11 17:05:15,297 (trainer:756) INFO: 31epoch:train:9301-9400batch: iter_time=8.299e-05, forward_time=0.157, loss_ctc=57.643, loss_interctc_layer6=72.749, loss_interctc_layer12=60.406, loss_interctc_layer15=55.304, loss_interctc_layer21=59.468, loss=61.114, backward_time=0.221, grad_norm=58.833, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.141, optim0_lr0=7.229e-05, train_time=1.419 +[gpua006:0/64] 2024-02-11 17:07:55,237 (trainer:756) INFO: 31epoch:train:9401-9500batch: iter_time=8.173e-05, forward_time=0.184, loss_ctc=88.175, loss_interctc_layer6=88.205, loss_interctc_layer12=72.835, loss_interctc_layer15=66.627, loss_interctc_layer21=91.269, loss=81.422, backward_time=0.239, grad_norm=86.623, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.142, optim0_lr0=7.228e-05, train_time=1.599 +[gpua006:0/64] 2024-02-11 17:09:48,410 (trainer:756) INFO: 31epoch:train:9501-9600batch: iter_time=8.347e-05, forward_time=0.143, loss_ctc=71.187, loss_interctc_layer6=84.604, loss_interctc_layer12=69.787, loss_interctc_layer15=63.711, loss_interctc_layer21=73.547, loss=72.567, backward_time=0.209, grad_norm=74.692, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.227e-05, train_time=1.131 +[gpua006:0/64] 2024-02-11 17:12:22,598 (trainer:756) INFO: 31epoch:train:9601-9700batch: iter_time=8.367e-05, forward_time=0.142, loss_ctc=71.199, loss_interctc_layer6=74.810, loss_interctc_layer12=61.542, loss_interctc_layer15=56.071, loss_interctc_layer21=73.895, loss=67.503, backward_time=0.208, grad_norm=88.726, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.227e-05, train_time=1.541 +[gpua006:0/64] 2024-02-11 17:14:19,385 (trainer:756) INFO: 31epoch:train:9701-9800batch: iter_time=7.968e-05, forward_time=0.143, loss_ctc=95.121, loss_interctc_layer6=92.480, loss_interctc_layer12=76.611, loss_interctc_layer15=69.969, loss_interctc_layer21=98.513, loss=86.539, backward_time=0.208, grad_norm=74.436, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.226e-05, train_time=1.168 +[gpua006:0/64] 2024-02-11 17:16:38,543 (trainer:756) INFO: 31epoch:train:9801-9900batch: iter_time=7.870e-05, forward_time=0.161, loss_ctc=72.405, loss_interctc_layer6=72.234, loss_interctc_layer12=59.506, loss_interctc_layer15=54.289, loss_interctc_layer21=75.375, loss=66.762, backward_time=0.208, grad_norm=64.018, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.225e-05, train_time=1.391 +[gpua006:0/64] 2024-02-11 17:18:33,235 (trainer:756) INFO: 31epoch:train:9901-10000batch: iter_time=7.954e-05, forward_time=0.142, loss_ctc=65.464, loss_interctc_layer6=69.482, loss_interctc_layer12=57.381, loss_interctc_layer15=52.274, loss_interctc_layer21=67.957, loss=62.511, backward_time=0.209, grad_norm=65.329, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.224e-05, train_time=1.147 +[gpua006:0/64] 2024-02-11 17:18:53,264 (multiple_iter_factory:32) INFO: Building 8th iter-factory... +[gpua006:0/64] 2024-02-11 17:19:11,962 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-11 17:19:15,297 (abs_task:1660) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.10", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.10", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.10", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.10", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-11 17:19:15,297 (abs_task:1661) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.10, +[gpua006:0/64] 2024-02-11 17:19:15,301 (abs_task:1662) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257 +[gpua006:0/64] 2024-02-11 17:25:22,141 (trainer:756) INFO: 31epoch:train:10001-10100batch: iter_time=2.999, forward_time=0.172, loss_ctc=81.590, loss_interctc_layer6=88.796, loss_interctc_layer12=73.395, loss_interctc_layer15=67.159, loss_interctc_layer21=84.159, loss=79.020, backward_time=0.210, grad_norm=73.432, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.224e-05, train_time=4.089 +[gpua006:0/64] 2024-02-11 17:28:13,795 (trainer:756) INFO: 31epoch:train:10101-10200batch: iter_time=8.456e-05, forward_time=0.145, loss_ctc=91.503, loss_interctc_layer6=98.331, loss_interctc_layer12=82.132, loss_interctc_layer15=75.463, loss_interctc_layer21=94.554, loss=88.396, backward_time=0.206, grad_norm=94.337, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.223e-05, train_time=1.716 +[gpua006:0/64] 2024-02-11 17:30:52,718 (trainer:756) INFO: 31epoch:train:10201-10300batch: iter_time=8.675e-05, forward_time=0.178, loss_ctc=81.727, loss_interctc_layer6=85.962, loss_interctc_layer12=71.165, loss_interctc_layer15=65.201, loss_interctc_layer21=84.688, loss=77.749, backward_time=0.218, grad_norm=75.177, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.222e-05, train_time=1.589 +[gpua006:0/64] 2024-02-11 17:33:07,967 (trainer:756) INFO: 31epoch:train:10301-10400batch: iter_time=8.838e-05, forward_time=0.168, loss_ctc=81.763, loss_interctc_layer6=81.915, loss_interctc_layer12=67.756, loss_interctc_layer15=61.981, loss_interctc_layer21=84.901, loss=75.663, backward_time=0.211, grad_norm=77.938, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.221e-05, train_time=1.352 +[gpua006:0/64] 2024-02-11 17:35:20,922 (trainer:756) INFO: 31epoch:train:10401-10500batch: iter_time=9.405e-05, forward_time=0.143, loss_ctc=76.191, loss_interctc_layer6=86.827, loss_interctc_layer12=73.142, loss_interctc_layer15=67.477, loss_interctc_layer21=78.433, loss=76.414, backward_time=0.206, grad_norm=77.010, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.220e-05, train_time=1.329 +[gpua006:0/64] 2024-02-11 17:37:21,772 (trainer:756) INFO: 31epoch:train:10501-10600batch: iter_time=8.970e-05, forward_time=0.141, loss_ctc=62.753, loss_interctc_layer6=68.054, loss_interctc_layer12=55.804, loss_interctc_layer15=50.841, loss_interctc_layer21=64.969, loss=60.484, backward_time=0.207, grad_norm=61.609, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.220e-05, train_time=1.208 +[gpua006:0/64] 2024-02-11 17:39:20,666 (trainer:756) INFO: 31epoch:train:10601-10700batch: iter_time=9.612e-05, forward_time=0.144, loss_ctc=82.807, loss_interctc_layer6=87.954, loss_interctc_layer12=73.068, loss_interctc_layer15=67.007, loss_interctc_layer21=85.676, loss=79.303, backward_time=0.208, grad_norm=81.927, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.219e-05, train_time=1.189 +[gpua006:0/64] 2024-02-11 17:41:39,114 (trainer:756) INFO: 31epoch:train:10701-10800batch: iter_time=9.228e-05, forward_time=0.147, loss_ctc=69.317, loss_interctc_layer6=78.842, loss_interctc_layer12=64.412, loss_interctc_layer15=58.617, loss_interctc_layer21=71.697, loss=68.577, backward_time=0.209, grad_norm=82.865, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.218e-05, train_time=1.384 +[gpua006:0/64] 2024-02-11 17:44:35,105 (trainer:756) INFO: 31epoch:train:10801-10900batch: iter_time=9.204e-05, forward_time=0.144, loss_ctc=72.589, loss_interctc_layer6=84.247, loss_interctc_layer12=69.658, loss_interctc_layer15=63.776, loss_interctc_layer21=74.974, loss=73.049, backward_time=0.207, grad_norm=82.591, clip=100.000, loss_scale=1.765e+31, optim_step_time=0.140, optim0_lr0=7.217e-05, train_time=1.760 +[gpua006:0/64] 2024-02-11 17:46:27,018 (trainer:756) INFO: 31epoch:train:10901-11000batch: iter_time=8.002e-05, forward_time=0.146, loss_ctc=84.726, loss_interctc_layer6=82.456, loss_interctc_layer12=67.794, loss_interctc_layer15=61.829, loss_interctc_layer21=87.807, loss=76.923, backward_time=0.216, grad_norm=65.810, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.142, optim0_lr0=7.216e-05, train_time=1.119 +[gpua006:0/64] 2024-02-11 17:48:37,838 (trainer:756) INFO: 31epoch:train:11001-11100batch: iter_time=8.854e-05, forward_time=0.143, loss_ctc=82.375, loss_interctc_layer6=80.490, loss_interctc_layer12=66.608, loss_interctc_layer15=60.943, loss_interctc_layer21=85.381, loss=75.160, backward_time=0.207, grad_norm=67.866, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=7.216e-05, train_time=1.308 +[gpua006:0/64] 2024-02-11 17:50:41,530 (trainer:756) INFO: 31epoch:train:11101-11200batch: iter_time=8.366e-05, forward_time=0.142, loss_ctc=69.470, loss_interctc_layer6=70.382, loss_interctc_layer12=58.047, loss_interctc_layer15=53.042, loss_interctc_layer21=72.138, loss=64.616, backward_time=0.208, grad_norm=92.810, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=7.215e-05, train_time=1.237 +[gpua006:0/64] 2024-02-11 17:51:54,647 (multiple_iter_factory:32) INFO: Building 9th iter-factory... +[gpua006:0/64] 2024-02-11 17:52:13,196 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-11 17:52:16,538 (abs_task:1660) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.6", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.6", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.6", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.6", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-11 17:52:16,538 (abs_task:1661) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.6, +[gpua006:0/64] 2024-02-11 17:52:16,542 (abs_task:1662) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257 +[gpua006:0/64] 2024-02-11 18:00:39,783 (trainer:756) INFO: 31epoch:train:11201-11300batch: iter_time=2.989, forward_time=0.208, loss_ctc=81.045, loss_interctc_layer6=81.216, loss_interctc_layer12=67.317, loss_interctc_layer15=61.780, loss_interctc_layer21=83.885, loss=75.049, backward_time=0.230, grad_norm=62.288, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.142, optim0_lr0=7.214e-05, train_time=5.982 +[gpua006:0/64] 2024-02-11 18:02:33,185 (trainer:756) INFO: 31epoch:train:11301-11400batch: iter_time=9.144e-05, forward_time=0.147, loss_ctc=80.202, loss_interctc_layer6=98.693, loss_interctc_layer12=81.773, loss_interctc_layer15=74.991, loss_interctc_layer21=82.721, loss=83.676, backward_time=0.209, grad_norm=95.331, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.140, optim0_lr0=7.213e-05, train_time=1.134 +[gpua006:0/64] 2024-02-11 18:05:03,767 (trainer:756) INFO: 31epoch:train:11401-11500batch: iter_time=8.320e-05, forward_time=0.144, loss_ctc=88.049, loss_interctc_layer6=89.684, loss_interctc_layer12=74.075, loss_interctc_layer15=67.764, loss_interctc_layer21=91.186, loss=82.152, backward_time=0.209, grad_norm=142.023, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.140, optim0_lr0=7.213e-05, train_time=1.506 +[gpua006:0/64] 2024-02-11 18:07:15,896 (trainer:756) INFO: 31epoch:train:11501-11600batch: iter_time=8.716e-05, forward_time=0.144, loss_ctc=77.223, loss_interctc_layer6=82.091, loss_interctc_layer12=67.982, loss_interctc_layer15=62.183, loss_interctc_layer21=80.092, loss=73.914, backward_time=0.209, grad_norm=79.557, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.140, optim0_lr0=7.212e-05, train_time=1.320 +[gpua006:0/64] 2024-02-11 18:09:47,191 (trainer:756) INFO: 31epoch:train:11601-11700batch: iter_time=8.927e-05, forward_time=0.154, loss_ctc=85.932, loss_interctc_layer6=90.981, loss_interctc_layer12=76.402, loss_interctc_layer15=70.312, loss_interctc_layer21=89.128, loss=82.551, backward_time=0.211, grad_norm=80.748, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.141, optim0_lr0=7.211e-05, train_time=1.514 +[gpua006:0/64] 2024-02-11 18:11:52,429 (trainer:756) INFO: 31epoch:train:11701-11800batch: iter_time=9.227e-05, forward_time=0.143, loss_ctc=69.255, loss_interctc_layer6=73.120, loss_interctc_layer12=60.934, loss_interctc_layer15=56.282, loss_interctc_layer21=71.525, loss=66.223, backward_time=0.209, grad_norm=57.462, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.141, optim0_lr0=7.210e-05, train_time=1.252 +[gpua006:0/64] 2024-02-11 18:14:12,048 (trainer:756) INFO: 31epoch:train:11801-11900batch: iter_time=9.293e-05, forward_time=0.152, loss_ctc=57.393, loss_interctc_layer6=71.820, loss_interctc_layer12=59.499, loss_interctc_layer15=54.353, loss_interctc_layer21=59.157, loss=60.444, backward_time=0.219, grad_norm=63.262, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.142, optim0_lr0=7.209e-05, train_time=1.396 +[gpua006:0/64] 2024-02-11 18:16:32,359 (trainer:756) INFO: 31epoch:train:11901-12000batch: iter_time=9.842e-05, forward_time=0.144, loss_ctc=87.652, loss_interctc_layer6=87.769, loss_interctc_layer12=72.360, loss_interctc_layer15=66.365, loss_interctc_layer21=90.700, loss=80.969, backward_time=0.208, grad_norm=78.470, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.140, optim0_lr0=7.209e-05, train_time=1.403 +[gpua006:0/64] 2024-02-11 18:17:36,887 (trainer:687) WARNING: The grad norm is nan. Skipping updating the model. +[gpua006:0/64] 2024-02-11 18:18:15,966 (trainer:756) INFO: 31epoch:train:12001-12100batch: iter_time=8.755e-05, forward_time=0.165, loss_ctc=71.568, loss_interctc_layer6=84.470, loss_interctc_layer12=69.424, loss_interctc_layer15=63.491, loss_interctc_layer21=73.841, loss=72.559, backward_time=0.208, grad_norm=81.048, clip=100.000, loss_scale=1.629e+31, optim_step_time=0.140, optim0_lr0=7.208e-05, train_time=1.036 +[gpua006:0/64] 2024-02-11 18:20:21,331 (trainer:756) INFO: 31epoch:train:12101-12200batch: iter_time=9.291e-05, forward_time=0.143, loss_ctc=70.916, loss_interctc_layer6=74.386, loss_interctc_layer12=61.163, loss_interctc_layer15=55.807, loss_interctc_layer21=73.374, loss=67.129, backward_time=0.208, grad_norm=69.897, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.207e-05, train_time=1.253 +[gpua006:0/64] 2024-02-11 18:23:30,567 (trainer:756) INFO: 31epoch:train:12201-12300batch: iter_time=9.735e-05, forward_time=0.169, loss_ctc=95.074, loss_interctc_layer6=92.121, loss_interctc_layer12=76.173, loss_interctc_layer15=69.684, loss_interctc_layer21=98.522, loss=86.315, backward_time=0.225, grad_norm=95.052, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.142, optim0_lr0=7.206e-05, train_time=1.892 +[gpua006:0/64] 2024-02-11 18:26:31,580 (trainer:756) INFO: 31epoch:train:12301-12400batch: iter_time=8.864e-05, forward_time=0.197, loss_ctc=72.696, loss_interctc_layer6=71.869, loss_interctc_layer12=59.217, loss_interctc_layer15=54.124, loss_interctc_layer21=75.735, loss=66.728, backward_time=0.224, grad_norm=68.887, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.143, optim0_lr0=7.206e-05, train_time=1.810 +[gpua006:0/64] 2024-02-11 18:28:16,871 (trainer:756) INFO: 31epoch:train:12401-12500batch: iter_time=8.099e-05, forward_time=0.144, loss_ctc=65.637, loss_interctc_layer6=69.710, loss_interctc_layer12=57.424, loss_interctc_layer15=52.513, loss_interctc_layer21=67.876, loss=62.632, backward_time=0.210, grad_norm=69.226, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.205e-05, train_time=1.053 +[gpua006:0/64] 2024-02-11 18:28:36,903 (multiple_iter_factory:32) INFO: Building 10th iter-factory... +[gpua006:0/64] 2024-02-11 18:28:55,888 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-11 18:28:59,294 (abs_task:1660) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.0", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.0", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.0", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.0", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-11 18:28:59,294 (abs_task:1661) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.0, +[gpua006:0/64] 2024-02-11 18:28:59,297 (abs_task:1662) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257 +[gpua006:0/64] 2024-02-11 18:38:03,096 (trainer:756) INFO: 31epoch:train:12501-12600batch: iter_time=2.995, forward_time=0.164, loss_ctc=81.196, loss_interctc_layer6=88.794, loss_interctc_layer12=73.337, loss_interctc_layer15=67.098, loss_interctc_layer21=83.845, loss=78.854, backward_time=0.210, grad_norm=66.873, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.204e-05, train_time=5.862 +[gpua006:0/64] 2024-02-11 18:41:04,119 (trainer:756) INFO: 31epoch:train:12601-12700batch: iter_time=8.716e-05, forward_time=0.144, loss_ctc=91.701, loss_interctc_layer6=97.746, loss_interctc_layer12=81.416, loss_interctc_layer15=74.885, loss_interctc_layer21=94.938, loss=88.137, backward_time=0.208, grad_norm=274.975, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.203e-05, train_time=1.810 +[gpua006:0/64] 2024-02-11 18:43:27,722 (trainer:756) INFO: 31epoch:train:12701-12800batch: iter_time=8.913e-05, forward_time=0.143, loss_ctc=81.135, loss_interctc_layer6=85.141, loss_interctc_layer12=70.428, loss_interctc_layer15=64.448, loss_interctc_layer21=83.986, loss=77.028, backward_time=0.208, grad_norm=81.551, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.202e-05, train_time=1.436 +[gpua006:0/64] 2024-02-11 18:45:53,907 (trainer:756) INFO: 31epoch:train:12801-12900batch: iter_time=8.815e-05, forward_time=0.143, loss_ctc=83.339, loss_interctc_layer6=81.772, loss_interctc_layer12=67.695, loss_interctc_layer15=61.812, loss_interctc_layer21=86.664, loss=76.256, backward_time=0.206, grad_norm=72.911, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.202e-05, train_time=1.462 +[gpua006:0/64] 2024-02-11 18:48:34,669 (trainer:756) INFO: 31epoch:train:12901-13000batch: iter_time=8.624e-05, forward_time=0.165, loss_ctc=75.194, loss_interctc_layer6=86.029, loss_interctc_layer12=72.754, loss_interctc_layer15=67.396, loss_interctc_layer21=77.260, loss=75.727, backward_time=0.213, grad_norm=87.417, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.201e-05, train_time=1.607 +[gpua006:0/64] 2024-02-11 18:50:53,623 (trainer:756) INFO: 31epoch:train:13001-13100batch: iter_time=8.985e-05, forward_time=0.175, loss_ctc=61.702, loss_interctc_layer6=68.485, loss_interctc_layer12=56.209, loss_interctc_layer15=51.271, loss_interctc_layer21=63.924, loss=60.318, backward_time=0.225, grad_norm=51.453, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.141, optim0_lr0=7.200e-05, train_time=1.389 +[gpua006:0/64] 2024-02-11 18:52:58,790 (trainer:756) INFO: 31epoch:train:13101-13200batch: iter_time=9.556e-05, forward_time=0.145, loss_ctc=83.347, loss_interctc_layer6=87.880, loss_interctc_layer12=73.061, loss_interctc_layer15=66.927, loss_interctc_layer21=86.084, loss=79.460, backward_time=0.210, grad_norm=90.263, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.199e-05, train_time=1.251 +[gpua006:0/64] 2024-02-11 18:55:06,481 (trainer:756) INFO: 31epoch:train:13201-13300batch: iter_time=9.100e-05, forward_time=0.143, loss_ctc=68.262, loss_interctc_layer6=78.100, loss_interctc_layer12=63.857, loss_interctc_layer15=58.238, loss_interctc_layer21=70.487, loss=67.789, backward_time=0.209, grad_norm=65.683, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.199e-05, train_time=1.277 +[gpua006:0/64] 2024-02-11 18:57:18,335 (trainer:756) INFO: 31epoch:train:13301-13400batch: iter_time=9.342e-05, forward_time=0.158, loss_ctc=72.557, loss_interctc_layer6=84.028, loss_interctc_layer12=69.563, loss_interctc_layer15=63.641, loss_interctc_layer21=75.077, loss=72.973, backward_time=0.230, grad_norm=94.135, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.142, optim0_lr0=7.198e-05, train_time=1.317 +[gpua006:0/64] 2024-02-11 18:59:38,568 (trainer:756) INFO: 31epoch:train:13401-13500batch: iter_time=9.846e-05, forward_time=0.144, loss_ctc=84.563, loss_interctc_layer6=82.595, loss_interctc_layer12=67.849, loss_interctc_layer15=61.850, loss_interctc_layer21=87.856, loss=76.943, backward_time=0.208, grad_norm=72.598, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.197e-05, train_time=1.403 +[gpua006:0/64] 2024-02-11 19:02:16,105 (trainer:756) INFO: 31epoch:train:13501-13600batch: iter_time=9.488e-05, forward_time=0.143, loss_ctc=81.324, loss_interctc_layer6=79.896, loss_interctc_layer12=66.020, loss_interctc_layer15=60.333, loss_interctc_layer21=84.395, loss=74.394, backward_time=0.208, grad_norm=76.428, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.140, optim0_lr0=7.196e-05, train_time=1.574 +[gpua006:0/64] 2024-02-11 19:04:30,886 (trainer:756) INFO: 31epoch:train:13601-13700batch: iter_time=9.248e-05, forward_time=0.143, loss_ctc=68.988, loss_interctc_layer6=70.433, loss_interctc_layer12=58.068, loss_interctc_layer15=53.023, loss_interctc_layer21=71.670, loss=64.436, backward_time=0.207, grad_norm=52.860, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.195e-05, train_time=1.348 +[gpua006:0/64] 2024-02-11 19:05:43,996 (multiple_iter_factory:32) INFO: Building 11th iter-factory... +[gpua006:0/64] 2024-02-11 19:06:03,208 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-11 19:06:06,615 (abs_task:1660) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.3", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.3", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.3", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.3", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-11 19:06:06,615 (abs_task:1661) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.3, +[gpua006:0/64] 2024-02-11 19:06:06,618 (abs_task:1662) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257 +[gpua006:0/64] 2024-02-11 19:14:41,222 (trainer:756) INFO: 31epoch:train:13701-13800batch: iter_time=3.009, forward_time=0.165, loss_ctc=78.906, loss_interctc_layer6=81.292, loss_interctc_layer12=67.386, loss_interctc_layer15=61.741, loss_interctc_layer21=81.853, loss=74.236, backward_time=0.213, grad_norm=71.357, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.195e-05, train_time=6.103 +[gpua006:0/64] 2024-02-11 19:16:31,765 (trainer:756) INFO: 31epoch:train:13801-13900batch: iter_time=8.229e-05, forward_time=0.143, loss_ctc=77.630, loss_interctc_layer6=98.301, loss_interctc_layer12=81.433, loss_interctc_layer15=74.663, loss_interctc_layer21=80.078, loss=82.421, backward_time=0.208, grad_norm=80.305, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.194e-05, train_time=1.105 +[gpua006:0/64] 2024-02-11 19:18:48,883 (trainer:756) INFO: 31epoch:train:13901-14000batch: iter_time=8.106e-05, forward_time=0.143, loss_ctc=86.117, loss_interctc_layer6=88.526, loss_interctc_layer12=73.009, loss_interctc_layer15=66.763, loss_interctc_layer21=89.342, loss=80.751, backward_time=0.206, grad_norm=72.455, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=7.193e-05, train_time=1.371 +[gpua006:0/64] 2024-02-11 19:21:13,501 (trainer:756) INFO: 31epoch:train:14001-14100batch: iter_time=8.610e-05, forward_time=0.142, loss_ctc=72.121, loss_interctc_layer6=81.656, loss_interctc_layer12=67.649, loss_interctc_layer15=61.921, loss_interctc_layer21=74.902, loss=71.650, backward_time=0.207, grad_norm=70.050, clip=100.000, loss_scale=1.410e+31, optim_step_time=0.139, optim0_lr0=7.192e-05, train_time=1.446 +[gpua006:0/64] 2024-02-11 19:23:12,739 (trainer:756) INFO: 31epoch:train:14101-14200batch: iter_time=8.432e-05, forward_time=0.159, loss_ctc=81.159, loss_interctc_layer6=90.758, loss_interctc_layer12=76.027, loss_interctc_layer15=70.014, loss_interctc_layer21=84.413, loss=80.474, backward_time=0.209, grad_norm=71.661, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.140, optim0_lr0=7.192e-05, train_time=1.191 +[gpua006:0/64] 2024-02-11 19:25:26,845 (trainer:756) INFO: 31epoch:train:14201-14300batch: iter_time=8.606e-05, forward_time=0.142, loss_ctc=67.360, loss_interctc_layer6=72.458, loss_interctc_layer12=60.097, loss_interctc_layer15=55.428, loss_interctc_layer21=70.070, loss=65.083, backward_time=0.207, grad_norm=73.174, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=7.191e-05, train_time=1.342 +[gpua006:0/64] 2024-02-11 19:27:47,537 (trainer:756) INFO: 31epoch:train:14301-14400batch: iter_time=8.350e-05, forward_time=0.142, loss_ctc=56.168, loss_interctc_layer6=71.661, loss_interctc_layer12=59.419, loss_interctc_layer15=54.389, loss_interctc_layer21=57.937, loss=59.915, backward_time=0.206, grad_norm=55.791, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=7.190e-05, train_time=1.407 +[gpua006:0/64] 2024-02-11 19:29:40,581 (trainer:756) INFO: 31epoch:train:14401-14500batch: iter_time=9.393e-05, forward_time=0.146, loss_ctc=84.306, loss_interctc_layer6=87.190, loss_interctc_layer12=71.912, loss_interctc_layer15=65.873, loss_interctc_layer21=87.349, loss=79.326, backward_time=0.208, grad_norm=68.869, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=7.189e-05, train_time=1.130 +[gpua006:0/64] 2024-02-11 19:31:29,114 (trainer:756) INFO: 31epoch:train:14501-14600batch: iter_time=8.968e-05, forward_time=0.142, loss_ctc=67.687, loss_interctc_layer6=84.102, loss_interctc_layer12=69.168, loss_interctc_layer15=63.098, loss_interctc_layer21=69.951, loss=70.801, backward_time=0.207, grad_norm=62.777, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=7.188e-05, train_time=1.086 +[gpua006:0/64] 2024-02-11 19:33:56,298 (trainer:756) INFO: 31epoch:train:14601-14700batch: iter_time=8.907e-05, forward_time=0.145, loss_ctc=67.618, loss_interctc_layer6=74.553, loss_interctc_layer12=61.249, loss_interctc_layer15=55.796, loss_interctc_layer21=70.190, loss=65.881, backward_time=0.207, grad_norm=71.094, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.140, optim0_lr0=7.188e-05, train_time=1.472 +[gpua006:0/64] 2024-02-11 19:36:28,500 (trainer:756) INFO: 31epoch:train:14701-14800batch: iter_time=8.524e-05, forward_time=0.202, loss_ctc=89.110, loss_interctc_layer6=92.356, loss_interctc_layer12=76.295, loss_interctc_layer15=69.851, loss_interctc_layer21=92.476, loss=84.018, backward_time=0.241, grad_norm=144.471, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.145, optim0_lr0=7.187e-05, train_time=1.522 +[gpua006:0/64] 2024-02-11 19:38:08,841 (trainer:756) INFO: 31epoch:train:14801-14900batch: iter_time=8.737e-05, forward_time=0.142, loss_ctc=68.080, loss_interctc_layer6=72.062, loss_interctc_layer12=59.415, loss_interctc_layer15=54.237, loss_interctc_layer21=70.939, loss=64.946, backward_time=0.207, grad_norm=71.291, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=7.186e-05, train_time=1.003 +[gpua006:0/64] 2024-02-11 19:40:13,432 (trainer:756) INFO: 31epoch:train:14901-15000batch: iter_time=8.694e-05, forward_time=0.142, loss_ctc=62.755, loss_interctc_layer6=69.290, loss_interctc_layer12=56.939, loss_interctc_layer15=51.898, loss_interctc_layer21=65.121, loss=61.201, backward_time=0.207, grad_norm=56.925, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=7.185e-05, train_time=1.246 +[gpua006:0/64] 2024-02-11 20:11:19,709 (trainer:355) INFO: 31epoch results: [train] iter_time=0.192, forward_time=0.150, loss_ctc=76.324, loss_interctc_layer6=82.977, loss_interctc_layer12=68.772, loss_interctc_layer15=63.017, loss_interctc_layer21=79.031, loss=74.024, backward_time=0.210, grad_norm=80.090, clip=100.000, loss_scale=2.224e+31, optim_step_time=0.140, optim0_lr0=7.244e-05, train_time=1.641, time=6 hours, 50 minutes and 49.26 seconds, total_count=465000, gpu_max_cached_mem_GB=33.436, [valid] loss_ctc=42.701, cer_ctc=0.199, loss_interctc_layer6=48.537, cer_interctc_layer6=0.214, loss_interctc_layer12=35.490, cer_interctc_layer12=0.148, loss_interctc_layer15=31.224, cer_interctc_layer15=0.124, loss_interctc_layer21=45.219, cer_interctc_layer21=0.209, loss=40.634, time=30 minutes and 41.62 seconds, total_count=144801, gpu_max_cached_mem_GB=33.436 +[gpua006:0/64] 2024-02-11 20:11:49,216 (trainer:410) INFO: The best model has been updated: valid.cer_ctc, valid.loss_ctc, valid.total_count +[gpua006:0/64] 2024-02-11 20:11:49,270 (trainer:289) INFO: 32/45epoch started. Estimated time to finish: 4 days, 7 hours and 8 minutes +[gpua006:0/64] 2024-02-11 20:11:49,285 (multiple_iter_factory:32) INFO: Building 0th iter-factory... +[gpua006:0/64] 2024-02-11 20:12:07,864 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-11 20:12:11,236 (abs_task:1660) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.1", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.1", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.1", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.1", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-11 20:12:11,236 (abs_task:1661) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.1, +[gpua006:0/64] 2024-02-11 20:12:11,240 (abs_task:1662) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257 +srun: Job step aborted: Waiting up to 32 seconds for job step to finish.