diff --git "a/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/train.1.log" "b/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/train.1.log" new file mode 100644--- /dev/null +++ "b/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/train.1.log" @@ -0,0 +1,2842 @@ +# Running on gpua006.delta.ncsa.illinois.edu +# Started at Thu Feb 15 11:52:42 CST 2024 +# SLURMD_NODENAME=gpua006 +# SLURM_CLUSTER_NAME=delta +# SLURM_CONF=/var/spool/slurmd/conf-cache/slurm.conf +# SLURM_CPUS_ON_NODE=64 +# SLURM_CPUS_PER_TASK=64 +# SLURM_EXPORT_ENV=PATH +# SLURM_GET_USER_ENV=1 +# SLURM_GPUS_ON_NODE=4 +# SLURM_GTIDS=0 +# SLURM_JOBID=2991674 +# SLURM_JOB_ACCOUNT=bbjs-delta-gpu +# SLURM_JOB_CPUS_PER_NODE='64(x16)' +# SLURM_JOB_END_TIME=1708192328 +# SLURM_JOB_GID=202 +# SLURM_JOB_GPUS=0,1,2,3 +# SLURM_JOB_ID=2991674 +# SLURM_JOB_NAME=exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/train.log +# SLURM_JOB_NODELIST='gpua[006-007,012,016,033,038-040,049,054-055,057,079-080,085,089]' +# SLURM_JOB_NUM_NODES=16 +# SLURM_JOB_PARTITION=gpuA100x4 +# SLURM_JOB_QOS=bbjs-delta-gpu +# SLURM_JOB_RESERVATION=bbjs +# SLURM_JOB_START_TIME=1708019528 +# SLURM_JOB_UID=68077 +# SLURM_JOB_USER=peng6 +# SLURM_LOCALID=0 +# SLURM_MEM_PER_NODE=240000 +# SLURM_MPI_TYPE=pmi2 +# SLURM_NNODES=16 +# SLURM_NODEID=0 +# SLURM_NODELIST='gpua[006-007,012,016,033,038-040,049,054-055,057,079-080,085,089]' +# SLURM_NODE_ALIASES='(null)' +# SLURM_OPEN_MODE=a +# SLURM_PRIO_PROCESS=0 +# SLURM_PROCID=0 +# SLURM_SUBMIT_DIR=/scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1 +# SLURM_SUBMIT_HOST=dt-login03.delta.ncsa.illinois.edu +# SLURM_TASKS_PER_NODE='1(x16)' +# SLURM_TASK_PID=976156 +# SLURM_TOPOLOGY_ADDR=ss00.ss05.gpua006 +# SLURM_TOPOLOGY_ADDR_PATTERN=switch.switch.node +# SLURM_WORKING_CLUSTER=delta:dt-sched:6817:9984:109 +# srun --export=ALL python3 -m espnet2.bin.s2t_train --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_0838d6cb-1c7e-4970-b085-c26d69a4c412 +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +GpuFreq=control_disabled +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_0838d6cb-1c7e-4970-b085-c26d69a4c412 +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_0838d6cb-1c7e-4970-b085-c26d69a4c412 +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_0838d6cb-1c7e-4970-b085-c26d69a4c412 +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_0838d6cb-1c7e-4970-b085-c26d69a4c412 +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_0838d6cb-1c7e-4970-b085-c26d69a4c412 +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_st/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_0838d6cb-1c7e-4970-b085-c26d69a4c412 +ats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_0838d6cb-1c7e-4970-b085-c26d69a4c412 +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_0838d6cb-1c7e-4970-b085-c26d69a4c412 +/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_st/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_st/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_st/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_st/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_st/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_st/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_st/scratch/bbjs/peng6/espnet-owsm-ctc-2/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-owsm-ctc-2/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000 --config conf/train_s2t_multitask-ctc_ebf27_conv2d8_size1024.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_0838d6cb-1c7e-4970-b085-c26d69a4c412 +ats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_0838d6cb-1c7e-4970-b085-c26d69a4c412 +ats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_methats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_0838d6cb-1c7e-4970-b085-c26d69a4c412 +od file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_0838d6cb-1c7e-4970-b085-c26d69a4c412 +ats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_0838d6cb-1c7e-4970-b085-c26d69a4c412 +ats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_0838d6cb-1c7e-4970-b085-c26d69a4c412 +ats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_0838d6cb-1c7e-4970-b085-c26d69a4c412 +ats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-owsm-ctc-2/egs2/owsm_v3.1_ctc/s2t1/exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/.dist_init_0838d6cb-1c7e-4970-b085-c26d69a4c412 +[gpua006:0/64] 2024-02-15 12:00:39,764 (distributed_c10d:319) INFO: Added key: store_based_barrier_key:1 to store for rank: 0 +[gpua006:0/64] 2024-02-15 12:01:02,819 (distributed_c10d:353) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 64 nodes. +[gpua006:0/64] 2024-02-15 12:01:02,895 (s2t:420) INFO: Vocabulary size: 50002 +[gpua006:0/64] 2024-02-15 12:01:18,961 (abs_task:1270) INFO: pytorch.version=1.13.1, cuda.available=True, cudnn.version=8500, cudnn.benchmark=False, cudnn.deterministic=True +[gpua006:0/64] 2024-02-15 12:01:18,973 (abs_task:1271) INFO: Model structure: +ESPnetS2TCTCModel( + (frontend): DefaultFrontend( + (stft): Stft(n_fft=512, win_length=400, hop_length=160, center=True, normalized=False, onesided=True) + (frontend): Frontend() + (logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False) + ) + (specaug): SpecAug( + (freq_mask): MaskAlongAxis(mask_width_range=[0, 27], num_mask=2, axis=freq) + (time_mask): MaskAlongAxisVariableMaxWidth(mask_width_ratio_range=[0.0, 0.05], num_mask=10, axis=time) + ) + (normalize): GlobalMVN(stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz, norm_means=True, norm_vars=True) + (encoder): EBranchformerCTCEncoder( + (embed): Conv2dSubsampling8( + (conv): Sequential( + (0): Conv2d(1, 1024, kernel_size=(3, 3), stride=(2, 2)) + (1): ReLU() + (2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2)) + (3): ReLU() + (4): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2)) + (5): ReLU() + ) + (out): Linear(in_features=9216, out_features=1024, bias=True) + (pos_enc): PositionalEncoding( + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (encoders): MultiSequential( + (0): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (1): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (2): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (3): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (4): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (5): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (6): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (7): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (8): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (9): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (10): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (11): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (12): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (13): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (14): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (15): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (16): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (17): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (18): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (19): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (20): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (21): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (22): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (23): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (24): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (25): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + (26): EBranchformerEncoderLayer( + (attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (cgmlp): ConvolutionalGatingMLP( + (channel_proj1): Sequential( + (0): Linear(in_features=1024, out_features=4096, bias=True) + (1): GELU(approximate='none') + ) + (csgu): ConvolutionalSpatialGatingUnit( + (norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True) + (conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (act): Identity() + (dropout): Dropout(p=0.1, inplace=False) + ) + (channel_proj2): Linear(in_features=2048, out_features=1024, bias=True) + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (feed_forward_macaron): PositionwiseFeedForward( + (w_1): Linear(in_features=1024, out_features=4096, bias=True) + (w_2): Linear(in_features=4096, out_features=1024, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): Swish() + ) + (norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (cross_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=1024, out_features=1024, bias=True) + (linear_k): Linear(in_features=1024, out_features=1024, bias=True) + (linear_v): Linear(in_features=1024, out_features=1024, bias=True) + (linear_out): Linear(in_features=1024, out_features=1024, bias=True) + (dropout): Identity() + ) + (norm_cross_attn): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + (depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048) + (merge_proj): Linear(in_features=2048, out_features=1024, bias=True) + ) + ) + (after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True) + (conditioning_layer): Linear(in_features=50002, out_features=1024, bias=True) + ) + (prompt_encoder): TransformerEncoder( + (encoders): MultiSequential( + (0): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=512, out_features=512, bias=True) + (linear_k): Linear(in_features=512, out_features=512, bias=True) + (linear_v): Linear(in_features=512, out_features=512, bias=True) + (linear_out): Linear(in_features=512, out_features=512, bias=True) + (dropout): Identity() + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=512, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=512, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (1): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=512, out_features=512, bias=True) + (linear_k): Linear(in_features=512, out_features=512, bias=True) + (linear_v): Linear(in_features=512, out_features=512, bias=True) + (linear_out): Linear(in_features=512, out_features=512, bias=True) + (dropout): Identity() + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=512, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=512, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (2): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=512, out_features=512, bias=True) + (linear_k): Linear(in_features=512, out_features=512, bias=True) + (linear_v): Linear(in_features=512, out_features=512, bias=True) + (linear_out): Linear(in_features=512, out_features=512, bias=True) + (dropout): Identity() + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=512, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=512, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + (3): EncoderLayer( + (self_attn): MultiHeadedAttention( + (linear_q): Linear(in_features=512, out_features=512, bias=True) + (linear_k): Linear(in_features=512, out_features=512, bias=True) + (linear_v): Linear(in_features=512, out_features=512, bias=True) + (linear_out): Linear(in_features=512, out_features=512, bias=True) + (dropout): Identity() + ) + (feed_forward): PositionwiseFeedForward( + (w_1): Linear(in_features=512, out_features=2048, bias=True) + (w_2): Linear(in_features=2048, out_features=512, bias=True) + (dropout): Dropout(p=0.1, inplace=False) + (activation): ReLU() + ) + (norm1): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + (norm2): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + (dropout): Dropout(p=0.1, inplace=False) + ) + ) + (after_norm): LayerNorm((512,), eps=1e-12, elementwise_affine=True) + ) + (embed): Embedding(50002, 512) + (pos_enc): PositionalEncoding( + (dropout): Dropout(p=0.0, inplace=False) + ) + (embed_proj): Linear(in_features=512, out_features=1024, bias=True) + (prompt_proj): Linear(in_features=512, out_features=1024, bias=True) + (ctc): CTC( + (ctc_lo): Linear(in_features=1024, out_features=50002, bias=True) + (ctc_loss): CTCLoss() + ) +) + +Model summary: + Class Name: ESPnetS2TCTCModel + Total Number of model parameters: 1.01 B + Number of trainable parameters: 1.01 B (100.0%) + Size: 4.02 GB + Type: torch.float32 +[gpua006:0/64] 2024-02-15 12:01:18,973 (abs_task:1274) INFO: Optimizer: +AdamW ( +Parameter Group 0 + amsgrad: False + betas: [0.9, 0.98] + capturable: False + eps: 1e-06 + foreach: None + initial_lr: 0.0002 + lr: 1.6666666666666667e-09 + maximize: False + weight_decay: 0.0 +) +[gpua006:0/64] 2024-02-15 12:01:18,974 (abs_task:1275) INFO: Scheduler: PiecewiseLinearWarmupLR(warmup_steps_list=[0, 30000, 60000], warmup_lr_list=[0.0, 5e-05, 0.0002]) +[gpua006:0/64] 2024-02-15 12:01:18,976 (abs_task:1284) INFO: Saving the configuration in exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/config.yaml +[gpua006:0/64] 2024-02-15 12:01:26,056 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-15 12:01:27,182 (abs_task:1660) INFO: [valid] dataset: +ESPnetDataset( + speech: {"path": "dump/raw/dev_v3/wav.scp", "type": "kaldi_ark"} + text_prev: {"path": "dump/raw/dev_v3/text.prev", "type": "text"} + text_ctc: {"path": "dump/raw/dev_v3/text.ctc", "type": "text"} + text: {"path": "dump/raw/dev_v3/text", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-15 12:01:27,182 (abs_task:1661) INFO: [valid] Batch sampler: UnsortedBatchSampler(N-batch=4671, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/valid/speech_shape, +[gpua006:0/64] 2024-02-15 12:01:27,191 (abs_task:1662) INFO: [valid] mini-batch sizes summary: N-batch=4671, mean=256.0, min=256, max=257 +[gpua006:0/64] 2024-02-15 12:02:53,964 (trainer:167) INFO: The training was resumed using exp/s2t_train_s2t_multitask-ctc_ebf27_conv2d8_size1024_raw_bpe50000/checkpoint.pth +gpua006:976275:976275 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.6<0> +gpua006:976275:976275 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua006:976275:976275 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua006:976275:976275 [0] NCCL INFO cudaDriverVersion 12020 +NCCL version 2.14.3+cuda11.7 +[gpua006:0/64] 2024-02-15 12:03:18,809 (trainer:301) INFO: 44/45epoch started +[gpua006:0/64] 2024-02-15 12:03:18,852 (multiple_iter_factory:32) INFO: Building 0th iter-factory... +[gpua006:0/64] 2024-02-15 12:03:37,232 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-15 12:03:40,774 (abs_task:1660) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.9", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.9", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.9", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.9", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-15 12:03:40,775 (abs_task:1661) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.9, +[gpua006:0/64] 2024-02-15 12:03:40,778 (abs_task:1662) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257 +gpua049:151698:151698 [1] NCCL INFO cudaDriverVersion 12020 +gpua049:151698:151698 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.49<0> +gpua049:151698:151698 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua049:151698:151698 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua049:151698:151826 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua049:151698:151826 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua049:151698:151826 [1] NCCL INFO Using network AWS Libfabric +gpua049:151698:151826 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua049:151698:151826 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua049:151698:151826 [1] NCCL INFO Trees [0] 34/16/-1->33->32 [1] 34/-1/-1->33->32 +gpua049:151698:151826 [1] NCCL INFO Channel 00/0 : 33[46000] -> 34[85000] via P2P/IPC/read +gpua049:151698:151826 [1] NCCL INFO Channel 01/0 : 33[46000] -> 34[85000] via P2P/IPC/read +gpua049:151698:151826 [1] NCCL INFO Connected all rings +gpua049:151698:151826 [1] NCCL INFO Channel 00/0 : 16[7000] -> 33[46000] [receive] via NET/AWS Libfabric/1 +gpua049:151698:151826 [1] NCCL INFO Channel 00/0 : 33[46000] -> 16[7000] [send] via NET/AWS Libfabric/1 +gpua049:151698:151826 [1] NCCL INFO Channel 00/0 : 33[46000] -> 32[7000] via P2P/IPC/read +gpua049:151698:151826 [1] NCCL INFO Channel 01/0 : 33[46000] -> 32[7000] via P2P/IPC/read +gpua049:151698:151826 [1] NCCL INFO Connected all trees +gpua049:151698:151826 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua049:151698:151826 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua049:151698:151826 [1] NCCL INFO comm 0x5623dac13d90 rank 33 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua033:1942187:1942187 [0] NCCL INFO cudaDriverVersion 12020 +gpua033:1942187:1942187 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.33<0> +gpua033:1942187:1942187 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua033:1942187:1942187 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua033:1942187:1942312 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua033:1942187:1942312 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua033:1942187:1942312 [0] NCCL INFO Using network AWS Libfabric +gpua033:1942187:1942312 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua033:1942187:1942312 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua033:1942187:1942312 [0] NCCL INFO Trees [0] 17/24/-1->16->33 [1] 17/-1/-1->16->20 +gpua033:1942187:1942312 [0] NCCL INFO Channel 00/0 : 15[c7000] -> 16[7000] [receive] via NET/AWS Libfabric/1 +gpua007:1877471:1877471 [1] NCCL INFO cudaDriverVersion 12020 +gpua007:1877471:1877471 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.7<0> +gpua007:1877471:1877471 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua007:1877471:1877471 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua007:1877471:1877610 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua007:1877471:1877610 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua007:1877471:1877610 [1] NCCL INFO Using network AWS Libfabric +gpua007:1877471:1877610 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua007:1877471:1877610 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua007:1877471:1877610 [1] NCCL INFO Trees [0] 6/-1/-1->5->4 [1] 6/8/-1->5->4 +gpua007:1877471:1877610 [1] NCCL INFO Channel 00/0 : 5[46000] -> 6[85000] via P2P/IPC/read +gpua007:1877471:1877610 [1] NCCL INFO Channel 01/0 : 5[46000] -> 6[85000] via P2P/IPC/read +gpua012:3029511:3029511 [2] NCCL INFO cudaDriverVersion 12020 +gpua012:3029511:3029511 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.12<0> +gpua012:3029511:3029511 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua012:3029511:3029511 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua012:3029511:3029602 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua012:3029511:3029602 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua012:3029511:3029602 [2] NCCL INFO Using network AWS Libfabric +gpua012:3029511:3029602 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua012:3029511:3029602 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua012:3029511:3029602 [2] NCCL INFO Trees [0] 11/-1/-1->10->9 [1] 11/-1/-1->10->9 +gpua012:3029511:3029602 [2] NCCL INFO Channel 00/0 : 10[85000] -> 11[c7000] via P2P/IPC/read +gpua012:3029511:3029602 [2] NCCL INFO Channel 01/0 : 10[85000] -> 11[c7000] via P2P/IPC/read +gpua055:421545:421545 [3] NCCL INFO cudaDriverVersion 12020 +gpua055:421545:421545 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.55<0> +gpua055:421545:421545 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua055:421545:421545 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua055:421545:436565 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua055:421545:436565 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua055:421545:436565 [3] NCCL INFO Using network AWS Libfabric +gpua055:421545:436565 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua055:421545:436565 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua055:421545:436565 [3] NCCL INFO Trees [0] -1/-1/-1->43->42 [1] -1/-1/-1->43->42 +gpua055:421545:436565 [3] NCCL INFO Channel 00/0 : 43[c7000] -> 44[7000] [send] via NET/AWS Libfabric/1 +gpua055:421545:436565 [3] NCCL INFO Channel 01/0 : 43[c7000] -> 44[7000] [send] via NET/AWS Libfabric/1 +gpua039:3958110:3958110 [3] NCCL INFO cudaDriverVersion 12020 +gpua039:3958110:3958110 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.39<0> +gpua039:3958110:3958110 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua039:3958110:3958110 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua039:3958110:3958199 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua039:3958110:3958199 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua039:3958110:3958199 [3] NCCL INFO Using network AWS Libfabric +gpua039:3958110:3958199 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua039:3958110:3958199 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua039:3958110:3958199 [3] NCCL INFO Trees [0] -1/-1/-1->27->26 [1] -1/-1/-1->27->26 +gpua039:3958110:3958199 [3] NCCL INFO Channel 00/0 : 27[c7000] -> 28[7000] [send] via NET/AWS Libfabric/1 +gpua039:3958110:3958199 [3] NCCL INFO Channel 01/0 : 27[c7000] -> 28[7000] [send] via NET/AWS Libfabric/1 +gpua049:151700:151700 [3] NCCL INFO cudaDriverVersion 12020 +gpua049:151700:151700 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.49<0> +gpua049:151700:151700 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua049:151700:151700 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua049:151700:151825 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua049:151700:151825 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua049:151700:151825 [3] NCCL INFO Using network AWS Libfabric +gpua049:151700:151825 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua049:151700:151825 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua049:151700:151825 [3] NCCL INFO Trees [0] -1/-1/-1->35->34 [1] -1/-1/-1->35->34 +gpua049:151700:151825 [3] NCCL INFO Channel 00/0 : 35[c7000] -> 36[7000] [send] via NET/AWS Libfabric/1 +gpua049:151700:151825 [3] NCCL INFO Channel 01/0 : 35[c7000] -> 36[7000] [send] via NET/AWS Libfabric/1 +gpua079:4011106:4011106 [0] NCCL INFO cudaDriverVersion 12020 +gpua079:4011106:4011106 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.79<0> +gpua079:4011106:4011106 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua079:4011106:4011106 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua079:4011106:4011246 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua079:4011106:4011246 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua079:4011106:4011246 [0] NCCL INFO Using network AWS Libfabric +gpua079:4011106:4011246 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua079:4011106:4011246 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua079:4011106:4011246 [0] NCCL INFO Trees [0] 49/56/-1->48->32 [1] 49/-1/-1->48->52 +gpua079:4011106:4011246 [0] NCCL INFO Channel 00/0 : 47[c7000] -> 48[7000] [receive] via NET/AWS Libfabric/1 +gpua038:474862:474862 [1] NCCL INFO cudaDriverVersion 12020 +gpua038:474862:474862 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.38<0> +gpua038:474862:474862 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua038:474862:474862 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua038:474862:474944 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua038:474862:474944 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua038:474862:474944 [1] NCCL INFO Using network AWS Libfabric +gpua038:474862:474944 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua038:474862:474944 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua038:474862:474944 [1] NCCL INFO Trees [0] 22/-1/-1->21->20 [1] 22/24/-1->21->20 +gpua038:474862:474944 [1] NCCL INFO Channel 00/0 : 21[46000] -> 22[85000] via P2P/IPC/read +gpua038:474862:474944 [1] NCCL INFO Channel 01/0 : 21[46000] -> 22[85000] via P2P/IPC/read +gpua038:474862:474944 [1] NCCL INFO Connected all rings +gpua016:879605:879605 [1] NCCL INFO cudaDriverVersion 12020 +gpua016:879605:879605 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.16<0> +gpua016:879605:879605 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua016:879605:879605 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua016:879605:879689 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua016:879605:879689 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua016:879605:879689 [1] NCCL INFO Using network AWS Libfabric +gpua016:879605:879689 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua016:879605:879689 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua016:879605:879689 [1] NCCL INFO Trees [0] 14/-1/-1->13->12 [1] 14/20/-1->13->12 +gpua016:879605:879689 [1] NCCL INFO Channel 00/0 : 13[46000] -> 14[85000] via P2P/IPC/read +gpua016:879605:879689 [1] NCCL INFO Channel 01/0 : 13[46000] -> 14[85000] via P2P/IPC/read +gpua016:879605:879689 [1] NCCL INFO Connected all rings +gpua089:1166997:1166997 [2] NCCL INFO cudaDriverVersion 12020 +gpua089:1166997:1166997 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.89<0> +gpua089:1166997:1166997 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua089:1166997:1166997 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua089:1166997:1167308 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua089:1166997:1167308 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua089:1166997:1167308 [2] NCCL INFO Using network AWS Libfabric +gpua089:1166997:1167308 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua089:1166997:1167308 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua089:1166997:1167308 [2] NCCL INFO Trees [0] 63/-1/-1->62->61 [1] 63/-1/-1->62->61 +gpua089:1166997:1167308 [2] NCCL INFO Channel 00/0 : 62[85000] -> 63[c7000] via P2P/IPC/read +gpua089:1166997:1167308 [2] NCCL INFO Channel 01/0 : 62[85000] -> 63[c7000] via P2P/IPC/read +gpua006:976275:976366 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua006:976275:976366 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua006:976275:976366 [0] NCCL INFO Using network AWS Libfabric +gpua006:976275:976366 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua006:976275:976366 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua006:976275:976366 [0] NCCL INFO Channel 00/02 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 +gpua006:976275:976366 [0] NCCL INFO Channel 01/02 : 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 +gpua006:976275:976366 [0] NCCL INFO Trees [0] 1/32/-1->0->-1 [1] 1/-1/-1->0->4 +gpua006:976275:976366 [0] NCCL INFO Channel 00/0 : 63[c7000] -> 0[7000] [receive] via NET/AWS Libfabric/1 +gpua006:976275:976366 [0] NCCL INFO Channel 01/0 : 63[c7000] -> 0[7000] [receive] via NET/AWS Libfabric/1 +gpua006:976275:976366 [0] NCCL INFO Channel 00/0 : 0[7000] -> 1[46000] via P2P/IPC/read +gpua085:120732:120732 [0] NCCL INFO cudaDriverVersion 12020 +gpua085:120732:120732 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.85<0> +gpua085:120732:120732 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua085:120732:120732 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua085:120732:120822 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua085:120732:120822 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua085:120732:120822 [0] NCCL INFO Using network AWS Libfabric +gpua085:120732:120822 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua085:120732:120822 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua085:120732:120822 [0] NCCL INFO Trees [0] 57/60/-1->56->48 [1] 57/-1/-1->56->53 +gpua085:120732:120822 [0] NCCL INFO Channel 00/0 : 55[c7000] -> 56[7000] [receive] via NET/AWS Libfabric/1 +gpua085:120732:120822 [0] NCCL INFO Channel 01/0 : 55[c7000] -> 56[7000] [receive] via NET/AWS Libfabric/1 +gpua057:4182114:4182114 [2] NCCL INFO cudaDriverVersion 12020 +gpua057:4182114:4182114 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.57<0> +gpua057:4182114:4182114 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua057:4182114:4182114 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua057:4182114:4182220 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua057:4182114:4182220 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua057:4182114:4182220 [2] NCCL INFO Using network AWS Libfabric +gpua057:4182114:4182220 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua057:4182114:4182220 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua057:4182114:4182220 [2] NCCL INFO Trees [0] 47/-1/-1->46->45 [1] 47/-1/-1->46->45 +gpua057:4182114:4182220 [2] NCCL INFO Channel 00/0 : 46[85000] -> 47[c7000] via P2P/IPC/read +gpua057:4182114:4182220 [2] NCCL INFO Channel 01/0 : 46[85000] -> 47[c7000] via P2P/IPC/read +gpua040:4155648:4155648 [2] NCCL INFO cudaDriverVersion 12020 +gpua040:4155648:4155648 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.40<0> +gpua040:4155648:4155648 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua040:4155648:4155648 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua040:4155648:4155739 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua040:4155648:4155739 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua040:4155648:4155739 [2] NCCL INFO Using network AWS Libfabric +gpua040:4155648:4155739 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua040:4155648:4155739 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua040:4155648:4155739 [2] NCCL INFO Trees [0] 31/-1/-1->30->29 [1] 31/-1/-1->30->29 +gpua040:4155648:4155739 [2] NCCL INFO Channel 00/0 : 30[85000] -> 31[c7000] via P2P/IPC/read +gpua040:4155648:4155739 [2] NCCL INFO Channel 01/0 : 30[85000] -> 31[c7000] via P2P/IPC/read +gpua080:3566292:3566292 [3] NCCL INFO cudaDriverVersion 12020 +gpua080:3566292:3566292 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.80<0> +gpua080:3566292:3566292 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua080:3566292:3566292 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua080:3566292:3566382 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua080:3566292:3566382 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua080:3566292:3566382 [3] NCCL INFO Using network AWS Libfabric +gpua080:3566292:3566382 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua080:3566292:3566382 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua080:3566292:3566382 [3] NCCL INFO Trees [0] -1/-1/-1->55->54 [1] -1/-1/-1->55->54 +gpua080:3566292:3566382 [3] NCCL INFO Channel 00/0 : 55[c7000] -> 56[7000] [send] via NET/AWS Libfabric/1 +gpua080:3566292:3566382 [3] NCCL INFO Channel 01/0 : 55[c7000] -> 56[7000] [send] via NET/AWS Libfabric/1 +gpua054:719588:719588 [1] NCCL INFO cudaDriverVersion 12020 +gpua054:719588:719588 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.54<0> +gpua054:719588:719588 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua054:719588:719588 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua054:719588:719718 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua054:719588:719718 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua054:719588:719718 [1] NCCL INFO Using network AWS Libfabric +gpua054:719588:719718 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua054:719588:719718 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua054:719588:719718 [1] NCCL INFO Trees [0] 38/-1/-1->37->36 [1] 38/40/-1->37->36 +gpua054:719588:719718 [1] NCCL INFO Channel 00/0 : 37[46000] -> 38[85000] via P2P/IPC/read +gpua054:719588:719718 [1] NCCL INFO Channel 01/0 : 37[46000] -> 38[85000] via P2P/IPC/read +gpua054:719588:719718 [1] NCCL INFO Connected all rings +gpua033:1942187:1942312 [0] NCCL INFO Channel 01/0 : 15[c7000] -> 16[7000] [receive] via NET/AWS Libfabric/1 +gpua033:1942187:1942312 [0] NCCL INFO Channel 00/0 : 16[7000] -> 17[46000] via P2P/IPC/read +gpua033:1942187:1942312 [0] NCCL INFO Channel 01/0 : 16[7000] -> 17[46000] via P2P/IPC/read +gpua033:1942187:1942312 [0] NCCL INFO Connected all rings +gpua033:1942187:1942312 [0] NCCL INFO Channel 01/0 : 16[7000] -> 20[7000] [send] via NET/AWS Libfabric/1 +gpua033:1942187:1942312 [0] NCCL INFO Channel 00/0 : 16[7000] -> 24[7000] [send] via NET/AWS Libfabric/1 +gpua033:1942187:1942312 [0] NCCL INFO Channel 00/0 : 16[7000] -> 33[46000] [send] via NET/AWS Libfabric/1 +gpua033:1942187:1942312 [0] NCCL INFO Channel 00/0 : 33[46000] -> 16[7000] [receive] via NET/AWS Libfabric/1 +gpua033:1942187:1942312 [0] NCCL INFO Channel 00/0 : 24[7000] -> 16[7000] [receive] via NET/AWS Libfabric/1 +gpua033:1942187:1942312 [0] NCCL INFO Channel 01/0 : 20[7000] -> 16[7000] [receive] via NET/AWS Libfabric/1 +gpua007:1877471:1877610 [1] NCCL INFO Connected all rings +gpua007:1877471:1877610 [1] NCCL INFO Channel 01/0 : 5[46000] -> 8[7000] [send] via NET/AWS Libfabric/1 +gpua007:1877471:1877610 [1] NCCL INFO Channel 01/0 : 8[7000] -> 5[46000] [receive] via NET/AWS Libfabric/1 +gpua007:1877471:1877610 [1] NCCL INFO Channel 00/0 : 5[46000] -> 4[7000] via P2P/IPC/read +gpua007:1877471:1877610 [1] NCCL INFO Channel 01/0 : 5[46000] -> 4[7000] via P2P/IPC/read +gpua007:1877471:1877610 [1] NCCL INFO Connected all trees +gpua007:1877471:1877610 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua007:1877471:1877610 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua007:1877471:1877610 [1] NCCL INFO comm 0x5572e6f75260 rank 5 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua012:3029511:3029602 [2] NCCL INFO Connected all rings +gpua012:3029511:3029602 [2] NCCL INFO Channel 00/0 : 10[85000] -> 9[46000] via P2P/IPC/read +gpua012:3029511:3029602 [2] NCCL INFO Channel 01/0 : 10[85000] -> 9[46000] via P2P/IPC/read +gpua012:3029511:3029602 [2] NCCL INFO Connected all trees +gpua012:3029511:3029602 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua012:3029511:3029602 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua012:3029511:3029602 [2] NCCL INFO comm 0x563293232be0 rank 10 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua055:421545:436565 [3] NCCL INFO Connected all rings +gpua055:421545:436565 [3] NCCL INFO Channel 00/0 : 43[c7000] -> 42[85000] via P2P/IPC/read +gpua055:421545:436565 [3] NCCL INFO Channel 01/0 : 43[c7000] -> 42[85000] via P2P/IPC/read +gpua055:421545:436565 [3] NCCL INFO Connected all trees +gpua055:421545:436565 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua055:421545:436565 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua055:421545:436565 [3] NCCL INFO comm 0x5617cc287110 rank 43 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua039:3958110:3958199 [3] NCCL INFO Connected all rings +gpua039:3958110:3958199 [3] NCCL INFO Channel 00/0 : 27[c7000] -> 26[85000] via P2P/IPC/read +gpua039:3958110:3958199 [3] NCCL INFO Channel 01/0 : 27[c7000] -> 26[85000] via P2P/IPC/read +gpua039:3958110:3958199 [3] NCCL INFO Connected all trees +gpua039:3958110:3958199 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua039:3958110:3958199 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua039:3958110:3958199 [3] NCCL INFO comm 0x55d87bf6d410 rank 27 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua049:151700:151825 [3] NCCL INFO Connected all rings +gpua049:151700:151825 [3] NCCL INFO Channel 00/0 : 35[c7000] -> 34[85000] via P2P/IPC/read +gpua049:151700:151825 [3] NCCL INFO Channel 01/0 : 35[c7000] -> 34[85000] via P2P/IPC/read +gpua049:151700:151825 [3] NCCL INFO Connected all trees +gpua049:151700:151825 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua049:151700:151825 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua049:151700:151825 [3] NCCL INFO comm 0x555aad91f7e0 rank 35 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua079:4011106:4011246 [0] NCCL INFO Channel 01/0 : 47[c7000] -> 48[7000] [receive] via NET/AWS Libfabric/1 +gpua079:4011106:4011246 [0] NCCL INFO Channel 00/0 : 48[7000] -> 49[46000] via P2P/IPC/read +gpua079:4011106:4011246 [0] NCCL INFO Channel 01/0 : 48[7000] -> 49[46000] via P2P/IPC/read +gpua079:4011106:4011246 [0] NCCL INFO Connected all rings +gpua079:4011106:4011246 [0] NCCL INFO Channel 01/0 : 48[7000] -> 52[7000] [send] via NET/AWS Libfabric/1 +gpua079:4011106:4011246 [0] NCCL INFO Channel 00/0 : 48[7000] -> 56[7000] [send] via NET/AWS Libfabric/1 +gpua079:4011106:4011246 [0] NCCL INFO Channel 00/0 : 32[7000] -> 48[7000] [receive] via NET/AWS Libfabric/1 +gpua079:4011106:4011246 [0] NCCL INFO Channel 00/0 : 48[7000] -> 32[7000] [send] via NET/AWS Libfabric/1 +gpua079:4011106:4011246 [0] NCCL INFO Channel 00/0 : 56[7000] -> 48[7000] [receive] via NET/AWS Libfabric/1 +gpua079:4011106:4011246 [0] NCCL INFO Channel 01/0 : 52[7000] -> 48[7000] [receive] via NET/AWS Libfabric/1 +gpua038:474862:474944 [1] NCCL INFO Channel 01/0 : 21[46000] -> 24[7000] [send] via NET/AWS Libfabric/1 +gpua038:474862:474944 [1] NCCL INFO Channel 01/0 : 24[7000] -> 21[46000] [receive] via NET/AWS Libfabric/1 +gpua038:474862:474944 [1] NCCL INFO Channel 00/0 : 21[46000] -> 20[7000] via P2P/IPC/read +gpua038:474862:474944 [1] NCCL INFO Channel 01/0 : 21[46000] -> 20[7000] via P2P/IPC/read +gpua038:474862:474944 [1] NCCL INFO Connected all trees +gpua038:474862:474944 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua038:474862:474944 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua038:474862:474944 [1] NCCL INFO comm 0x55b238cb59e0 rank 21 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua016:879605:879689 [1] NCCL INFO Channel 01/0 : 13[46000] -> 20[7000] [send] via NET/AWS Libfabric/1 +gpua016:879605:879689 [1] NCCL INFO Channel 01/0 : 20[7000] -> 13[46000] [receive] via NET/AWS Libfabric/1 +gpua016:879605:879689 [1] NCCL INFO Channel 00/0 : 13[46000] -> 12[7000] via P2P/IPC/read +gpua016:879605:879689 [1] NCCL INFO Channel 01/0 : 13[46000] -> 12[7000] via P2P/IPC/read +gpua016:879605:879689 [1] NCCL INFO Connected all trees +gpua016:879605:879689 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua016:879605:879689 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua016:879605:879689 [1] NCCL INFO comm 0x55f84cd7d220 rank 13 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua016:879607:879607 [3] NCCL INFO cudaDriverVersion 12020 +gpua016:879607:879607 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.16<0> +gpua016:879607:879607 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua089:1166997:1167308 [2] NCCL INFO Connected all rings +gpua089:1166997:1167308 [2] NCCL INFO Channel 00/0 : 62[85000] -> 61[46000] via P2P/IPC/read +gpua089:1166997:1167308 [2] NCCL INFO Channel 01/0 : 62[85000] -> 61[46000] via P2P/IPC/read +gpua089:1166997:1167308 [2] NCCL INFO Connected all trees +gpua089:1166997:1167308 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua089:1166997:1167308 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua089:1166997:1167308 [2] NCCL INFO comm 0x557deaf40ee0 rank 62 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua006:976275:976366 [0] NCCL INFO Channel 01/0 : 0[7000] -> 1[46000] via P2P/IPC/read +gpua006:976275:976366 [0] NCCL INFO Connected all rings +gpua006:976275:976366 [0] NCCL INFO Channel 01/0 : 0[7000] -> 4[7000] [send] via NET/AWS Libfabric/1 +gpua006:976275:976366 [0] NCCL INFO Channel 00/0 : 32[7000] -> 0[7000] [receive] via NET/AWS Libfabric/1 +gpua006:976275:976366 [0] NCCL INFO Channel 00/0 : 0[7000] -> 32[7000] [send] via NET/AWS Libfabric/1 +gpua006:976275:976366 [0] NCCL INFO Channel 01/0 : 4[7000] -> 0[7000] [receive] via NET/AWS Libfabric/1 +gpua006:976275:976366 [0] NCCL INFO Connected all trees +gpua006:976275:976366 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua006:976275:976366 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua006:976275:976366 [0] NCCL INFO comm 0x55e162d728e0 rank 0 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua085:120732:120822 [0] NCCL INFO Channel 00/0 : 56[7000] -> 57[46000] via P2P/IPC/read +gpua085:120732:120822 [0] NCCL INFO Channel 01/0 : 56[7000] -> 57[46000] via P2P/IPC/read +gpua085:120732:120822 [0] NCCL INFO Connected all rings +gpua085:120732:120822 [0] NCCL INFO Channel 01/0 : 53[46000] -> 56[7000] [receive] via NET/AWS Libfabric/1 +gpua085:120732:120822 [0] NCCL INFO Channel 00/0 : 56[7000] -> 60[7000] [send] via NET/AWS Libfabric/1 +gpua085:120732:120822 [0] NCCL INFO Channel 00/0 : 48[7000] -> 56[7000] [receive] via NET/AWS Libfabric/1 +gpua085:120732:120822 [0] NCCL INFO Channel 00/0 : 56[7000] -> 48[7000] [send] via NET/AWS Libfabric/1 +gpua085:120732:120822 [0] NCCL INFO Channel 00/0 : 60[7000] -> 56[7000] [receive] via NET/AWS Libfabric/1 +gpua085:120732:120822 [0] NCCL INFO Channel 01/0 : 56[7000] -> 53[46000] [send] via NET/AWS Libfabric/1 +gpua085:120732:120822 [0] NCCL INFO Connected all trees +gpua085:120732:120822 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua057:4182114:4182220 [2] NCCL INFO Connected all rings +gpua057:4182114:4182220 [2] NCCL INFO Channel 00/0 : 46[85000] -> 45[46000] via P2P/IPC/read +gpua057:4182114:4182220 [2] NCCL INFO Channel 01/0 : 46[85000] -> 45[46000] via P2P/IPC/read +gpua057:4182114:4182220 [2] NCCL INFO Connected all trees +gpua057:4182114:4182220 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua057:4182114:4182220 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua057:4182114:4182220 [2] NCCL INFO comm 0x561ec8413430 rank 46 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua040:4155648:4155739 [2] NCCL INFO Connected all rings +gpua040:4155648:4155739 [2] NCCL INFO Channel 00/0 : 30[85000] -> 29[46000] via P2P/IPC/read +gpua040:4155648:4155739 [2] NCCL INFO Channel 01/0 : 30[85000] -> 29[46000] via P2P/IPC/read +gpua040:4155648:4155739 [2] NCCL INFO Connected all trees +gpua040:4155648:4155739 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua040:4155648:4155739 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua040:4155648:4155739 [2] NCCL INFO comm 0x555fc236f280 rank 30 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua080:3566292:3566382 [3] NCCL INFO Connected all rings +gpua080:3566292:3566382 [3] NCCL INFO Channel 00/0 : 55[c7000] -> 54[85000] via P2P/IPC/read +gpua080:3566292:3566382 [3] NCCL INFO Channel 01/0 : 55[c7000] -> 54[85000] via P2P/IPC/read +gpua080:3566292:3566382 [3] NCCL INFO Connected all trees +gpua080:3566292:3566382 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua080:3566292:3566382 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua080:3566292:3566382 [3] NCCL INFO comm 0x55f6f137f630 rank 55 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua054:719588:719718 [1] NCCL INFO Channel 01/0 : 37[46000] -> 40[7000] [send] via NET/AWS Libfabric/1 +gpua054:719588:719718 [1] NCCL INFO Channel 01/0 : 40[7000] -> 37[46000] [receive] via NET/AWS Libfabric/1 +gpua054:719588:719718 [1] NCCL INFO Channel 00/0 : 37[46000] -> 36[7000] via P2P/IPC/read +gpua054:719588:719718 [1] NCCL INFO Channel 01/0 : 37[46000] -> 36[7000] via P2P/IPC/read +gpua054:719588:719718 [1] NCCL INFO Connected all trees +gpua054:719588:719718 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua054:719588:719718 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua054:719588:719718 [1] NCCL INFO comm 0x5580f5821c20 rank 37 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua033:1942187:1942312 [0] NCCL INFO Connected all trees +gpua033:1942187:1942312 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua033:1942187:1942312 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua033:1942187:1942312 [0] NCCL INFO comm 0x56190bef37d0 rank 16 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua007:1877470:1877470 [0] NCCL INFO cudaDriverVersion 12020 +gpua007:1877470:1877470 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.7<0> +gpua007:1877470:1877470 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua007:1877470:1877470 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua007:1877470:1877609 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua007:1877470:1877609 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua007:1877470:1877609 [0] NCCL INFO Using network AWS Libfabric +gpua007:1877470:1877609 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua007:1877470:1877609 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua007:1877470:1877609 [0] NCCL INFO Trees [0] 5/-1/-1->4->9 [1] 5/0/-1->4->12 +gpua007:1877470:1877609 [0] NCCL INFO Channel 00/0 : 3[c7000] -> 4[7000] [receive] via NET/AWS Libfabric/1 +gpua007:1877470:1877609 [0] NCCL INFO Channel 01/0 : 3[c7000] -> 4[7000] [receive] via NET/AWS Libfabric/1 +gpua012:3029509:3029509 [0] NCCL INFO cudaDriverVersion 12020 +gpua012:3029509:3029509 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.12<0> +gpua012:3029509:3029509 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua012:3029509:3029509 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua012:3029509:3029603 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua012:3029509:3029603 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua012:3029509:3029603 [0] NCCL INFO Using network AWS Libfabric +gpua012:3029509:3029603 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua012:3029509:3029603 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua012:3029509:3029603 [0] NCCL INFO Trees [0] 9/12/-1->8->17 [1] 9/-1/-1->8->5 +gpua012:3029509:3029603 [0] NCCL INFO Channel 00/0 : 7[c7000] -> 8[7000] [receive] via NET/AWS Libfabric/1 +gpua012:3029509:3029603 [0] NCCL INFO Channel 01/0 : 7[c7000] -> 8[7000] [receive] via NET/AWS Libfabric/1 +gpua055:421543:421543 [1] NCCL INFO cudaDriverVersion 12020 +gpua055:421543:421543 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.55<0> +gpua055:421543:421543 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua055:421543:421543 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua055:421543:436564 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua055:421543:436564 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua055:421543:436564 [1] NCCL INFO Using network AWS Libfabric +gpua055:421543:436564 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua055:421543:436564 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua055:421543:436564 [1] NCCL INFO Trees [0] 42/36/-1->41->40 [1] 42/-1/-1->41->40 +gpua055:421543:436564 [1] NCCL INFO Channel 00/0 : 41[46000] -> 42[85000] via P2P/IPC/read +gpua055:421543:436564 [1] NCCL INFO Channel 01/0 : 41[46000] -> 42[85000] via P2P/IPC/read +gpua055:421543:436564 [1] NCCL INFO Connected all rings +gpua039:3958108:3958108 [1] NCCL INFO cudaDriverVersion 12020 +gpua039:3958108:3958108 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.39<0> +gpua039:3958108:3958108 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua039:3958108:3958108 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua039:3958108:3958197 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua039:3958108:3958197 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua039:3958108:3958197 [1] NCCL INFO Using network AWS Libfabric +gpua039:3958108:3958197 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua039:3958108:3958197 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua039:3958108:3958197 [1] NCCL INFO Trees [0] 26/20/-1->25->24 [1] 26/-1/-1->25->24 +gpua039:3958108:3958197 [1] NCCL INFO Channel 00/0 : 25[46000] -> 26[85000] via P2P/IPC/read +gpua039:3958108:3958197 [1] NCCL INFO Channel 01/0 : 25[46000] -> 26[85000] via P2P/IPC/read +gpua049:151697:151697 [0] NCCL INFO cudaDriverVersion 12020 +gpua049:151697:151697 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.49<0> +gpua049:151697:151697 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua049:151697:151697 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua049:151697:151824 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua049:151697:151824 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua049:151697:151824 [0] NCCL INFO Using network AWS Libfabric +gpua049:151697:151824 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua049:151697:151824 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua049:151697:151824 [0] NCCL INFO Trees [0] 33/48/-1->32->0 [1] 33/-1/-1->32->36 +gpua049:151697:151824 [0] NCCL INFO Channel 00/0 : 31[c7000] -> 32[7000] [receive] via NET/AWS Libfabric/1 +gpua049:151697:151824 [0] NCCL INFO Channel 01/0 : 31[c7000] -> 32[7000] [receive] via NET/AWS Libfabric/1 +gpua079:4011106:4011246 [0] NCCL INFO Connected all trees +gpua079:4011106:4011246 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua079:4011106:4011246 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua079:4011106:4011246 [0] NCCL INFO comm 0x56504a976be0 rank 48 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua038:474863:474863 [2] NCCL INFO cudaDriverVersion 12020 +gpua038:474863:474863 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.38<0> +gpua038:474863:474863 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua038:474863:474863 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua038:474863:474947 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua038:474863:474947 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua038:474863:474947 [2] NCCL INFO Using network AWS Libfabric +gpua038:474863:474947 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua038:474863:474947 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua038:474863:474947 [2] NCCL INFO Trees [0] 23/-1/-1->22->21 [1] 23/-1/-1->22->21 +gpua038:474863:474947 [2] NCCL INFO Channel 00/0 : 22[85000] -> 23[c7000] via P2P/IPC/read +gpua038:474863:474947 [2] NCCL INFO Channel 01/0 : 22[85000] -> 23[c7000] via P2P/IPC/read +gpua038:474863:474947 [2] NCCL INFO Connected all rings +gpua016:879607:879607 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua016:879607:879688 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua016:879607:879688 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua016:879607:879688 [3] NCCL INFO Using network AWS Libfabric +gpua016:879607:879688 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua016:879607:879688 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua016:879607:879688 [3] NCCL INFO Trees [0] -1/-1/-1->15->14 [1] -1/-1/-1->15->14 +gpua016:879607:879688 [3] NCCL INFO Channel 00/0 : 15[c7000] -> 16[7000] [send] via NET/AWS Libfabric/1 +gpua016:879607:879688 [3] NCCL INFO Channel 01/0 : 15[c7000] -> 16[7000] [send] via NET/AWS Libfabric/1 +gpua016:879607:879688 [3] NCCL INFO Connected all rings +gpua016:879607:879688 [3] NCCL INFO Channel 00/0 : 15[c7000] -> 14[85000] via P2P/IPC/read +gpua016:879607:879688 [3] NCCL INFO Channel 01/0 : 15[c7000] -> 14[85000] via P2P/IPC/read +gpua089:1166995:1166995 [0] NCCL INFO cudaDriverVersion 12020 +gpua089:1166995:1166995 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.89<0> +gpua089:1166995:1166995 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua089:1166995:1166995 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua089:1166995:1167306 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua089:1166995:1167306 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua089:1166995:1167306 [0] NCCL INFO Using network AWS Libfabric +gpua089:1166995:1167306 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua089:1166995:1167306 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua089:1166995:1167306 [0] NCCL INFO Trees [0] 61/-1/-1->60->56 [1] 61/28/-1->60->-1 +gpua089:1166995:1167306 [0] NCCL INFO Channel 00/0 : 59[c7000] -> 60[7000] [receive] via NET/AWS Libfabric/1 +gpua006:976278:976278 [3] NCCL INFO cudaDriverVersion 12020 +gpua006:976278:976278 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.6<0> +gpua006:976278:976278 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua006:976278:976278 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua006:976278:976364 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua006:976278:976364 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua006:976278:976364 [3] NCCL INFO Using network AWS Libfabric +gpua006:976278:976364 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua006:976278:976364 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua006:976278:976364 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2 +gpua006:976278:976364 [3] NCCL INFO Channel 00/0 : 3[c7000] -> 4[7000] [send] via NET/AWS Libfabric/1 +gpua006:976278:976364 [3] NCCL INFO Channel 01/0 : 3[c7000] -> 4[7000] [send] via NET/AWS Libfabric/1 +gpua085:120732:120822 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua085:120732:120822 [0] NCCL INFO comm 0x5577a1c0e150 rank 56 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua057:4182113:4182113 [1] NCCL INFO cudaDriverVersion 12020 +gpua057:4182113:4182113 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.57<0> +gpua057:4182113:4182113 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua057:4182113:4182113 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua057:4182113:4182222 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua057:4182113:4182222 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua057:4182113:4182222 [1] NCCL INFO Using network AWS Libfabric +gpua057:4182113:4182222 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua057:4182113:4182222 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua057:4182113:4182222 [1] NCCL INFO Trees [0] 46/-1/-1->45->44 [1] 46/52/-1->45->44 +gpua057:4182113:4182222 [1] NCCL INFO Channel 00/0 : 45[46000] -> 46[85000] via P2P/IPC/read +gpua057:4182113:4182222 [1] NCCL INFO Channel 01/0 : 45[46000] -> 46[85000] via P2P/IPC/read +gpua040:4155647:4155647 [1] NCCL INFO cudaDriverVersion 12020 +gpua040:4155647:4155647 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.40<0> +gpua040:4155647:4155647 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua040:4155647:4155647 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua040:4155647:4155740 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua040:4155647:4155740 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua040:4155647:4155740 [1] NCCL INFO Using network AWS Libfabric +gpua040:4155647:4155740 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua040:4155647:4155740 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua040:4155647:4155740 [1] NCCL INFO Trees [0] 30/-1/-1->29->28 [1] 30/44/-1->29->28 +gpua040:4155647:4155740 [1] NCCL INFO Channel 00/0 : 29[46000] -> 30[85000] via P2P/IPC/read +gpua040:4155647:4155740 [1] NCCL INFO Channel 01/0 : 29[46000] -> 30[85000] via P2P/IPC/read +gpua080:3566291:3566291 [2] NCCL INFO cudaDriverVersion 12020 +gpua080:3566291:3566291 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.80<0> +gpua080:3566291:3566291 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua080:3566291:3566291 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua080:3566291:3566381 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua080:3566291:3566381 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua080:3566291:3566381 [2] NCCL INFO Using network AWS Libfabric +gpua080:3566291:3566381 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua080:3566291:3566381 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua080:3566291:3566381 [2] NCCL INFO Trees [0] 55/-1/-1->54->53 [1] 55/-1/-1->54->53 +gpua080:3566291:3566381 [2] NCCL INFO Channel 00/0 : 54[85000] -> 55[c7000] via P2P/IPC/read +gpua080:3566291:3566381 [2] NCCL INFO Channel 01/0 : 54[85000] -> 55[c7000] via P2P/IPC/read +gpua054:719587:719587 [0] NCCL INFO cudaDriverVersion 12020 +gpua054:719587:719587 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.54<0> +gpua054:719587:719587 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua054:719587:719587 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua054:719587:719716 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua054:719587:719716 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua054:719587:719716 [0] NCCL INFO Using network AWS Libfabric +gpua054:719587:719716 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua054:719587:719716 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua054:719587:719716 [0] NCCL INFO Trees [0] 37/-1/-1->36->41 [1] 37/32/-1->36->44 +gpua054:719587:719716 [0] NCCL INFO Channel 00/0 : 35[c7000] -> 36[7000] [receive] via NET/AWS Libfabric/1 +gpua054:719587:719716 [0] NCCL INFO Channel 01/0 : 35[c7000] -> 36[7000] [receive] via NET/AWS Libfabric/1 +gpua033:1942190:1942190 [3] NCCL INFO cudaDriverVersion 12020 +gpua033:1942190:1942190 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.33<0> +gpua033:1942190:1942190 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua033:1942190:1942190 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua033:1942190:1942311 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua033:1942190:1942311 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua033:1942190:1942311 [3] NCCL INFO Using network AWS Libfabric +gpua033:1942190:1942311 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua033:1942190:1942311 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua033:1942190:1942311 [3] NCCL INFO Trees [0] -1/-1/-1->19->18 [1] -1/-1/-1->19->18 +gpua033:1942190:1942311 [3] NCCL INFO Channel 00/0 : 19[c7000] -> 20[7000] [send] via NET/AWS Libfabric/1 +gpua033:1942190:1942311 [3] NCCL INFO Channel 01/0 : 19[c7000] -> 20[7000] [send] via NET/AWS Libfabric/1 +gpua007:1877470:1877609 [0] NCCL INFO Channel 00/0 : 4[7000] -> 5[46000] via P2P/IPC/read +gpua007:1877470:1877609 [0] NCCL INFO Channel 01/0 : 4[7000] -> 5[46000] via P2P/IPC/read +gpua007:1877470:1877609 [0] NCCL INFO Connected all rings +gpua007:1877470:1877609 [0] NCCL INFO Channel 01/0 : 0[7000] -> 4[7000] [receive] via NET/AWS Libfabric/1 +gpua007:1877470:1877609 [0] NCCL INFO Channel 00/0 : 4[7000] -> 9[46000] [send] via NET/AWS Libfabric/1 +gpua007:1877470:1877609 [0] NCCL INFO Channel 01/0 : 4[7000] -> 12[7000] [send] via NET/AWS Libfabric/1 +gpua007:1877470:1877609 [0] NCCL INFO Channel 01/0 : 12[7000] -> 4[7000] [receive] via NET/AWS Libfabric/1 +gpua007:1877470:1877609 [0] NCCL INFO Channel 00/0 : 9[46000] -> 4[7000] [receive] via NET/AWS Libfabric/1 +gpua007:1877470:1877609 [0] NCCL INFO Channel 01/0 : 4[7000] -> 0[7000] [send] via NET/AWS Libfabric/1 +gpua007:1877470:1877609 [0] NCCL INFO Connected all trees +gpua007:1877470:1877609 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua012:3029509:3029603 [0] NCCL INFO Channel 00/0 : 8[7000] -> 9[46000] via P2P/IPC/read +gpua012:3029509:3029603 [0] NCCL INFO Channel 01/0 : 8[7000] -> 9[46000] via P2P/IPC/read +gpua012:3029509:3029603 [0] NCCL INFO Connected all rings +gpua012:3029509:3029603 [0] NCCL INFO Channel 01/0 : 5[46000] -> 8[7000] [receive] via NET/AWS Libfabric/1 +gpua012:3029509:3029603 [0] NCCL INFO Channel 00/0 : 8[7000] -> 12[7000] [send] via NET/AWS Libfabric/1 +gpua012:3029509:3029603 [0] NCCL INFO Channel 00/0 : 8[7000] -> 17[46000] [send] via NET/AWS Libfabric/1 +gpua012:3029509:3029603 [0] NCCL INFO Channel 00/0 : 17[46000] -> 8[7000] [receive] via NET/AWS Libfabric/1 +gpua012:3029509:3029603 [0] NCCL INFO Channel 00/0 : 12[7000] -> 8[7000] [receive] via NET/AWS Libfabric/1 +gpua012:3029509:3029603 [0] NCCL INFO Channel 01/0 : 8[7000] -> 5[46000] [send] via NET/AWS Libfabric/1 +gpua012:3029509:3029603 [0] NCCL INFO Connected all trees +gpua012:3029509:3029603 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua055:421543:436564 [1] NCCL INFO Channel 00/0 : 36[7000] -> 41[46000] [receive] via NET/AWS Libfabric/1 +gpua055:421543:436564 [1] NCCL INFO Channel 00/0 : 41[46000] -> 36[7000] [send] via NET/AWS Libfabric/1 +gpua055:421543:436564 [1] NCCL INFO Channel 00/0 : 41[46000] -> 40[7000] via P2P/IPC/read +gpua055:421543:436564 [1] NCCL INFO Channel 01/0 : 41[46000] -> 40[7000] via P2P/IPC/read +gpua055:421543:436564 [1] NCCL INFO Connected all trees +gpua055:421543:436564 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua055:421543:436564 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua055:421543:436564 [1] NCCL INFO comm 0x559505c28f50 rank 41 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua039:3958108:3958197 [1] NCCL INFO Connected all rings +gpua039:3958108:3958197 [1] NCCL INFO Channel 00/0 : 20[7000] -> 25[46000] [receive] via NET/AWS Libfabric/1 +gpua039:3958108:3958197 [1] NCCL INFO Channel 00/0 : 25[46000] -> 20[7000] [send] via NET/AWS Libfabric/1 +gpua039:3958108:3958197 [1] NCCL INFO Channel 00/0 : 25[46000] -> 24[7000] via P2P/IPC/read +gpua039:3958108:3958197 [1] NCCL INFO Channel 01/0 : 25[46000] -> 24[7000] via P2P/IPC/read +gpua039:3958108:3958197 [1] NCCL INFO Connected all trees +gpua039:3958108:3958197 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua039:3958108:3958197 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua039:3958108:3958197 [1] NCCL INFO comm 0x558092361c20 rank 25 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua049:151697:151824 [0] NCCL INFO Channel 00/0 : 32[7000] -> 33[46000] via P2P/IPC/read +gpua049:151697:151824 [0] NCCL INFO Channel 01/0 : 32[7000] -> 33[46000] via P2P/IPC/read +gpua049:151697:151824 [0] NCCL INFO Connected all rings +gpua049:151697:151824 [0] NCCL INFO Channel 01/0 : 32[7000] -> 36[7000] [send] via NET/AWS Libfabric/1 +gpua049:151697:151824 [0] NCCL INFO Channel 00/0 : 32[7000] -> 48[7000] [send] via NET/AWS Libfabric/1 +gpua049:151697:151824 [0] NCCL INFO Channel 00/0 : 0[7000] -> 32[7000] [receive] via NET/AWS Libfabric/1 +gpua049:151697:151824 [0] NCCL INFO Channel 00/0 : 32[7000] -> 0[7000] [send] via NET/AWS Libfabric/1 +gpua049:151697:151824 [0] NCCL INFO Channel 00/0 : 48[7000] -> 32[7000] [receive] via NET/AWS Libfabric/1 +gpua049:151697:151824 [0] NCCL INFO Channel 01/0 : 36[7000] -> 32[7000] [receive] via NET/AWS Libfabric/1 +gpua049:151697:151824 [0] NCCL INFO Connected all trees +gpua049:151697:151824 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua079:4011109:4011109 [3] NCCL INFO cudaDriverVersion 12020 +gpua079:4011109:4011109 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.79<0> +gpua079:4011109:4011109 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua079:4011109:4011109 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua079:4011109:4011244 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua079:4011109:4011244 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua079:4011109:4011244 [3] NCCL INFO Using network AWS Libfabric +gpua079:4011109:4011244 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua079:4011109:4011244 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua079:4011109:4011244 [3] NCCL INFO Trees [0] -1/-1/-1->51->50 [1] -1/-1/-1->51->50 +gpua079:4011109:4011244 [3] NCCL INFO Channel 00/0 : 51[c7000] -> 52[7000] [send] via NET/AWS Libfabric/1 +gpua079:4011109:4011244 [3] NCCL INFO Channel 01/0 : 51[c7000] -> 52[7000] [send] via NET/AWS Libfabric/1 +gpua038:474863:474947 [2] NCCL INFO Channel 00/0 : 22[85000] -> 21[46000] via P2P/IPC/read +gpua038:474863:474947 [2] NCCL INFO Channel 01/0 : 22[85000] -> 21[46000] via P2P/IPC/read +gpua038:474863:474947 [2] NCCL INFO Connected all trees +gpua038:474863:474947 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua038:474863:474947 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua038:474863:474947 [2] NCCL INFO comm 0x55e0352774d0 rank 22 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua016:879607:879688 [3] NCCL INFO Connected all trees +gpua016:879607:879688 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua016:879607:879688 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua016:879607:879688 [3] NCCL INFO comm 0x563a0300a7e0 rank 15 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua089:1166995:1167306 [0] NCCL INFO Channel 01/0 : 59[c7000] -> 60[7000] [receive] via NET/AWS Libfabric/1 +gpua089:1166995:1167306 [0] NCCL INFO Channel 00/0 : 60[7000] -> 61[46000] via P2P/IPC/read +gpua089:1166995:1167306 [0] NCCL INFO Channel 01/0 : 60[7000] -> 61[46000] via P2P/IPC/read +gpua089:1166995:1167306 [0] NCCL INFO Connected all rings +gpua089:1166995:1167306 [0] NCCL INFO Channel 00/0 : 56[7000] -> 60[7000] [receive] via NET/AWS Libfabric/1 +gpua089:1166995:1167306 [0] NCCL INFO Channel 01/0 : 28[7000] -> 60[7000] [receive] via NET/AWS Libfabric/1 +gpua089:1166995:1167306 [0] NCCL INFO Channel 01/0 : 60[7000] -> 28[7000] [send] via NET/AWS Libfabric/1 +gpua089:1166995:1167306 [0] NCCL INFO Channel 00/0 : 60[7000] -> 56[7000] [send] via NET/AWS Libfabric/1 +gpua089:1166995:1167306 [0] NCCL INFO Connected all trees +gpua089:1166995:1167306 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua089:1166995:1167306 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua006:976278:976364 [3] NCCL INFO Connected all rings +gpua006:976278:976364 [3] NCCL INFO Channel 00/0 : 3[c7000] -> 2[85000] via P2P/IPC/read +gpua006:976278:976364 [3] NCCL INFO Channel 01/0 : 3[c7000] -> 2[85000] via P2P/IPC/read +gpua006:976278:976364 [3] NCCL INFO Connected all trees +gpua006:976278:976364 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua006:976278:976364 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua006:976278:976364 [3] NCCL INFO comm 0x56023d21c6e0 rank 3 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua085:120735:120735 [3] NCCL INFO cudaDriverVersion 12020 +gpua085:120735:120735 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.85<0> +gpua085:120735:120735 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua085:120735:120735 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua085:120735:120824 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua085:120735:120824 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua085:120735:120824 [3] NCCL INFO Using network AWS Libfabric +gpua085:120735:120824 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua085:120735:120824 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua085:120735:120824 [3] NCCL INFO Trees [0] -1/-1/-1->59->58 [1] -1/-1/-1->59->58 +gpua085:120735:120824 [3] NCCL INFO Channel 00/0 : 59[c7000] -> 60[7000] [send] via NET/AWS Libfabric/1 +gpua085:120735:120824 [3] NCCL INFO Channel 01/0 : 59[c7000] -> 60[7000] [send] via NET/AWS Libfabric/1 +gpua057:4182113:4182222 [1] NCCL INFO Connected all rings +gpua057:4182113:4182222 [1] NCCL INFO Channel 01/0 : 45[46000] -> 52[7000] [send] via NET/AWS Libfabric/1 +gpua057:4182113:4182222 [1] NCCL INFO Channel 01/0 : 52[7000] -> 45[46000] [receive] via NET/AWS Libfabric/1 +gpua057:4182113:4182222 [1] NCCL INFO Channel 00/0 : 45[46000] -> 44[7000] via P2P/IPC/read +gpua057:4182113:4182222 [1] NCCL INFO Channel 01/0 : 45[46000] -> 44[7000] via P2P/IPC/read +gpua057:4182113:4182222 [1] NCCL INFO Connected all trees +gpua057:4182113:4182222 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua057:4182113:4182222 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua057:4182113:4182222 [1] NCCL INFO comm 0x557d12aaefd0 rank 45 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua057:4182115:4182115 [3] NCCL INFO cudaDriverVersion 12020 +gpua057:4182115:4182115 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.57<0> +gpua040:4155647:4155740 [1] NCCL INFO Connected all rings +gpua040:4155647:4155740 [1] NCCL INFO Channel 01/0 : 29[46000] -> 44[7000] [send] via NET/AWS Libfabric/1 +gpua040:4155647:4155740 [1] NCCL INFO Channel 01/0 : 44[7000] -> 29[46000] [receive] via NET/AWS Libfabric/1 +gpua040:4155647:4155740 [1] NCCL INFO Channel 00/0 : 29[46000] -> 28[7000] via P2P/IPC/read +gpua040:4155647:4155740 [1] NCCL INFO Channel 01/0 : 29[46000] -> 28[7000] via P2P/IPC/read +gpua040:4155647:4155740 [1] NCCL INFO Connected all trees +gpua040:4155647:4155740 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua040:4155647:4155740 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua040:4155647:4155740 [1] NCCL INFO comm 0x558ea55b2320 rank 29 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua080:3566291:3566381 [2] NCCL INFO Connected all rings +gpua080:3566291:3566381 [2] NCCL INFO Channel 00/0 : 54[85000] -> 53[46000] via P2P/IPC/read +gpua080:3566291:3566381 [2] NCCL INFO Channel 01/0 : 54[85000] -> 53[46000] via P2P/IPC/read +gpua080:3566291:3566381 [2] NCCL INFO Connected all trees +gpua080:3566291:3566381 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua080:3566291:3566381 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua080:3566291:3566381 [2] NCCL INFO comm 0x55c0439085b0 rank 54 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua054:719587:719716 [0] NCCL INFO Channel 00/0 : 36[7000] -> 37[46000] via P2P/IPC/read +gpua054:719587:719716 [0] NCCL INFO Channel 01/0 : 36[7000] -> 37[46000] via P2P/IPC/read +gpua054:719587:719716 [0] NCCL INFO Connected all rings +gpua054:719587:719716 [0] NCCL INFO Channel 01/0 : 32[7000] -> 36[7000] [receive] via NET/AWS Libfabric/1 +gpua054:719587:719716 [0] NCCL INFO Channel 00/0 : 36[7000] -> 41[46000] [send] via NET/AWS Libfabric/1 +gpua054:719587:719716 [0] NCCL INFO Channel 01/0 : 36[7000] -> 44[7000] [send] via NET/AWS Libfabric/1 +gpua054:719587:719716 [0] NCCL INFO Channel 01/0 : 44[7000] -> 36[7000] [receive] via NET/AWS Libfabric/1 +gpua054:719587:719716 [0] NCCL INFO Channel 00/0 : 41[46000] -> 36[7000] [receive] via NET/AWS Libfabric/1 +gpua054:719587:719716 [0] NCCL INFO Channel 01/0 : 36[7000] -> 32[7000] [send] via NET/AWS Libfabric/1 +gpua054:719587:719716 [0] NCCL INFO Connected all trees +gpua054:719587:719716 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua033:1942190:1942311 [3] NCCL INFO Connected all rings +gpua033:1942190:1942311 [3] NCCL INFO Channel 00/0 : 19[c7000] -> 18[85000] via P2P/IPC/read +gpua033:1942190:1942311 [3] NCCL INFO Channel 01/0 : 19[c7000] -> 18[85000] via P2P/IPC/read +gpua033:1942190:1942311 [3] NCCL INFO Connected all trees +gpua033:1942190:1942311 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua033:1942190:1942311 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua033:1942190:1942311 [3] NCCL INFO comm 0x55fdb8a5ac60 rank 19 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua007:1877470:1877609 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua007:1877470:1877609 [0] NCCL INFO comm 0x560bd5f339a0 rank 4 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua012:3029509:3029603 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua012:3029509:3029603 [0] NCCL INFO comm 0x564492351e50 rank 8 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua055:421544:421544 [2] NCCL INFO cudaDriverVersion 12020 +gpua055:421544:421544 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.55<0> +gpua055:421544:421544 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua055:421544:421544 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua055:421544:436567 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua055:421544:436567 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua055:421544:436567 [2] NCCL INFO Using network AWS Libfabric +gpua055:421544:436567 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua055:421544:436567 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua055:421544:436567 [2] NCCL INFO Trees [0] 43/-1/-1->42->41 [1] 43/-1/-1->42->41 +gpua055:421544:436567 [2] NCCL INFO Channel 00/0 : 42[85000] -> 43[c7000] via P2P/IPC/read +gpua055:421544:436567 [2] NCCL INFO Channel 01/0 : 42[85000] -> 43[c7000] via P2P/IPC/read +gpua055:421544:436567 [2] NCCL INFO Connected all rings +gpua039:3958109:3958109 [2] NCCL INFO cudaDriverVersion 12020 +gpua039:3958109:3958109 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.39<0> +gpua039:3958109:3958109 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua039:3958109:3958109 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua039:3958109:3958198 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua039:3958109:3958198 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua039:3958109:3958198 [2] NCCL INFO Using network AWS Libfabric +gpua039:3958109:3958198 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua039:3958109:3958198 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua039:3958109:3958198 [2] NCCL INFO Trees [0] 27/-1/-1->26->25 [1] 27/-1/-1->26->25 +gpua039:3958109:3958198 [2] NCCL INFO Channel 00/0 : 26[85000] -> 27[c7000] via P2P/IPC/read +gpua039:3958109:3958198 [2] NCCL INFO Channel 01/0 : 26[85000] -> 27[c7000] via P2P/IPC/read +gpua049:151697:151824 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua049:151697:151824 [0] NCCL INFO comm 0x55feb3e4d740 rank 32 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua079:4011109:4011244 [3] NCCL INFO Connected all rings +gpua079:4011109:4011244 [3] NCCL INFO Channel 00/0 : 51[c7000] -> 50[85000] via P2P/IPC/read +gpua079:4011109:4011244 [3] NCCL INFO Channel 01/0 : 51[c7000] -> 50[85000] via P2P/IPC/read +gpua079:4011109:4011244 [3] NCCL INFO Connected all trees +gpua079:4011109:4011244 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua079:4011109:4011244 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua079:4011109:4011244 [3] NCCL INFO comm 0x5606001058e0 rank 51 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua038:474861:474861 [0] NCCL INFO cudaDriverVersion 12020 +gpua038:474861:474861 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.38<0> +gpua038:474861:474861 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua038:474861:474861 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua038:474861:474946 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua038:474861:474946 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua038:474861:474946 [0] NCCL INFO Using network AWS Libfabric +gpua038:474861:474946 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua038:474861:474946 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua038:474861:474946 [0] NCCL INFO Trees [0] 21/-1/-1->20->25 [1] 21/16/-1->20->13 +gpua038:474861:474946 [0] NCCL INFO Channel 00/0 : 19[c7000] -> 20[7000] [receive] via NET/AWS Libfabric/1 +gpua038:474861:474946 [0] NCCL INFO Channel 01/0 : 19[c7000] -> 20[7000] [receive] via NET/AWS Libfabric/1 +gpua016:879604:879604 [0] NCCL INFO cudaDriverVersion 12020 +gpua016:879604:879604 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.16<0> +gpua016:879604:879604 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua016:879604:879604 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua016:879604:879690 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua016:879604:879690 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua016:879604:879690 [0] NCCL INFO Using network AWS Libfabric +gpua016:879604:879690 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua016:879604:879690 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua016:879604:879690 [0] NCCL INFO Trees [0] 13/-1/-1->12->8 [1] 13/4/-1->12->28 +gpua016:879604:879690 [0] NCCL INFO Channel 00/0 : 11[c7000] -> 12[7000] [receive] via NET/AWS Libfabric/1 +gpua016:879604:879690 [0] NCCL INFO Channel 01/0 : 11[c7000] -> 12[7000] [receive] via NET/AWS Libfabric/1 +gpua089:1166995:1167306 [0] NCCL INFO comm 0x560b92eace10 rank 60 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua006:976276:976276 [1] NCCL INFO cudaDriverVersion 12020 +gpua006:976276:976276 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.6<0> +gpua006:976276:976276 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua006:976276:976276 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua006:976276:976367 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua006:976276:976367 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua006:976276:976367 [1] NCCL INFO Using network AWS Libfabric +gpua006:976276:976367 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua006:976276:976367 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua006:976276:976367 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 +gpua006:976276:976367 [1] NCCL INFO Channel 00/0 : 1[46000] -> 2[85000] via P2P/IPC/read +gpua006:976276:976367 [1] NCCL INFO Channel 01/0 : 1[46000] -> 2[85000] via P2P/IPC/read +gpua006:976276:976367 [1] NCCL INFO Connected all rings +gpua085:120735:120824 [3] NCCL INFO Connected all rings +gpua085:120735:120824 [3] NCCL INFO Channel 00/0 : 59[c7000] -> 58[85000] via P2P/IPC/read +gpua085:120735:120824 [3] NCCL INFO Channel 01/0 : 59[c7000] -> 58[85000] via P2P/IPC/read +gpua085:120735:120824 [3] NCCL INFO Connected all trees +gpua085:120735:120824 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua085:120735:120824 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua085:120735:120824 [3] NCCL INFO comm 0x56070f183a80 rank 59 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua057:4182115:4182115 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua057:4182115:4182115 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua057:4182115:4182221 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua057:4182115:4182221 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua057:4182115:4182221 [3] NCCL INFO Using network AWS Libfabric +gpua057:4182115:4182221 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua057:4182115:4182221 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua057:4182115:4182221 [3] NCCL INFO Trees [0] -1/-1/-1->47->46 [1] -1/-1/-1->47->46 +gpua057:4182115:4182221 [3] NCCL INFO Channel 00/0 : 47[c7000] -> 48[7000] [send] via NET/AWS Libfabric/1 +gpua057:4182115:4182221 [3] NCCL INFO Channel 01/0 : 47[c7000] -> 48[7000] [send] via NET/AWS Libfabric/1 +gpua057:4182115:4182221 [3] NCCL INFO Connected all rings +gpua057:4182115:4182221 [3] NCCL INFO Channel 00/0 : 47[c7000] -> 46[85000] via P2P/IPC/read +gpua040:4155649:4155649 [3] NCCL INFO cudaDriverVersion 12020 +gpua040:4155649:4155649 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.40<0> +gpua040:4155649:4155649 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua040:4155649:4155649 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua040:4155649:4155738 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua040:4155649:4155738 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua040:4155649:4155738 [3] NCCL INFO Using network AWS Libfabric +gpua040:4155649:4155738 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua040:4155649:4155738 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua040:4155649:4155738 [3] NCCL INFO Trees [0] -1/-1/-1->31->30 [1] -1/-1/-1->31->30 +gpua040:4155649:4155738 [3] NCCL INFO Channel 00/0 : 31[c7000] -> 32[7000] [send] via NET/AWS Libfabric/1 +gpua040:4155649:4155738 [3] NCCL INFO Channel 01/0 : 31[c7000] -> 32[7000] [send] via NET/AWS Libfabric/1 +gpua080:3566289:3566289 [0] NCCL INFO cudaDriverVersion 12020 +gpua080:3566289:3566289 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.80<0> +gpua080:3566289:3566289 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua080:3566289:3566289 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua080:3566289:3566380 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua080:3566289:3566380 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua080:3566289:3566380 [0] NCCL INFO Using network AWS Libfabric +gpua080:3566289:3566380 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua080:3566289:3566380 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua080:3566289:3566380 [0] NCCL INFO Trees [0] 53/-1/-1->52->57 [1] 53/48/-1->52->45 +gpua080:3566289:3566380 [0] NCCL INFO Channel 00/0 : 51[c7000] -> 52[7000] [receive] via NET/AWS Libfabric/1 +gpua054:719587:719716 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua054:719587:719716 [0] NCCL INFO comm 0x5559df580ee0 rank 36 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua033:1942189:1942189 [2] NCCL INFO cudaDriverVersion 12020 +gpua033:1942189:1942189 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.33<0> +gpua033:1942189:1942189 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua033:1942189:1942189 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua033:1942189:1942313 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua033:1942189:1942313 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua033:1942189:1942313 [2] NCCL INFO Using network AWS Libfabric +gpua033:1942189:1942313 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua033:1942189:1942313 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua033:1942189:1942313 [2] NCCL INFO Trees [0] 19/-1/-1->18->17 [1] 19/-1/-1->18->17 +gpua033:1942189:1942313 [2] NCCL INFO Channel 00/0 : 18[85000] -> 19[c7000] via P2P/IPC/read +gpua033:1942189:1942313 [2] NCCL INFO Channel 01/0 : 18[85000] -> 19[c7000] via P2P/IPC/read +gpua007:1877472:1877472 [2] NCCL INFO cudaDriverVersion 12020 +gpua007:1877472:1877472 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.7<0> +gpua007:1877472:1877472 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua007:1877472:1877472 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua007:1877472:1877611 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua007:1877472:1877611 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua007:1877472:1877611 [2] NCCL INFO Using network AWS Libfabric +gpua007:1877472:1877611 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua007:1877472:1877611 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua007:1877472:1877611 [2] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 +gpua007:1877472:1877611 [2] NCCL INFO Channel 00/0 : 6[85000] -> 7[c7000] via P2P/IPC/read +gpua007:1877472:1877611 [2] NCCL INFO Channel 01/0 : 6[85000] -> 7[c7000] via P2P/IPC/read +gpua012:3029512:3029512 [3] NCCL INFO cudaDriverVersion 12020 +gpua012:3029512:3029512 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.12<0> +gpua012:3029512:3029512 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua012:3029512:3029512 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua012:3029512:3029601 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua012:3029512:3029601 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua012:3029512:3029601 [3] NCCL INFO Using network AWS Libfabric +gpua012:3029512:3029601 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua012:3029512:3029601 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua012:3029512:3029601 [3] NCCL INFO Trees [0] -1/-1/-1->11->10 [1] -1/-1/-1->11->10 +gpua012:3029512:3029601 [3] NCCL INFO Channel 00/0 : 11[c7000] -> 12[7000] [send] via NET/AWS Libfabric/1 +gpua012:3029512:3029601 [3] NCCL INFO Channel 01/0 : 11[c7000] -> 12[7000] [send] via NET/AWS Libfabric/1 +gpua055:421544:436567 [2] NCCL INFO Channel 00/0 : 42[85000] -> 41[46000] via P2P/IPC/read +gpua055:421544:436567 [2] NCCL INFO Channel 01/0 : 42[85000] -> 41[46000] via P2P/IPC/read +gpua055:421544:436567 [2] NCCL INFO Connected all trees +gpua055:421544:436567 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua055:421544:436567 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua055:421544:436567 [2] NCCL INFO comm 0x5595a5720150 rank 42 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua039:3958109:3958198 [2] NCCL INFO Connected all rings +gpua039:3958109:3958198 [2] NCCL INFO Channel 00/0 : 26[85000] -> 25[46000] via P2P/IPC/read +gpua039:3958109:3958198 [2] NCCL INFO Channel 01/0 : 26[85000] -> 25[46000] via P2P/IPC/read +gpua039:3958109:3958198 [2] NCCL INFO Connected all trees +gpua039:3958109:3958198 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua039:3958109:3958198 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua039:3958109:3958198 [2] NCCL INFO comm 0x5578d383c850 rank 26 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua039:3958107:3958107 [0] NCCL INFO cudaDriverVersion 12020 +gpua039:3958107:3958107 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.39<0> +gpua039:3958107:3958107 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua039:3958107:3958107 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua039:3958107:3958196 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua049:151699:151699 [2] NCCL INFO cudaDriverVersion 12020 +gpua049:151699:151699 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.49<0> +gpua049:151699:151699 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua049:151699:151699 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua049:151699:151827 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua049:151699:151827 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua049:151699:151827 [2] NCCL INFO Using network AWS Libfabric +gpua049:151699:151827 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua049:151699:151827 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua049:151699:151827 [2] NCCL INFO Trees [0] 35/-1/-1->34->33 [1] 35/-1/-1->34->33 +gpua049:151699:151827 [2] NCCL INFO Channel 00/0 : 34[85000] -> 35[c7000] via P2P/IPC/read +gpua049:151699:151827 [2] NCCL INFO Channel 01/0 : 34[85000] -> 35[c7000] via P2P/IPC/read +gpua049:151699:151827 [2] NCCL INFO Connected all rings +gpua079:4011107:4011107 [1] NCCL INFO cudaDriverVersion 12020 +gpua079:4011107:4011107 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.79<0> +gpua079:4011107:4011107 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua079:4011107:4011107 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua079:4011107:4011243 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua079:4011107:4011243 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua079:4011107:4011243 [1] NCCL INFO Using network AWS Libfabric +gpua079:4011107:4011243 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua079:4011107:4011243 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua079:4011107:4011243 [1] NCCL INFO Trees [0] 50/40/-1->49->48 [1] 50/-1/-1->49->48 +gpua079:4011107:4011243 [1] NCCL INFO Channel 00/0 : 49[46000] -> 50[85000] via P2P/IPC/read +gpua079:4011107:4011243 [1] NCCL INFO Channel 01/0 : 49[46000] -> 50[85000] via P2P/IPC/read +gpua038:474861:474946 [0] NCCL INFO Channel 00/0 : 20[7000] -> 21[46000] via P2P/IPC/read +gpua038:474861:474946 [0] NCCL INFO Channel 01/0 : 20[7000] -> 21[46000] via P2P/IPC/read +gpua038:474861:474946 [0] NCCL INFO Connected all rings +gpua038:474861:474946 [0] NCCL INFO Channel 01/0 : 16[7000] -> 20[7000] [receive] via NET/AWS Libfabric/1 +gpua038:474861:474946 [0] NCCL INFO Channel 00/0 : 20[7000] -> 25[46000] [send] via NET/AWS Libfabric/1 +gpua038:474861:474946 [0] NCCL INFO Channel 01/0 : 13[46000] -> 20[7000] [receive] via NET/AWS Libfabric/1 +gpua038:474861:474946 [0] NCCL INFO Channel 01/0 : 20[7000] -> 13[46000] [send] via NET/AWS Libfabric/1 +gpua038:474861:474946 [0] NCCL INFO Channel 00/0 : 25[46000] -> 20[7000] [receive] via NET/AWS Libfabric/1 +gpua038:474861:474946 [0] NCCL INFO Channel 01/0 : 20[7000] -> 16[7000] [send] via NET/AWS Libfabric/1 +gpua038:474861:474946 [0] NCCL INFO Connected all trees +gpua038:474861:474946 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua016:879604:879690 [0] NCCL INFO Channel 00/0 : 12[7000] -> 13[46000] via P2P/IPC/read +gpua016:879604:879690 [0] NCCL INFO Channel 01/0 : 12[7000] -> 13[46000] via P2P/IPC/read +gpua016:879604:879690 [0] NCCL INFO Connected all rings +gpua016:879604:879690 [0] NCCL INFO Channel 00/0 : 8[7000] -> 12[7000] [receive] via NET/AWS Libfabric/1 +gpua016:879604:879690 [0] NCCL INFO Channel 01/0 : 4[7000] -> 12[7000] [receive] via NET/AWS Libfabric/1 +gpua016:879604:879690 [0] NCCL INFO Channel 01/0 : 12[7000] -> 28[7000] [send] via NET/AWS Libfabric/1 +gpua016:879604:879690 [0] NCCL INFO Channel 01/0 : 28[7000] -> 12[7000] [receive] via NET/AWS Libfabric/1 +gpua016:879604:879690 [0] NCCL INFO Channel 01/0 : 12[7000] -> 4[7000] [send] via NET/AWS Libfabric/1 +gpua016:879604:879690 [0] NCCL INFO Channel 00/0 : 12[7000] -> 8[7000] [send] via NET/AWS Libfabric/1 +gpua016:879604:879690 [0] NCCL INFO Connected all trees +gpua016:879604:879690 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua089:1166996:1166996 [1] NCCL INFO cudaDriverVersion 12020 +gpua089:1166996:1166996 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.89<0> +gpua089:1166996:1166996 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua089:1166996:1166996 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua089:1166996:1167307 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua089:1166996:1167307 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua089:1166996:1167307 [1] NCCL INFO Using network AWS Libfabric +gpua089:1166996:1167307 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua089:1166996:1167307 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua089:1166996:1167307 [1] NCCL INFO Trees [0] 62/-1/-1->61->60 [1] 62/-1/-1->61->60 +gpua089:1166996:1167307 [1] NCCL INFO Channel 00/0 : 61[46000] -> 62[85000] via P2P/IPC/read +gpua089:1166996:1167307 [1] NCCL INFO Channel 01/0 : 61[46000] -> 62[85000] via P2P/IPC/read +gpua006:976276:976367 [1] NCCL INFO Channel 00/0 : 1[46000] -> 0[7000] via P2P/IPC/read +gpua006:976276:976367 [1] NCCL INFO Channel 01/0 : 1[46000] -> 0[7000] via P2P/IPC/read +gpua006:976276:976367 [1] NCCL INFO Connected all trees +gpua006:976276:976367 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua006:976276:976367 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua006:976276:976367 [1] NCCL INFO comm 0x55a07acd23c0 rank 1 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua085:120733:120733 [1] NCCL INFO cudaDriverVersion 12020 +gpua085:120733:120733 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.85<0> +gpua085:120733:120733 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua085:120733:120733 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua085:120733:120821 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua085:120733:120821 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua085:120733:120821 [1] NCCL INFO Using network AWS Libfabric +gpua085:120733:120821 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua085:120733:120821 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua085:120733:120821 [1] NCCL INFO Trees [0] 58/52/-1->57->56 [1] 58/-1/-1->57->56 +gpua085:120733:120821 [1] NCCL INFO Channel 00/0 : 57[46000] -> 58[85000] via P2P/IPC/read +gpua085:120733:120821 [1] NCCL INFO Channel 01/0 : 57[46000] -> 58[85000] via P2P/IPC/read +gpua085:120733:120821 [1] NCCL INFO Connected all rings +gpua057:4182115:4182221 [3] NCCL INFO Channel 01/0 : 47[c7000] -> 46[85000] via P2P/IPC/read +gpua057:4182115:4182221 [3] NCCL INFO Connected all trees +gpua057:4182115:4182221 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua057:4182115:4182221 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua057:4182115:4182221 [3] NCCL INFO comm 0x559231f91830 rank 47 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua040:4155649:4155738 [3] NCCL INFO Connected all rings +gpua040:4155649:4155738 [3] NCCL INFO Channel 00/0 : 31[c7000] -> 30[85000] via P2P/IPC/read +gpua040:4155649:4155738 [3] NCCL INFO Channel 01/0 : 31[c7000] -> 30[85000] via P2P/IPC/read +gpua040:4155649:4155738 [3] NCCL INFO Connected all trees +gpua040:4155649:4155738 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua040:4155649:4155738 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua040:4155649:4155738 [3] NCCL INFO comm 0x5589e3260e30 rank 31 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua040:4155646:4155646 [0] NCCL INFO cudaDriverVersion 12020 +gpua040:4155646:4155646 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.40<0> +gpua040:4155646:4155646 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua040:4155646:4155646 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua040:4155646:4155741 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua080:3566289:3566380 [0] NCCL INFO Channel 01/0 : 51[c7000] -> 52[7000] [receive] via NET/AWS Libfabric/1 +gpua080:3566289:3566380 [0] NCCL INFO Channel 00/0 : 52[7000] -> 53[46000] via P2P/IPC/read +gpua080:3566289:3566380 [0] NCCL INFO Channel 01/0 : 52[7000] -> 53[46000] via P2P/IPC/read +gpua080:3566289:3566380 [0] NCCL INFO Connected all rings +gpua080:3566289:3566380 [0] NCCL INFO Channel 01/0 : 48[7000] -> 52[7000] [receive] via NET/AWS Libfabric/1 +gpua080:3566289:3566380 [0] NCCL INFO Channel 00/0 : 52[7000] -> 57[46000] [send] via NET/AWS Libfabric/1 +gpua080:3566289:3566380 [0] NCCL INFO Channel 01/0 : 45[46000] -> 52[7000] [receive] via NET/AWS Libfabric/1 +gpua080:3566289:3566380 [0] NCCL INFO Channel 01/0 : 52[7000] -> 45[46000] [send] via NET/AWS Libfabric/1 +gpua080:3566289:3566380 [0] NCCL INFO Channel 00/0 : 57[46000] -> 52[7000] [receive] via NET/AWS Libfabric/1 +gpua080:3566289:3566380 [0] NCCL INFO Channel 01/0 : 52[7000] -> 48[7000] [send] via NET/AWS Libfabric/1 +gpua054:719590:719590 [3] NCCL INFO cudaDriverVersion 12020 +gpua054:719590:719590 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.54<0> +gpua054:719590:719590 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua054:719590:719590 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua054:719590:719715 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua054:719590:719715 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua054:719590:719715 [3] NCCL INFO Using network AWS Libfabric +gpua054:719590:719715 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua054:719590:719715 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua054:719590:719715 [3] NCCL INFO Trees [0] -1/-1/-1->39->38 [1] -1/-1/-1->39->38 +gpua054:719590:719715 [3] NCCL INFO Channel 00/0 : 39[c7000] -> 40[7000] [send] via NET/AWS Libfabric/1 +gpua054:719590:719715 [3] NCCL INFO Channel 01/0 : 39[c7000] -> 40[7000] [send] via NET/AWS Libfabric/1 +gpua033:1942189:1942313 [2] NCCL INFO Connected all rings +gpua033:1942189:1942313 [2] NCCL INFO Channel 00/0 : 18[85000] -> 17[46000] via P2P/IPC/read +gpua033:1942189:1942313 [2] NCCL INFO Channel 01/0 : 18[85000] -> 17[46000] via P2P/IPC/read +gpua033:1942189:1942313 [2] NCCL INFO Connected all trees +gpua033:1942189:1942313 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua033:1942189:1942313 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua033:1942189:1942313 [2] NCCL INFO comm 0x56003d4913e0 rank 18 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua007:1877472:1877611 [2] NCCL INFO Connected all rings +gpua007:1877472:1877611 [2] NCCL INFO Channel 00/0 : 6[85000] -> 5[46000] via P2P/IPC/read +gpua007:1877472:1877611 [2] NCCL INFO Channel 01/0 : 6[85000] -> 5[46000] via P2P/IPC/read +gpua007:1877472:1877611 [2] NCCL INFO Connected all trees +gpua007:1877472:1877611 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua007:1877472:1877611 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua007:1877472:1877611 [2] NCCL INFO comm 0x56137d36aaa0 rank 6 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua012:3029512:3029601 [3] NCCL INFO Connected all rings +gpua012:3029512:3029601 [3] NCCL INFO Channel 00/0 : 11[c7000] -> 10[85000] via P2P/IPC/read +gpua012:3029512:3029601 [3] NCCL INFO Channel 01/0 : 11[c7000] -> 10[85000] via P2P/IPC/read +gpua012:3029512:3029601 [3] NCCL INFO Connected all trees +gpua012:3029512:3029601 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua012:3029512:3029601 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua012:3029512:3029601 [3] NCCL INFO comm 0x55972f7bc700 rank 11 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua055:421542:421542 [0] NCCL INFO cudaDriverVersion 12020 +gpua055:421542:421542 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.55<0> +gpua055:421542:421542 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua055:421542:421542 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua055:421542:436566 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua055:421542:436566 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua055:421542:436566 [0] NCCL INFO Using network AWS Libfabric +gpua055:421542:436566 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua055:421542:436566 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua055:421542:436566 [0] NCCL INFO Trees [0] 41/44/-1->40->49 [1] 41/-1/-1->40->37 +gpua055:421542:436566 [0] NCCL INFO Channel 00/0 : 39[c7000] -> 40[7000] [receive] via NET/AWS Libfabric/1 +gpua055:421542:436566 [0] NCCL INFO Channel 01/0 : 39[c7000] -> 40[7000] [receive] via NET/AWS Libfabric/1 +gpua039:3958107:3958196 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua039:3958107:3958196 [0] NCCL INFO Using network AWS Libfabric +gpua039:3958107:3958196 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua039:3958107:3958196 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua039:3958107:3958196 [0] NCCL INFO Trees [0] 25/28/-1->24->16 [1] 25/-1/-1->24->21 +gpua039:3958107:3958196 [0] NCCL INFO Channel 00/0 : 23[c7000] -> 24[7000] [receive] via NET/AWS Libfabric/1 +gpua039:3958107:3958196 [0] NCCL INFO Channel 01/0 : 23[c7000] -> 24[7000] [receive] via NET/AWS Libfabric/1 +gpua039:3958107:3958196 [0] NCCL INFO Channel 00/0 : 24[7000] -> 25[46000] via P2P/IPC/read +gpua039:3958107:3958196 [0] NCCL INFO Channel 01/0 : 24[7000] -> 25[46000] via P2P/IPC/read +gpua039:3958107:3958196 [0] NCCL INFO Connected all rings +gpua039:3958107:3958196 [0] NCCL INFO Channel 01/0 : 21[46000] -> 24[7000] [receive] via NET/AWS Libfabric/1 +gpua049:151699:151827 [2] NCCL INFO Channel 00/0 : 34[85000] -> 33[46000] via P2P/IPC/read +gpua049:151699:151827 [2] NCCL INFO Channel 01/0 : 34[85000] -> 33[46000] via P2P/IPC/read +gpua049:151699:151827 [2] NCCL INFO Connected all trees +gpua049:151699:151827 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua049:151699:151827 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua049:151699:151827 [2] NCCL INFO comm 0x55e2acb47da0 rank 34 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua079:4011107:4011243 [1] NCCL INFO Connected all rings +gpua079:4011107:4011243 [1] NCCL INFO Channel 00/0 : 40[7000] -> 49[46000] [receive] via NET/AWS Libfabric/1 +gpua079:4011107:4011243 [1] NCCL INFO Channel 00/0 : 49[46000] -> 40[7000] [send] via NET/AWS Libfabric/1 +gpua079:4011107:4011243 [1] NCCL INFO Channel 00/0 : 49[46000] -> 48[7000] via P2P/IPC/read +gpua079:4011107:4011243 [1] NCCL INFO Channel 01/0 : 49[46000] -> 48[7000] via P2P/IPC/read +gpua079:4011107:4011243 [1] NCCL INFO Connected all trees +gpua079:4011107:4011243 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua079:4011107:4011243 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua079:4011107:4011243 [1] NCCL INFO comm 0x56239f56e7e0 rank 49 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua038:474861:474946 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua038:474861:474946 [0] NCCL INFO comm 0x5579ce541530 rank 20 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua016:879604:879690 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua016:879604:879690 [0] NCCL INFO comm 0x55ce3dfe6580 rank 12 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua089:1166996:1167307 [1] NCCL INFO Connected all rings +gpua089:1166996:1167307 [1] NCCL INFO Channel 00/0 : 61[46000] -> 60[7000] via P2P/IPC/read +gpua089:1166996:1167307 [1] NCCL INFO Channel 01/0 : 61[46000] -> 60[7000] via P2P/IPC/read +gpua089:1166996:1167307 [1] NCCL INFO Connected all trees +gpua089:1166996:1167307 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua089:1166996:1167307 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua089:1166996:1167307 [1] NCCL INFO comm 0x563defc70860 rank 61 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua006:976277:976277 [2] NCCL INFO cudaDriverVersion 12020 +gpua006:976277:976277 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.6<0> +gpua006:976277:976277 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua006:976277:976277 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua006:976277:976365 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua006:976277:976365 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua006:976277:976365 [2] NCCL INFO Using network AWS Libfabric +gpua006:976277:976365 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua006:976277:976365 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua006:976277:976365 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 +gpua006:976277:976365 [2] NCCL INFO Channel 00/0 : 2[85000] -> 3[c7000] via P2P/IPC/read +gpua006:976277:976365 [2] NCCL INFO Channel 01/0 : 2[85000] -> 3[c7000] via P2P/IPC/read +gpua006:976277:976365 [2] NCCL INFO Connected all rings +gpua085:120733:120821 [1] NCCL INFO Channel 00/0 : 52[7000] -> 57[46000] [receive] via NET/AWS Libfabric/1 +gpua085:120733:120821 [1] NCCL INFO Channel 00/0 : 57[46000] -> 52[7000] [send] via NET/AWS Libfabric/1 +gpua085:120733:120821 [1] NCCL INFO Channel 00/0 : 57[46000] -> 56[7000] via P2P/IPC/read +gpua085:120733:120821 [1] NCCL INFO Channel 01/0 : 57[46000] -> 56[7000] via P2P/IPC/read +gpua085:120733:120821 [1] NCCL INFO Connected all trees +gpua085:120733:120821 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua085:120733:120821 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua085:120733:120821 [1] NCCL INFO comm 0x55ddfd2e43a0 rank 57 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua057:4182112:4182112 [0] NCCL INFO cudaDriverVersion 12020 +gpua057:4182112:4182112 [0] NCCL INFO Bootstrap : Using eth1:172.28.23.57<0> +gpua057:4182112:4182112 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua057:4182112:4182112 [0] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua057:4182112:4182223 [0] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua057:4182112:4182223 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua057:4182112:4182223 [0] NCCL INFO Using network AWS Libfabric +gpua057:4182112:4182223 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua057:4182112:4182223 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua057:4182112:4182223 [0] NCCL INFO Trees [0] 45/-1/-1->44->40 [1] 45/36/-1->44->29 +gpua057:4182112:4182223 [0] NCCL INFO Channel 00/0 : 43[c7000] -> 44[7000] [receive] via NET/AWS Libfabric/1 +gpua040:4155646:4155741 [0] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua040:4155646:4155741 [0] NCCL INFO Using network AWS Libfabric +gpua040:4155646:4155741 [0] NCCL INFO Setting affinity for GPU 0 to ffff0000,00000000 +gpua040:4155646:4155741 [0] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua040:4155646:4155741 [0] NCCL INFO Trees [0] 29/-1/-1->28->24 [1] 29/12/-1->28->60 +gpua040:4155646:4155741 [0] NCCL INFO Channel 00/0 : 27[c7000] -> 28[7000] [receive] via NET/AWS Libfabric/1 +gpua040:4155646:4155741 [0] NCCL INFO Channel 01/0 : 27[c7000] -> 28[7000] [receive] via NET/AWS Libfabric/1 +gpua040:4155646:4155741 [0] NCCL INFO Channel 00/0 : 28[7000] -> 29[46000] via P2P/IPC/read +gpua040:4155646:4155741 [0] NCCL INFO Channel 01/0 : 28[7000] -> 29[46000] via P2P/IPC/read +gpua040:4155646:4155741 [0] NCCL INFO Connected all rings +gpua040:4155646:4155741 [0] NCCL INFO Channel 00/0 : 24[7000] -> 28[7000] [receive] via NET/AWS Libfabric/1 +gpua080:3566289:3566380 [0] NCCL INFO Connected all trees +gpua080:3566289:3566380 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua080:3566289:3566380 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua080:3566289:3566380 [0] NCCL INFO comm 0x561722ae1a70 rank 52 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua054:719590:719715 [3] NCCL INFO Connected all rings +gpua054:719590:719715 [3] NCCL INFO Channel 00/0 : 39[c7000] -> 38[85000] via P2P/IPC/read +gpua054:719590:719715 [3] NCCL INFO Channel 01/0 : 39[c7000] -> 38[85000] via P2P/IPC/read +gpua054:719590:719715 [3] NCCL INFO Connected all trees +gpua054:719590:719715 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua054:719590:719715 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua054:719590:719715 [3] NCCL INFO comm 0x558eeffda3c0 rank 39 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua054:719589:719589 [2] NCCL INFO cudaDriverVersion 12020 +gpua054:719589:719589 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.54<0> +gpua054:719589:719589 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua054:719589:719589 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua054:719589:719717 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua033:1942188:1942188 [1] NCCL INFO cudaDriverVersion 12020 +gpua033:1942188:1942188 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.33<0> +gpua033:1942188:1942188 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua033:1942188:1942188 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua033:1942188:1942310 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua033:1942188:1942310 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua033:1942188:1942310 [1] NCCL INFO Using network AWS Libfabric +gpua033:1942188:1942310 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua033:1942188:1942310 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua033:1942188:1942310 [1] NCCL INFO Trees [0] 18/8/-1->17->16 [1] 18/-1/-1->17->16 +gpua033:1942188:1942310 [1] NCCL INFO Channel 00/0 : 17[46000] -> 18[85000] via P2P/IPC/read +gpua033:1942188:1942310 [1] NCCL INFO Channel 01/0 : 17[46000] -> 18[85000] via P2P/IPC/read +gpua007:1877473:1877473 [3] NCCL INFO cudaDriverVersion 12020 +gpua007:1877473:1877473 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.7<0> +gpua007:1877473:1877473 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua007:1877473:1877473 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua007:1877473:1877612 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua007:1877473:1877612 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua007:1877473:1877612 [3] NCCL INFO Using network AWS Libfabric +gpua007:1877473:1877612 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua007:1877473:1877612 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua007:1877473:1877612 [3] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 +gpua007:1877473:1877612 [3] NCCL INFO Channel 00/0 : 7[c7000] -> 8[7000] [send] via NET/AWS Libfabric/1 +gpua007:1877473:1877612 [3] NCCL INFO Channel 01/0 : 7[c7000] -> 8[7000] [send] via NET/AWS Libfabric/1 +gpua012:3029510:3029510 [1] NCCL INFO cudaDriverVersion 12020 +gpua012:3029510:3029510 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.12<0> +gpua012:3029510:3029510 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua012:3029510:3029510 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua012:3029510:3029604 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua012:3029510:3029604 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua012:3029510:3029604 [1] NCCL INFO Using network AWS Libfabric +gpua012:3029510:3029604 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua012:3029510:3029604 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua012:3029510:3029604 [1] NCCL INFO Trees [0] 10/4/-1->9->8 [1] 10/-1/-1->9->8 +gpua012:3029510:3029604 [1] NCCL INFO Channel 00/0 : 9[46000] -> 10[85000] via P2P/IPC/read +gpua012:3029510:3029604 [1] NCCL INFO Channel 01/0 : 9[46000] -> 10[85000] via P2P/IPC/read +gpua055:421542:436566 [0] NCCL INFO Channel 00/0 : 40[7000] -> 41[46000] via P2P/IPC/read +gpua055:421542:436566 [0] NCCL INFO Channel 01/0 : 40[7000] -> 41[46000] via P2P/IPC/read +gpua055:421542:436566 [0] NCCL INFO Connected all rings +gpua055:421542:436566 [0] NCCL INFO Channel 01/0 : 37[46000] -> 40[7000] [receive] via NET/AWS Libfabric/1 +gpua055:421542:436566 [0] NCCL INFO Channel 00/0 : 40[7000] -> 44[7000] [send] via NET/AWS Libfabric/1 +gpua055:421542:436566 [0] NCCL INFO Channel 00/0 : 40[7000] -> 49[46000] [send] via NET/AWS Libfabric/1 +gpua055:421542:436566 [0] NCCL INFO Channel 00/0 : 49[46000] -> 40[7000] [receive] via NET/AWS Libfabric/1 +gpua055:421542:436566 [0] NCCL INFO Channel 00/0 : 44[7000] -> 40[7000] [receive] via NET/AWS Libfabric/1 +gpua055:421542:436566 [0] NCCL INFO Channel 01/0 : 40[7000] -> 37[46000] [send] via NET/AWS Libfabric/1 +gpua055:421542:436566 [0] NCCL INFO Connected all trees +gpua055:421542:436566 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua039:3958107:3958196 [0] NCCL INFO Channel 00/0 : 24[7000] -> 28[7000] [send] via NET/AWS Libfabric/1 +gpua039:3958107:3958196 [0] NCCL INFO Channel 00/0 : 16[7000] -> 24[7000] [receive] via NET/AWS Libfabric/1 +gpua039:3958107:3958196 [0] NCCL INFO Channel 00/0 : 24[7000] -> 16[7000] [send] via NET/AWS Libfabric/1 +gpua039:3958107:3958196 [0] NCCL INFO Channel 00/0 : 28[7000] -> 24[7000] [receive] via NET/AWS Libfabric/1 +gpua039:3958107:3958196 [0] NCCL INFO Channel 01/0 : 24[7000] -> 21[46000] [send] via NET/AWS Libfabric/1 +gpua039:3958107:3958196 [0] NCCL INFO Connected all trees +gpua039:3958107:3958196 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua039:3958107:3958196 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua039:3958107:3958196 [0] NCCL INFO comm 0x564bce104cf0 rank 24 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua079:4011108:4011108 [2] NCCL INFO cudaDriverVersion 12020 +gpua079:4011108:4011108 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.79<0> +gpua079:4011108:4011108 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua079:4011108:4011108 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua079:4011108:4011245 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua079:4011108:4011245 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua079:4011108:4011245 [2] NCCL INFO Using network AWS Libfabric +gpua079:4011108:4011245 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua079:4011108:4011245 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua079:4011108:4011245 [2] NCCL INFO Trees [0] 51/-1/-1->50->49 [1] 51/-1/-1->50->49 +gpua079:4011108:4011245 [2] NCCL INFO Channel 00/0 : 50[85000] -> 51[c7000] via P2P/IPC/read +gpua079:4011108:4011245 [2] NCCL INFO Channel 01/0 : 50[85000] -> 51[c7000] via P2P/IPC/read +gpua038:474864:474864 [3] NCCL INFO cudaDriverVersion 12020 +gpua038:474864:474864 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.38<0> +gpua038:474864:474864 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua038:474864:474864 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua038:474864:474945 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua038:474864:474945 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua038:474864:474945 [3] NCCL INFO Using network AWS Libfabric +gpua038:474864:474945 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua038:474864:474945 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua038:474864:474945 [3] NCCL INFO Trees [0] -1/-1/-1->23->22 [1] -1/-1/-1->23->22 +gpua038:474864:474945 [3] NCCL INFO Channel 00/0 : 23[c7000] -> 24[7000] [send] via NET/AWS Libfabric/1 +gpua038:474864:474945 [3] NCCL INFO Channel 01/0 : 23[c7000] -> 24[7000] [send] via NET/AWS Libfabric/1 +gpua016:879606:879606 [2] NCCL INFO cudaDriverVersion 12020 +gpua016:879606:879606 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.16<0> +gpua016:879606:879606 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua016:879606:879606 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua016:879606:879691 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua016:879606:879691 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua016:879606:879691 [2] NCCL INFO Using network AWS Libfabric +gpua016:879606:879691 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua016:879606:879691 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua016:879606:879691 [2] NCCL INFO Trees [0] 15/-1/-1->14->13 [1] 15/-1/-1->14->13 +gpua016:879606:879691 [2] NCCL INFO Channel 00/0 : 14[85000] -> 15[c7000] via P2P/IPC/read +gpua016:879606:879691 [2] NCCL INFO Channel 01/0 : 14[85000] -> 15[c7000] via P2P/IPC/read +gpua016:879606:879691 [2] NCCL INFO Connected all rings +gpua089:1166998:1166998 [3] NCCL INFO cudaDriverVersion 12020 +gpua089:1166998:1166998 [3] NCCL INFO Bootstrap : Using eth1:172.28.23.89<0> +gpua089:1166998:1166998 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua089:1166998:1166998 [3] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua089:1166998:1167309 [3] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua089:1166998:1167309 [3] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua089:1166998:1167309 [3] NCCL INFO Using network AWS Libfabric +gpua089:1166998:1167309 [3] NCCL INFO Setting affinity for GPU 3 to ffff +gpua089:1166998:1167309 [3] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua089:1166998:1167309 [3] NCCL INFO Trees [0] -1/-1/-1->63->62 [1] -1/-1/-1->63->62 +gpua089:1166998:1167309 [3] NCCL INFO Channel 00/0 : 63[c7000] -> 0[7000] [send] via NET/AWS Libfabric/1 +gpua089:1166998:1167309 [3] NCCL INFO Channel 01/0 : 63[c7000] -> 0[7000] [send] via NET/AWS Libfabric/1 +gpua006:976277:976365 [2] NCCL INFO Channel 00/0 : 2[85000] -> 1[46000] via P2P/IPC/read +gpua006:976277:976365 [2] NCCL INFO Channel 01/0 : 2[85000] -> 1[46000] via P2P/IPC/read +gpua006:976277:976365 [2] NCCL INFO Connected all trees +gpua006:976277:976365 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua006:976277:976365 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua006:976277:976365 [2] NCCL INFO comm 0x55f7aa255960 rank 2 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua085:120734:120734 [2] NCCL INFO cudaDriverVersion 12020 +gpua085:120734:120734 [2] NCCL INFO Bootstrap : Using eth1:172.28.23.85<0> +gpua085:120734:120734 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua085:120734:120734 [2] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua085:120734:120823 [2] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua085:120734:120823 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua085:120734:120823 [2] NCCL INFO Using network AWS Libfabric +gpua085:120734:120823 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua085:120734:120823 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua085:120734:120823 [2] NCCL INFO Trees [0] 59/-1/-1->58->57 [1] 59/-1/-1->58->57 +gpua085:120734:120823 [2] NCCL INFO Channel 00/0 : 58[85000] -> 59[c7000] via P2P/IPC/read +gpua085:120734:120823 [2] NCCL INFO Channel 01/0 : 58[85000] -> 59[c7000] via P2P/IPC/read +gpua085:120734:120823 [2] NCCL INFO Connected all rings +gpua057:4182112:4182223 [0] NCCL INFO Channel 01/0 : 43[c7000] -> 44[7000] [receive] via NET/AWS Libfabric/1 +gpua057:4182112:4182223 [0] NCCL INFO Channel 00/0 : 44[7000] -> 45[46000] via P2P/IPC/read +gpua057:4182112:4182223 [0] NCCL INFO Channel 01/0 : 44[7000] -> 45[46000] via P2P/IPC/read +gpua057:4182112:4182223 [0] NCCL INFO Connected all rings +gpua057:4182112:4182223 [0] NCCL INFO Channel 00/0 : 40[7000] -> 44[7000] [receive] via NET/AWS Libfabric/1 +gpua057:4182112:4182223 [0] NCCL INFO Channel 01/0 : 36[7000] -> 44[7000] [receive] via NET/AWS Libfabric/1 +gpua057:4182112:4182223 [0] NCCL INFO Channel 01/0 : 29[46000] -> 44[7000] [receive] via NET/AWS Libfabric/1 +gpua057:4182112:4182223 [0] NCCL INFO Channel 01/0 : 44[7000] -> 29[46000] [send] via NET/AWS Libfabric/1 +gpua057:4182112:4182223 [0] NCCL INFO Channel 01/0 : 44[7000] -> 36[7000] [send] via NET/AWS Libfabric/1 +gpua057:4182112:4182223 [0] NCCL INFO Channel 00/0 : 44[7000] -> 40[7000] [send] via NET/AWS Libfabric/1 +gpua040:4155646:4155741 [0] NCCL INFO Channel 01/0 : 12[7000] -> 28[7000] [receive] via NET/AWS Libfabric/1 +gpua040:4155646:4155741 [0] NCCL INFO Channel 01/0 : 60[7000] -> 28[7000] [receive] via NET/AWS Libfabric/1 +gpua040:4155646:4155741 [0] NCCL INFO Channel 01/0 : 28[7000] -> 60[7000] [send] via NET/AWS Libfabric/1 +gpua040:4155646:4155741 [0] NCCL INFO Channel 01/0 : 28[7000] -> 12[7000] [send] via NET/AWS Libfabric/1 +gpua040:4155646:4155741 [0] NCCL INFO Channel 00/0 : 28[7000] -> 24[7000] [send] via NET/AWS Libfabric/1 +gpua040:4155646:4155741 [0] NCCL INFO Connected all trees +gpua040:4155646:4155741 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua040:4155646:4155741 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua040:4155646:4155741 [0] NCCL INFO comm 0x5637999380e0 rank 28 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua080:3566290:3566290 [1] NCCL INFO cudaDriverVersion 12020 +gpua080:3566290:3566290 [1] NCCL INFO Bootstrap : Using eth1:172.28.23.80<0> +gpua080:3566290:3566290 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin_v6 symbol. +gpua080:3566290:3566290 [1] NCCL INFO NET/Plugin: Failed to find ncclCollNetPlugin symbol (v4 or v5). +gpua080:3566290:3566383 [1] NCCL INFO NET/OFI Using aws-ofi-nccl 1.6.0 +gpua080:3566290:3566383 [1] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua080:3566290:3566383 [1] NCCL INFO Using network AWS Libfabric +gpua080:3566290:3566383 [1] NCCL INFO Setting affinity for GPU 1 to ffff,00000000 +gpua080:3566290:3566383 [1] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua080:3566290:3566383 [1] NCCL INFO Trees [0] 54/-1/-1->53->52 [1] 54/56/-1->53->52 +gpua080:3566290:3566383 [1] NCCL INFO Channel 00/0 : 53[46000] -> 54[85000] via P2P/IPC/read +gpua080:3566290:3566383 [1] NCCL INFO Channel 01/0 : 53[46000] -> 54[85000] via P2P/IPC/read +gpua054:719589:719717 [2] NCCL INFO NET/OFI Selected Provider is cxi (found 2 nics) +gpua054:719589:719717 [2] NCCL INFO Using network AWS Libfabric +gpua054:719589:719717 [2] NCCL INFO Setting affinity for GPU 2 to ffff0000 +gpua054:719589:719717 [2] NCCL INFO NCCL_CROSS_NIC set by environment to 1. +gpua054:719589:719717 [2] NCCL INFO Trees [0] 39/-1/-1->38->37 [1] 39/-1/-1->38->37 +gpua054:719589:719717 [2] NCCL INFO Channel 00/0 : 38[85000] -> 39[c7000] via P2P/IPC/read +gpua054:719589:719717 [2] NCCL INFO Channel 01/0 : 38[85000] -> 39[c7000] via P2P/IPC/read +gpua054:719589:719717 [2] NCCL INFO Connected all rings +gpua054:719589:719717 [2] NCCL INFO Channel 00/0 : 38[85000] -> 37[46000] via P2P/IPC/read +gpua054:719589:719717 [2] NCCL INFO Channel 01/0 : 38[85000] -> 37[46000] via P2P/IPC/read +gpua054:719589:719717 [2] NCCL INFO Connected all trees +gpua054:719589:719717 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua033:1942188:1942310 [1] NCCL INFO Connected all rings +gpua033:1942188:1942310 [1] NCCL INFO Channel 00/0 : 8[7000] -> 17[46000] [receive] via NET/AWS Libfabric/1 +gpua033:1942188:1942310 [1] NCCL INFO Channel 00/0 : 17[46000] -> 8[7000] [send] via NET/AWS Libfabric/1 +gpua033:1942188:1942310 [1] NCCL INFO Channel 00/0 : 17[46000] -> 16[7000] via P2P/IPC/read +gpua033:1942188:1942310 [1] NCCL INFO Channel 01/0 : 17[46000] -> 16[7000] via P2P/IPC/read +gpua033:1942188:1942310 [1] NCCL INFO Connected all trees +gpua033:1942188:1942310 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua033:1942188:1942310 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua033:1942188:1942310 [1] NCCL INFO comm 0x561c1f1a10d0 rank 17 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua007:1877473:1877612 [3] NCCL INFO Connected all rings +gpua007:1877473:1877612 [3] NCCL INFO Channel 00/0 : 7[c7000] -> 6[85000] via P2P/IPC/read +gpua007:1877473:1877612 [3] NCCL INFO Channel 01/0 : 7[c7000] -> 6[85000] via P2P/IPC/read +gpua007:1877473:1877612 [3] NCCL INFO Connected all trees +gpua007:1877473:1877612 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua007:1877473:1877612 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua007:1877473:1877612 [3] NCCL INFO comm 0x5574419027e0 rank 7 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua012:3029510:3029604 [1] NCCL INFO Connected all rings +gpua012:3029510:3029604 [1] NCCL INFO Channel 00/0 : 4[7000] -> 9[46000] [receive] via NET/AWS Libfabric/1 +gpua012:3029510:3029604 [1] NCCL INFO Channel 00/0 : 9[46000] -> 4[7000] [send] via NET/AWS Libfabric/1 +gpua012:3029510:3029604 [1] NCCL INFO Channel 00/0 : 9[46000] -> 8[7000] via P2P/IPC/read +gpua012:3029510:3029604 [1] NCCL INFO Channel 01/0 : 9[46000] -> 8[7000] via P2P/IPC/read +gpua012:3029510:3029604 [1] NCCL INFO Connected all trees +gpua012:3029510:3029604 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua012:3029510:3029604 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua012:3029510:3029604 [1] NCCL INFO comm 0x5603785a78d0 rank 9 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua055:421542:436566 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua055:421542:436566 [0] NCCL INFO comm 0x557dc7538880 rank 40 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua079:4011108:4011245 [2] NCCL INFO Connected all rings +gpua079:4011108:4011245 [2] NCCL INFO Channel 00/0 : 50[85000] -> 49[46000] via P2P/IPC/read +gpua079:4011108:4011245 [2] NCCL INFO Channel 01/0 : 50[85000] -> 49[46000] via P2P/IPC/read +gpua079:4011108:4011245 [2] NCCL INFO Connected all trees +gpua079:4011108:4011245 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua079:4011108:4011245 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua079:4011108:4011245 [2] NCCL INFO comm 0x556234387e40 rank 50 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua038:474864:474945 [3] NCCL INFO Connected all rings +gpua038:474864:474945 [3] NCCL INFO Channel 00/0 : 23[c7000] -> 22[85000] via P2P/IPC/read +gpua038:474864:474945 [3] NCCL INFO Channel 01/0 : 23[c7000] -> 22[85000] via P2P/IPC/read +gpua038:474864:474945 [3] NCCL INFO Connected all trees +gpua038:474864:474945 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua038:474864:474945 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua038:474864:474945 [3] NCCL INFO comm 0x55e60dc3daa0 rank 23 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua016:879606:879691 [2] NCCL INFO Channel 00/0 : 14[85000] -> 13[46000] via P2P/IPC/read +gpua016:879606:879691 [2] NCCL INFO Channel 01/0 : 14[85000] -> 13[46000] via P2P/IPC/read +gpua016:879606:879691 [2] NCCL INFO Connected all trees +gpua016:879606:879691 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua016:879606:879691 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua016:879606:879691 [2] NCCL INFO comm 0x5648224d1260 rank 14 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua089:1166998:1167309 [3] NCCL INFO Connected all rings +gpua089:1166998:1167309 [3] NCCL INFO Channel 00/0 : 63[c7000] -> 62[85000] via P2P/IPC/read +gpua089:1166998:1167309 [3] NCCL INFO Channel 01/0 : 63[c7000] -> 62[85000] via P2P/IPC/read +gpua089:1166998:1167309 [3] NCCL INFO Connected all trees +gpua089:1166998:1167309 [3] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua089:1166998:1167309 [3] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua089:1166998:1167309 [3] NCCL INFO comm 0x556df7b172a0 rank 63 nranks 64 cudaDev 3 busId c7000 - Init COMPLETE +gpua085:120734:120823 [2] NCCL INFO Channel 00/0 : 58[85000] -> 57[46000] via P2P/IPC/read +gpua085:120734:120823 [2] NCCL INFO Channel 01/0 : 58[85000] -> 57[46000] via P2P/IPC/read +gpua085:120734:120823 [2] NCCL INFO Connected all trees +gpua085:120734:120823 [2] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua085:120734:120823 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua085:120734:120823 [2] NCCL INFO comm 0x5561443c27b0 rank 58 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +gpua057:4182112:4182223 [0] NCCL INFO Connected all trees +gpua057:4182112:4182223 [0] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua057:4182112:4182223 [0] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua057:4182112:4182223 [0] NCCL INFO comm 0x560a56bbc810 rank 44 nranks 64 cudaDev 0 busId 7000 - Init COMPLETE +gpua080:3566290:3566383 [1] NCCL INFO Connected all rings +gpua080:3566290:3566383 [1] NCCL INFO Channel 01/0 : 53[46000] -> 56[7000] [send] via NET/AWS Libfabric/1 +gpua080:3566290:3566383 [1] NCCL INFO Channel 01/0 : 56[7000] -> 53[46000] [receive] via NET/AWS Libfabric/1 +gpua080:3566290:3566383 [1] NCCL INFO Channel 00/0 : 53[46000] -> 52[7000] via P2P/IPC/read +gpua080:3566290:3566383 [1] NCCL INFO Channel 01/0 : 53[46000] -> 52[7000] via P2P/IPC/read +gpua080:3566290:3566383 [1] NCCL INFO Connected all trees +gpua080:3566290:3566383 [1] NCCL INFO threadThresholds 8/8/64 | 512/8/64 | 512 | 512 +gpua080:3566290:3566383 [1] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua080:3566290:3566383 [1] NCCL INFO comm 0x55d491c078d0 rank 53 nranks 64 cudaDev 1 busId 46000 - Init COMPLETE +gpua054:719589:719717 [2] NCCL INFO 2 coll channels, 2 p2p channels, 2 p2p channels per peer +gpua054:719589:719717 [2] NCCL INFO comm 0x564abc81db40 rank 38 nranks 64 cudaDev 2 busId 85000 - Init COMPLETE +[gpua006:0/64] 2024-02-15 12:29:34,261 (distributed:1027) INFO: Reducer buckets have been rebuilt in this iteration. +[gpua006:0/64] 2024-02-15 12:31:51,645 (trainer:756) INFO: 44epoch:train:1-100batch: iter_time=5.322, forward_time=0.402, loss_ctc=77.600, loss_interctc_layer6=82.729, loss_interctc_layer12=68.542, loss_interctc_layer15=62.876, loss_interctc_layer21=80.765, loss=74.502, backward_time=0.253, grad_norm=72.290, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.143, optim0_lr0=6.100e-05, train_time=17.127 +[gpua006:0/64] 2024-02-15 12:34:36,577 (trainer:756) INFO: 44epoch:train:101-200batch: iter_time=9.603e-05, forward_time=0.140, loss_ctc=56.273, loss_interctc_layer6=67.344, loss_interctc_layer12=55.524, loss_interctc_layer15=50.836, loss_interctc_layer21=58.342, loss=57.664, backward_time=0.208, grad_norm=61.756, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=6.100e-05, train_time=1.650 +[gpua006:0/64] 2024-02-15 12:40:41,539 (trainer:756) INFO: 44epoch:train:201-300batch: iter_time=9.204e-05, forward_time=0.140, loss_ctc=63.383, loss_interctc_layer6=70.134, loss_interctc_layer12=57.930, loss_interctc_layer15=53.043, loss_interctc_layer21=65.653, loss=62.029, backward_time=0.204, grad_norm=70.198, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.138, optim0_lr0=6.100e-05, train_time=3.649 +[gpua006:0/64] 2024-02-15 12:44:41,099 (trainer:756) INFO: 44epoch:train:301-400batch: iter_time=9.106e-05, forward_time=0.140, loss_ctc=83.749, loss_interctc_layer6=87.785, loss_interctc_layer12=72.950, loss_interctc_layer15=66.785, loss_interctc_layer21=86.953, loss=79.644, backward_time=0.206, grad_norm=96.768, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.138, optim0_lr0=6.099e-05, train_time=2.395 +[gpua006:0/64] 2024-02-15 12:50:14,888 (trainer:756) INFO: 44epoch:train:401-500batch: iter_time=9.719e-05, forward_time=0.322, loss_ctc=71.915, loss_interctc_layer6=74.754, loss_interctc_layer12=61.723, loss_interctc_layer15=56.460, loss_interctc_layer21=74.798, loss=67.930, backward_time=0.398, grad_norm=83.218, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.151, optim0_lr0=6.099e-05, train_time=3.336 +[gpua006:0/64] 2024-02-15 12:53:39,953 (trainer:756) INFO: 44epoch:train:501-600batch: iter_time=9.616e-05, forward_time=0.141, loss_ctc=75.507, loss_interctc_layer6=81.811, loss_interctc_layer12=68.392, loss_interctc_layer15=62.973, loss_interctc_layer21=78.434, loss=73.423, backward_time=0.205, grad_norm=119.487, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.138, optim0_lr0=6.098e-05, train_time=2.052 +[gpua006:0/64] 2024-02-15 12:59:09,174 (trainer:756) INFO: 44epoch:train:601-700batch: iter_time=9.474e-05, forward_time=0.143, loss_ctc=84.165, loss_interctc_layer6=94.822, loss_interctc_layer12=78.774, loss_interctc_layer15=72.420, loss_interctc_layer21=87.155, loss=83.467, backward_time=0.205, grad_norm=90.875, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.138, optim0_lr0=6.098e-05, train_time=3.292 +[gpua006:0/64] 2024-02-15 13:03:10,205 (trainer:756) INFO: 44epoch:train:701-800batch: iter_time=9.946e-05, forward_time=0.140, loss_ctc=72.099, loss_interctc_layer6=88.720, loss_interctc_layer12=73.968, loss_interctc_layer15=68.460, loss_interctc_layer21=74.662, loss=75.582, backward_time=0.206, grad_norm=102.393, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=6.097e-05, train_time=2.410 +[gpua006:0/64] 2024-02-15 13:05:23,041 (trainer:756) INFO: 44epoch:train:801-900batch: iter_time=9.870e-05, forward_time=0.140, loss_ctc=61.419, loss_interctc_layer6=74.833, loss_interctc_layer12=61.863, loss_interctc_layer15=56.666, loss_interctc_layer21=63.558, loss=63.668, backward_time=0.206, grad_norm=111.320, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.138, optim0_lr0=6.097e-05, train_time=1.328 +[gpua006:0/64] 2024-02-15 13:10:26,895 (trainer:756) INFO: 44epoch:train:901-1000batch: iter_time=2.834e-04, forward_time=0.334, loss_ctc=82.029, loss_interctc_layer6=87.769, loss_interctc_layer12=73.159, loss_interctc_layer15=67.256, loss_interctc_layer21=85.068, loss=79.056, backward_time=0.287, grad_norm=80.625, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.149, optim0_lr0=6.096e-05, train_time=3.037 +[gpua006:0/64] 2024-02-15 13:14:12,320 (trainer:756) INFO: 44epoch:train:1001-1100batch: iter_time=9.039e-05, forward_time=0.191, loss_ctc=79.715, loss_interctc_layer6=90.556, loss_interctc_layer12=76.023, loss_interctc_layer15=70.476, loss_interctc_layer21=82.605, loss=79.875, backward_time=0.253, grad_norm=95.696, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.140, optim0_lr0=6.096e-05, train_time=2.252 +[gpua006:0/64] 2024-02-15 13:18:14,174 (trainer:756) INFO: 44epoch:train:1101-1200batch: iter_time=9.335e-05, forward_time=0.142, loss_ctc=71.375, loss_interctc_layer6=78.806, loss_interctc_layer12=65.631, loss_interctc_layer15=60.292, loss_interctc_layer21=74.099, loss=70.040, backward_time=0.204, grad_norm=104.875, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=6.095e-05, train_time=2.420 +[gpua006:0/64] 2024-02-15 13:20:01,919 (multiple_iter_factory:32) INFO: Building 1th iter-factory... +[gpua006:0/64] 2024-02-15 13:20:21,077 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-15 13:20:24,722 (abs_task:1660) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.1", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.1", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.1", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.1", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-15 13:20:24,723 (abs_task:1661) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.1, +[gpua006:0/64] 2024-02-15 13:20:25,005 (abs_task:1662) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257 +[gpua006:0/64] 2024-02-15 13:40:10,724 (trainer:756) INFO: 44epoch:train:1201-1300batch: iter_time=5.557, forward_time=0.141, loss_ctc=74.267, loss_interctc_layer6=85.036, loss_interctc_layer12=70.278, loss_interctc_layer15=64.366, loss_interctc_layer21=77.052, loss=74.200, backward_time=0.206, grad_norm=176.936, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.139, optim0_lr0=6.095e-05, train_time=13.166 +[gpua006:0/64] 2024-02-15 13:42:52,805 (trainer:756) INFO: 44epoch:train:1301-1400batch: iter_time=8.760e-05, forward_time=0.142, loss_ctc=65.990, loss_interctc_layer6=72.936, loss_interctc_layer12=60.245, loss_interctc_layer15=55.178, loss_interctc_layer21=68.407, loss=64.551, backward_time=0.209, grad_norm=128.369, clip=100.000, loss_scale=2.028e+31, optim_step_time=0.138, optim0_lr0=6.094e-05, train_time=1.621 +[gpua006:0/64] 2024-02-15 13:44:37,286 (trainer:687) WARNING: The grad norm is nan. Skipping updating the model. +[gpua006:0/64] 2024-02-15 13:45:38,467 (trainer:756) INFO: 44epoch:train:1401-1500batch: iter_time=8.678e-05, forward_time=0.306, loss_ctc=62.190, loss_interctc_layer6=70.823, loss_interctc_layer12=58.543, loss_interctc_layer15=53.639, loss_interctc_layer21=64.338, loss=61.907, backward_time=0.299, grad_norm=74.908, clip=100.000, loss_scale=1.711e+31, optim_step_time=0.156, optim0_lr0=6.094e-05, train_time=1.653 +[gpua006:0/64] 2024-02-15 13:49:46,045 (trainer:756) INFO: 44epoch:train:1501-1600batch: iter_time=8.717e-05, forward_time=0.170, loss_ctc=58.098, loss_interctc_layer6=69.488, loss_interctc_layer12=57.350, loss_interctc_layer15=52.513, loss_interctc_layer21=60.165, loss=59.523, backward_time=0.250, grad_norm=66.391, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.139, optim0_lr0=6.093e-05, train_time=2.478 +[gpua006:0/64] 2024-02-15 13:52:37,466 (trainer:756) INFO: 44epoch:train:1601-1700batch: iter_time=8.983e-05, forward_time=0.143, loss_ctc=97.248, loss_interctc_layer6=92.237, loss_interctc_layer12=76.065, loss_interctc_layer15=69.537, loss_interctc_layer21=100.975, loss=87.212, backward_time=0.206, grad_norm=107.402, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.138, optim0_lr0=6.093e-05, train_time=1.713 +[gpua006:0/64] 2024-02-15 13:57:35,159 (trainer:756) INFO: 44epoch:train:1701-1800batch: iter_time=9.110e-05, forward_time=0.144, loss_ctc=64.583, loss_interctc_layer6=71.856, loss_interctc_layer12=59.074, loss_interctc_layer15=53.929, loss_interctc_layer21=67.005, loss=63.290, backward_time=0.206, grad_norm=70.962, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.138, optim0_lr0=6.092e-05, train_time=2.978 +[gpua006:0/64] 2024-02-15 14:02:07,132 (trainer:756) INFO: 44epoch:train:1801-1900batch: iter_time=8.727e-05, forward_time=0.142, loss_ctc=77.353, loss_interctc_layer6=87.252, loss_interctc_layer12=72.766, loss_interctc_layer15=66.952, loss_interctc_layer21=80.343, loss=76.933, backward_time=0.206, grad_norm=75.575, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.138, optim0_lr0=6.092e-05, train_time=2.720 +[gpua006:0/64] 2024-02-15 14:05:22,774 (trainer:756) INFO: 44epoch:train:1901-2000batch: iter_time=8.500e-05, forward_time=0.143, loss_ctc=80.574, loss_interctc_layer6=96.639, loss_interctc_layer12=80.574, loss_interctc_layer15=74.359, loss_interctc_layer21=83.503, loss=83.130, backward_time=0.208, grad_norm=95.884, clip=100.000, loss_scale=1.014e+31, optim_step_time=0.137, optim0_lr0=6.092e-05, train_time=1.956 +[gpua006:0/64] 2024-02-15 14:07:37,193 (trainer:687) WARNING: The grad norm is nan. Skipping updating the model. +[gpua006:0/64] 2024-02-15 14:07:49,031 (trainer:756) INFO: 44epoch:train:2001-2100batch: iter_time=8.733e-05, forward_time=0.180, loss_ctc=61.372, loss_interctc_layer6=74.493, loss_interctc_layer12=61.318, loss_interctc_layer15=56.144, loss_interctc_layer21=63.554, loss=63.376, backward_time=0.239, grad_norm=82.381, clip=100.000, loss_scale=9.834e+30, optim_step_time=0.138, optim0_lr0=6.091e-05, train_time=1.462 +[gpua006:0/64] 2024-02-15 14:12:03,032 (trainer:756) INFO: 44epoch:train:2101-2200batch: iter_time=9.052e-05, forward_time=0.333, loss_ctc=72.763, loss_interctc_layer6=83.301, loss_interctc_layer12=69.266, loss_interctc_layer15=63.766, loss_interctc_layer21=75.447, loss=72.909, backward_time=0.329, grad_norm=87.425, clip=100.000, loss_scale=5.071e+30, optim_step_time=0.156, optim0_lr0=6.091e-05, train_time=2.539 +[gpua006:0/64] 2024-02-15 14:15:22,207 (trainer:756) INFO: 44epoch:train:2201-2300batch: iter_time=9.271e-05, forward_time=0.143, loss_ctc=79.764, loss_interctc_layer6=84.073, loss_interctc_layer12=69.909, loss_interctc_layer15=63.902, loss_interctc_layer21=82.684, loss=76.066, backward_time=0.206, grad_norm=86.251, clip=100.000, loss_scale=5.071e+30, optim_step_time=0.138, optim0_lr0=6.090e-05, train_time=1.993 +[gpua006:0/64] 2024-02-15 14:19:30,723 (trainer:756) INFO: 44epoch:train:2301-2400batch: iter_time=9.251e-05, forward_time=0.143, loss_ctc=74.045, loss_interctc_layer6=83.786, loss_interctc_layer12=70.018, loss_interctc_layer15=64.490, loss_interctc_layer21=76.757, loss=73.819, backward_time=0.206, grad_norm=80.263, clip=100.000, loss_scale=5.071e+30, optim_step_time=0.138, optim0_lr0=6.090e-05, train_time=2.483 +[gpua006:0/64] 2024-02-15 14:22:32,200 (trainer:756) INFO: 44epoch:train:2401-2500batch: iter_time=9.198e-05, forward_time=0.144, loss_ctc=71.036, loss_interctc_layer6=83.202, loss_interctc_layer12=69.054, loss_interctc_layer15=63.397, loss_interctc_layer21=73.627, loss=72.063, backward_time=0.207, grad_norm=80.207, clip=100.000, loss_scale=5.071e+30, optim_step_time=0.138, optim0_lr0=6.089e-05, train_time=1.817 +[gpua006:0/64] 2024-02-15 14:22:52,229 (multiple_iter_factory:32) INFO: Building 2th iter-factory... +[gpua006:0/64] 2024-02-15 14:23:11,439 (s2t:401) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4') +[gpua006:0/64] 2024-02-15 14:23:14,942 (abs_task:1660) INFO: [train] dataset: +ESPnetDataset( + speech: {"path": "exp/s2t_stats_raw_bpe50000/splits12/wav.scp/split.11", "type": "kaldi_ark"} + text_prev: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.prev/split.11", "type": "text"} + text_ctc: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text.ctc/split.11", "type": "text"} + text: {"path": "exp/s2t_stats_raw_bpe50000/splits12/text/split.11", "type": "text"} + preprocess: ) +[gpua006:0/64] 2024-02-15 14:23:14,942 (abs_task:1661) INFO: [train] Batch sampler: UnsortedBatchSampler(N-batch=19027, batch_size=256, key_file=exp/s2t_stats_raw_bpe50000/splits12/speech_shape/split.11, +[gpua006:0/64] 2024-02-15 14:23:14,949 (abs_task:1662) INFO: [train] mini-batch sizes summary: N-batch=19027, mean=256.0, min=256, max=257 +srun: Job step aborted: Waiting up to 32 seconds for job step to finish.