# Running on gpua031.delta.ncsa.illinois.edu
# Started at Sun Dec 3 12:37:41 CST 2023
# SLURMD_NODENAME=gpua031
# SLURM_CLUSTER_NAME=delta
# SLURM_CONF=/var/spool/slurmd/conf-cache/slurm.conf
# SLURM_CPUS_ON_NODE=64
# SLURM_CPUS_PER_TASK=64
# SLURM_EXPORT_ENV=PATH
# SLURM_GET_USER_ENV=1
# SLURM_GPUS_ON_NODE=4
# SLURM_GTIDS=0
# SLURM_JOBID=2724906
# SLURM_JOB_ACCOUNT=bbjs-delta-gpu
# SLURM_JOB_CPUS_PER_NODE='64(x10)'
# SLURM_JOB_END_TIME=1701801456
# SLURM_JOB_GID=202
# SLURM_JOB_GPUS=0,1,2,3
# SLURM_JOB_ID=2724906
# SLURM_JOB_NAME=exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/train.log
# SLURM_JOB_NODELIST='gpua[031-040]'
# SLURM_JOB_NUM_NODES=10
# SLURM_JOB_PARTITION=gpuA100x4
# SLURM_JOB_QOS=bbjs-delta-gpu
# SLURM_JOB_START_TIME=1701628656
# SLURM_JOB_UID=68077
# SLURM_JOB_USER=peng6
# SLURM_LOCALID=0
# SLURM_MEM_PER_NODE=240000
# SLURM_NNODES=10
# SLURM_NODEID=0
# SLURM_NODELIST='gpua[031-040]'
# SLURM_NODE_ALIASES='(null)'
# SLURM_OPEN_MODE=a
# SLURM_PRIO_PROCESS=0
# SLURM_PROCID=0
# SLURM_SUBMIT_DIR=/scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1
# SLURM_SUBMIT_HOST=dt-login01.delta.ncsa.illinois.edu
# SLURM_TASKS_PER_NODE='1(x10)'
# SLURM_TASK_PID=538892
# SLURM_TOPOLOGY_ADDR=ss00.ss06.gpua031
# SLURM_TOPOLOGY_ADDR_PATTERN=switch.switch.node
# SLURM_WORKING_CLUSTER=delta:dt-sched:6817:9984:109
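The SLURM header above fixes the distributed geometry used later in this log: 10 nodes with 4 GPUs each, i.e. 40 ranks in total, matching the world_size=40 reported by the store-based barrier below. A minimal sketch of that arithmetic (variable names are illustrative only, not taken from the log):

import os

# Values copied from the SLURM header above; falling back to them when the
# environment variables are absent is an assumption for illustration only.
num_nodes = int(os.environ.get("SLURM_JOB_NUM_NODES", "10"))
gpus_per_node = int(os.environ.get("SLURM_GPUS_ON_NODE", "4"))

world_size = num_nodes * gpus_per_node
print(world_size)  # 10 * 4 = 40, the world_size seen in the barrier messages below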
# srun --export=ALL python3 -m espnet2.bin.s2t_train --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_6f0f713e-40cf-4d8b-9708-fe73402d1548
/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/bin/python3 /scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py --use_preprocessor true --bpemodel data/token_list/bpe_unigram50000/bpe.model --token_type bpe --token_list data/token_list/bpe_unigram50000/tokens.txt --non_linguistic_symbols none --cleaner none --g2p none --valid_data_path_and_name_and_type dump/raw/dev_v3/wav.scp,speech,kaldi_ark --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/speech_shape --resume true --fold_length 80000 --output_dir exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000 --config conf/train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn.yaml --frontend_conf fs=16k --normalize=global_mvn --normalize_conf stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/wav.scp,speech,kaldi_ark --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/speech_shape --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.prev,text_prev,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_prev_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text.ctc,text_ctc,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_ctc_shape.bpe --fold_length 150 --train_data_path_and_name_and_type exp/s2t_stats_raw_bpe50000/splits12/text,text,text --train_shape_file exp/s2t_stats_raw_bpe50000/splits12/text_shape.bpe --multiple_iterator true --valid_data_path_and_name_and_type dump/raw/dev_v3/text.prev,text_prev,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_prev_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text.ctc,text_ctc,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_ctc_shape.bpe --valid_data_path_and_name_and_type dump/raw/dev_v3/text,text,text --valid_shape_file exp/s2t_stats_raw_bpe50000/valid/text_shape.bpe --ngpu 4 --multiprocessing_distributed true --dist_launcher slurm --dist_init_method file:///scratch/bbjs/peng6/espnet-whisper-public/egs2/owsm_v3.1/s2t1/exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/.dist_init_6f0f713e-40cf-4d8b-9708-fe73402d1548
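Each --train_data_path_and_name_and_type and --valid_data_path_and_name_and_type argument in the command above is a comma-separated triplet of file path, data name, and file type (for example dump/raw/dev_v3/wav.scp,speech,kaldi_ark). A minimal sketch of splitting such a triplet, assuming only the format visible in the command:

# Hypothetical helper for illustration; not part of the ESPnet command above.
def parse_triplet(arg):
    path, name, file_type = arg.split(",")
    return path, name, file_type

print(parse_triplet("dump/raw/dev_v3/wav.scp,speech,kaldi_ark"))
# ('dump/raw/dev_v3/wav.scp', 'speech', 'kaldi_ark')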
[gpua031:0/40] 2023-12-03 12:37:52,266 (distributed_c10d:319) INFO: Added key: store_based_barrier_key:1 to store for rank: 0
[gpua031:0/40] 2023-12-03 12:38:02,273 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=40, worker_count=12, timeout=0:30:00)
[gpua031:0/40] 2023-12-03 12:38:12,277 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=40, worker_count=12, timeout=0:30:00)
[gpua031:0/40] 2023-12-03 12:38:22,293 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=40, worker_count=12, timeout=0:30:00)
[gpua031:0/40] 2023-12-03 12:38:32,309 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=40, worker_count=12, timeout=0:30:00)
[gpua031:0/40] 2023-12-03 12:38:42,323 (distributed_c10d:337) INFO: Waiting in store based barrier to initialize process group for rank: 0, key: store_based_barrier_key:1 (world_size=40, worker_count=12, timeout=0:30:00)
[gpua031:0/40] 2023-12-03 12:38:45,898 (distributed_c10d:353) INFO: Rank 0: Completed store-based barrier for key:store_based_barrier_key:1 with 40 nodes.
[gpua031:0/40] 2023-12-03 12:38:45,976 (s2t:464) INFO: Vocabulary size: 50002
[gpua031:0/40] 2023-12-03 12:38:53,358 (abs_task:1231) INFO: pytorch.version=1.13.1, cuda.available=True, cudnn.version=8500, cudnn.benchmark=False, cudnn.deterministic=True
[gpua031:0/40] 2023-12-03 12:38:53,369 (abs_task:1232) INFO: Model structure:
ESPnetS2TModel(
(frontend): DefaultFrontend(
(stft): Stft(n_fft=512, win_length=400, hop_length=160, center=True, normalized=False, onesided=True)
(frontend): Frontend()
(logmel): LogMel(sr=16000, n_fft=512, n_mels=80, fmin=0, fmax=8000.0, htk=False)
)
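# Editorial note, not part of the original model repr: a quick check of the
# frontend frame parameters printed above (values copied from the Stft/LogMel lines).
fs, win_length, hop_length, n_fft = 16000, 400, 160, 512
print(win_length / fs)  # 0.025 -> 25 ms analysis window
print(hop_length / fs)  # 0.010 -> 10 ms frame shift
print(n_fft // 2 + 1)   # 257 one-sided STFT bins, reduced to 80 log-mel bins (0-8000 Hz)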
(specaug): SpecAug(
(freq_mask): MaskAlongAxis(mask_width_range=[0, 27], num_mask=2, axis=freq)
(time_mask): MaskAlongAxisVariableMaxWidth(mask_width_ratio_range=[0.0, 0.05], num_mask=10, axis=time)
)
(normalize): GlobalMVN(stats_file=exp/s2t_stats_raw_bpe50000/train/feats_stats.npz, norm_means=True, norm_vars=True)
(encoder): EBranchformerEncoder(
(embed): Conv2dSubsampling(
(conv): Sequential(
(0): Conv2d(1, 1024, kernel_size=(3, 3), stride=(2, 2))
(1): ReLU()
(2): Conv2d(1024, 1024, kernel_size=(3, 3), stride=(2, 2))
(3): ReLU()
)
(out): Sequential(
(0): Linear(in_features=19456, out_features=1024, bias=True)
(1): PositionalEncoding(
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
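# Editorial note, not part of the original model repr: the in_features=19456 of
# the Linear layer above follows from applying the two stride-2, kernel-3
# convolutions along the 80-dim log-mel axis and keeping 1024 output channels.
freq_after_conv1 = (80 - 3) // 2 + 1                 # 39
freq_after_conv2 = (freq_after_conv1 - 3) // 2 + 1   # 19
print(freq_after_conv2 * 1024)                       # 19456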
(encoders): MultiSequential(
(0): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
(1): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
(2): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
(3): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
(4): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
(5): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
(6): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
(7): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
(8): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
(9): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
(10): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
(11): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
(12): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
(13): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
(14): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
(15): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
(16): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
(17): EBranchformerEncoderLayer(
(attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(cgmlp): ConvolutionalGatingMLP(
(channel_proj1): Sequential(
(0): Linear(in_features=1024, out_features=4096, bias=True)
(1): GELU(approximate='none')
)
(csgu): ConvolutionalSpatialGatingUnit(
(norm): LayerNorm((2048,), eps=1e-12, elementwise_affine=True)
(conv): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(act): Identity()
(dropout): Dropout(p=0.1, inplace=False)
)
(channel_proj2): Linear(in_features=2048, out_features=1024, bias=True)
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(feed_forward_macaron): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): Swish()
)
(norm_ff): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_ff_macaron): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mha): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_mlp): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm_final): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
(depthwise_conv_fusion): Conv1d(2048, 2048, kernel_size=(31,), stride=(1,), padding=(15,), groups=2048)
(merge_proj): Linear(in_features=2048, out_features=1024, bias=True)
)
)
(after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
)
(decoder): TransformerDecoder(
(embed): Sequential(
(0): Embedding(50002, 1024)
(1): PositionalEncoding(
(dropout): Dropout(p=0.1, inplace=False)
)
)
(after_norm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(output_layer): Linear(in_features=1024, out_features=50002, bias=True)
(decoders): MultiSequential(
(0): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(1): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(2): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(3): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(4): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(5): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(6): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(7): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(8): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(9): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(10): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(11): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(12): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(13): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(14): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(15): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(16): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(17): DecoderLayer(
(self_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(src_attn): MultiHeadedAttention(
(linear_q): Linear(in_features=1024, out_features=1024, bias=True)
(linear_k): Linear(in_features=1024, out_features=1024, bias=True)
(linear_v): Linear(in_features=1024, out_features=1024, bias=True)
(linear_out): Linear(in_features=1024, out_features=1024, bias=True)
(dropout): Identity()
(q_norm): Identity()
(k_norm): Identity()
)
(feed_forward): PositionwiseFeedForward(
(w_1): Linear(in_features=1024, out_features=4096, bias=True)
(w_2): Linear(in_features=4096, out_features=1024, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
(activation): ReLU()
)
(norm1): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm2): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(norm3): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(criterion_att): LabelSmoothingLoss(
(criterion): KLDivLoss()
)
(ctc): CTC(
(ctc_lo): Linear(in_features=1024, out_features=50002, bias=True)
(ctc_loss): CTCLoss()
)
)
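For readability, here is a minimal sketch of how one EBranchformerEncoderLayer printed above combines its submodules (macaron feed-forward, attention branch, cgMLP branch, depthwise-conv fusion). The ordering and the 0.5 residual scaling follow the usual E-Branchformer/ESPnet convention and are assumptions, not values recorded in this log; `layer` stands for any of the encoder layers above.
import torch

def ebranchformer_layer_forward(layer, x, mask=None):
    # Sketch only: mirrors the submodule names in the printout above.
    # Macaron-style half-step feed-forward
    x = x + 0.5 * layer.dropout(layer.feed_forward_macaron(layer.norm_ff_macaron(x)))
    # Global branch: multi-head self-attention
    x_att = layer.norm_mha(x)
    x_att = layer.attn(x_att, x_att, x_att, mask)
    # Local branch: convolutional gating MLP (channel_proj1 -> CSGU -> channel_proj2)
    x_mlp = layer.cgmlp(layer.norm_mlp(x), mask)
    # Merge the two 1024-dim branches: concat to 2048, depthwise conv fusion, project back
    concat = torch.cat([x_att, x_mlp], dim=-1)                        # (B, T, 2048)
    fused = layer.depthwise_conv_fusion(concat.transpose(1, 2)).transpose(1, 2)
    x = x + layer.dropout(layer.merge_proj(concat + fused))
    # Second half-step feed-forward and final LayerNorm
    x = x + 0.5 * layer.dropout(layer.feed_forward(layer.norm_ff(x)))
    return layer.norm_final(x)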
Model summary:
Class Name: ESPnetS2TModel
Total Number of model parameters: 1.02 B
Number of trainable parameters: 1.02 B (100.0%)
Size: 4.07 GB
Type: torch.float32
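As a rough sanity check on the summary above, the reported size follows directly from the parameter count; a small sketch (here `model` is a stand-in module, in the run it is the ESPnetS2TModel):
import torch

model = torch.nn.Linear(1024, 1024)  # stand-in module for illustration
total = sum(p.numel() for p in model.parameters())
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
size_gb = total * 4 / 1e9  # float32 = 4 bytes per parameter; 1.02 B params -> ~4.07 GB
print(f"{total / 1e9:.2f} B params, {trainable / max(total, 1):.1%} trainable, {size_gb:.2f} GB")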
[gpua031:0/40] 2023-12-03 12:38:53,369 (abs_task:1235) INFO: Optimizer:
OSS (
Parameter Group 0
amsgrad: False
betas: [0.9, 0.98]
capturable: False
eps: 1e-06
foreach: None
initial_lr: 0.0002
lr: 1.6666666666666667e-09
maximize: False
weight_decay: 0.0
)
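The optimizer printed above is fairscale's OSS wrapper, which shards optimizer state across ranks. A minimal sketch of how such an optimizer could be constructed with the hyperparameters shown; the base optimizer class is an assumption (the log only shows Adam-style fields), and `model` is a stand-in:
import torch
from fairscale.optim.oss import OSS

model = torch.nn.Linear(1024, 1024)  # stand-in module for illustration
optimizer = OSS(
    params=model.parameters(),
    optim=torch.optim.AdamW,   # assumed Adam-family base optimizer
    lr=2e-4,                   # peak lr (initial_lr above); the printed lr is the warmup value
    betas=(0.9, 0.98),
    eps=1e-6,
    weight_decay=0.0,
)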
[gpua031:0/40] 2023-12-03 12:38:53,369 (abs_task:1236) INFO: Scheduler: PiecewiseLinearWarmupLR(warmup_steps_list=[0, 30000, 60000], warmup_lr_list=[0.0, 5e-05, 0.0002])
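The scheduler parameters imply a piecewise-linear interpolation during warmup, which also explains the lr of ~1.67e-09 shown at step 1 in the optimizer block (5e-05 / 30000). A small sketch, assuming the value is simply clamped after the last breakpoint (the actual post-warmup behaviour is not shown in this log):
import numpy as np

def piecewise_linear_warmup_lr(step, steps=(0, 30000, 60000), lrs=(0.0, 5e-5, 2e-4)):
    # Linear interpolation between (step, lr) breakpoints; clamps after the last one.
    return float(np.interp(step, steps, lrs))

print(piecewise_linear_warmup_lr(1))      # ~1.6666666666666667e-09, matching the optimizer printout
print(piecewise_linear_warmup_lr(30000))  # 5e-05
print(piecewise_linear_warmup_lr(60000))  # 0.0002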
[gpua031:0/40] 2023-12-03 12:38:53,372 (abs_task:1245) INFO: Saving the configuration in exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/config.yaml
[gpua031:0/40] 2023-12-03 12:38:58,723 (s2t:445) INFO: Optional Data Names: ('text_prev', 'text_ctc', 'text_spk2', 'text_spk3', 'text_spk4')
[gpua031:0/40] 2023-12-03 12:38:59,634 (abs_task:1616) INFO: [valid] dataset:
ESPnetDataset(
speech: {"path": "dump/raw/dev_v3/wav.scp", "type": "kaldi_ark"}
text_prev: {"path": "dump/raw/dev_v3/text.prev", "type": "text"}
text_ctc: {"path": "dump/raw/dev_v3/text.ctc", "type": "text"}
text: {"path": "dump/raw/dev_v3/text", "type": "text"}
preprocess: <espnet2.train.preprocessor.S2TPreprocessor object at 0x7f7bad3efa00>)
[gpua031:0/40] 2023-12-03 12:38:59,634 (abs_task:1617) INFO: [valid] Batch sampler: UnsortedBatchSampler(N-batch=4982, batch_size=240, key_file=exp/s2t_stats_raw_bpe50000/valid/speech_shape,
[gpua031:0/40] 2023-12-03 12:38:59,635 (abs_task:1618) INFO: [valid] mini-batch sizes summary: N-batch=4982, mean=240.0, min=240, max=241
[gpua031:0/40] 2023-12-03 12:39:10,330 (trainer:159) INFO: The training was resumed using exp/s2t_train_s2t_ebf_conv2d_size1024_e18_d18_piecewise_lr2e-4_warmup60k_flashattn_raw_bpe50000/checkpoint.pth
[gpua031:0/40] 2023-12-03 12:39:10,339 (sharded_ddp:179) INFO: ShardedDDP bucket size: 8.00M parameters, model size 969.78M parameters
Process SpawnProcess-1:
Process SpawnProcess-2:
Process SpawnProcess-3:
Process SpawnProcess-4:
Traceback (most recent call last):
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1393, in main_worker
cls.trainer.run(
File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 224, in run
dp_model = fairscale.nn.data_parallel.ShardedDataParallel(
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/fairscale/nn/data_parallel/sharded_ddp.py", line 200, in __init__
self._sync_params_and_buffers()
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/fairscale/nn/data_parallel/sharded_ddp.py", line 545, in _sync_params_and_buffers
dist.broadcast(t, src=self._reference_global_rank, group=self._process_group, async_op=True)
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1400, in broadcast
work = default_pg.broadcast([tensor], opts)
RuntimeError: Tensors must be contiguous
(The same traceback, ending in "RuntimeError: Tensors must be contiguous", was raised by SpawnProcess-1 through SpawnProcess-4 on each of the failing nodes.)
Traceback (most recent call last):
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in <module>
main()
File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main
S2TTask.main(cmd=cmd)
File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1134, in main
while not ProcessContext(processes, error_queues).join():
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join
raise ProcessExitedException(
torch.multiprocessing.spawn.ProcessExitedException: process 1 terminated with exit code 1
Traceback (most recent call last):
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in <module>
main()
File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main
S2TTask.main(cmd=cmd)
File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1134, in main
while not ProcessContext(processes, error_queues).join():
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join
raise ProcessExitedException(
torch.multiprocessing.spawn.ProcessExitedException: process 3 terminated with exit code 1
srun: error: gpua031: task 0: Exited with exit code 1
srun: error: gpua032: task 1: Exited with exit code 1
srun: error: gpua033: task 2: Exited with exit code 1
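Every worker fails at the same point: fairscale's ShardedDataParallel broadcasts parameters and buffers at start-up, and dist.broadcast rejects tensors that are not contiguous (possibly related to how the resumed checkpoint restored them). A commonly used workaround, shown here only as a hedged sketch and not something this run does, is to force everything contiguous before wrapping the model:
import torch

def make_contiguous(model: torch.nn.Module) -> torch.nn.Module:
    # Force every parameter and buffer into contiguous memory so that
    # torch.distributed broadcasts/collectives accept them.
    for p in model.parameters():
        p.data = p.data.contiguous()
    for b in model.buffers():
        b.data = b.data.contiguous()
    return model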
Process SpawnProcess-1:
Process SpawnProcess-4:
Process SpawnProcess-3:
Process SpawnProcess-2:
Process SpawnProcess-3:
Process SpawnProcess-1:
Process SpawnProcess-4:
Process SpawnProcess-1:
Process SpawnProcess-4:
Process SpawnProcess-4:
Process SpawnProcess-2:
Process SpawnProcess-1:
Process SpawnProcess-2:
Process SpawnProcess-2:
Process SpawnProcess-3:
Process SpawnProcess-3:
Process SpawnProcess-2:
Process SpawnProcess-3:
Process SpawnProcess-1:
Process SpawnProcess-4:
Traceback (most recent call last):
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1393, in main_worker
cls.trainer.run(
File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 224, in run
dp_model = fairscale.nn.data_parallel.ShardedDataParallel(
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/fairscale/nn/data_parallel/sharded_ddp.py", line 200, in __init__
self._sync_params_and_buffers()
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/fairscale/nn/data_parallel/sharded_ddp.py", line 545, in _sync_params_and_buffers
dist.broadcast(t, src=self._reference_global_rank, group=self._process_group, async_op=True)
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1400, in broadcast
work = default_pg.broadcast([tensor], opts)
RuntimeError: Tensors must be contiguous
Process SpawnProcess-1:
Process SpawnProcess-2:
Process SpawnProcess-2:
Process SpawnProcess-3:
Process SpawnProcess-4:
Process SpawnProcess-3:
Process SpawnProcess-4:
Process SpawnProcess-1:
Traceback (most recent call last):
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1393, in main_worker
cls.trainer.run(
File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/train/trainer.py", line 224, in run
dp_model = fairscale.nn.data_parallel.ShardedDataParallel(
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/fairscale/nn/data_parallel/sharded_ddp.py", line 200, in __init__
self._sync_params_and_buffers()
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/autograd/grad_mode.py", line 27, in decorate_context
return func(*args, **kwargs)
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/fairscale/nn/data_parallel/sharded_ddp.py", line 545, in _sync_params_and_buffers
dist.broadcast(t, src=self._reference_global_rank, group=self._process_group, async_op=True)
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 1400, in broadcast
work = default_pg.broadcast([tensor], opts)
RuntimeError: Tensors must be contiguous
Traceback (most recent call last):
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/runpy.py", line 86, in _run_code
exec(code, run_globals)
File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 23, in <module>
main()
File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/bin/s2t_train.py", line 19, in main
S2TTask.main(cmd=cmd)
File "/scratch/bbjs/peng6/espnet-whisper-public/espnet2/tasks/abs_task.py", line 1134, in main
while not ProcessContext(processes, error_queues).join():
File "/scratch/bbjs/peng6/espnet-whisper-public/tools/miniconda/envs/espnet/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 149, in join
raise ProcessExitedException(
torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with exit code 1
srun: error: gpua036: task 5: Exited with exit code 1
srun: error: gpua035: task 4: Exited with exit code 1
srun: error: gpua037: task 6: Exited with exit code 1
srun: error: gpua038: task 7: Exited with exit code 1
srun: error: gpua039: task 8: Exited with exit code 1
srun: error: gpua040: task 9: Exited with exit code 1
srun: error: gpua034: task 3: Exited with exit code 1
# Accounting: begin_time=1701628661
# Accounting: end_time=1701628779
# Accounting: time=118 threads=1
# Finished at Sun Dec 3 12:39:39 CST 2023 with status 1
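# Note on the failure above: every spawned worker dies in
# fairscale.nn.data_parallel.ShardedDataParallel._sync_params_and_buffers (sharded_ddp.py line 545),
# which pushes each parameter/buffer through torch.distributed.broadcast; that collective
# rejects non-contiguous tensors, hence "RuntimeError: Tensors must be contiguous".
# The snippet below is only a hypothetical workaround sketch, not the fix applied in this
# recipe: the helper name is illustrative, and it assumes the wrapped model exposes some
# non-contiguous parameters or buffers.

import torch


def make_tensors_contiguous(model: torch.nn.Module) -> torch.nn.Module:
    """Replace non-contiguous parameters/buffers with contiguous copies in place."""
    with torch.no_grad():
        for p in model.parameters():
            if not p.data.is_contiguous():
                p.data = p.data.contiguous()
        for name, buf in model.named_buffers():
            if not buf.is_contiguous():
                # nn.Module.__setattr__ routes an existing buffer name back into _buffers
                owner_name, _, buf_name = name.rpartition(".")
                owner = model.get_submodule(owner_name) if owner_name else model
                setattr(owner, buf_name, buf.contiguous())
    return model


# Hypothetical usage before wrapping (ShardedDataParallel takes the module and a sharded
# optimizer, as in the espnet2/train/trainer.py line 224 call shown in the traceback):
#   model = make_tensors_contiguous(model)
#   dp_model = fairscale.nn.data_parallel.ShardedDataParallel(model, sharded_optimizer)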